{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 7130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007012622720897616, "grad_norm": 0.6737998127937317, "learning_rate": 8.333333333333333e-07, "loss": 0.2609, "step": 1 }, { "epoch": 0.001402524544179523, "grad_norm": 0.416408896446228, "learning_rate": 1.6666666666666667e-06, "loss": 0.081, "step": 2 }, { "epoch": 0.0021037868162692847, "grad_norm": 0.6339009404182434, "learning_rate": 2.5e-06, "loss": 0.2618, "step": 3 }, { "epoch": 0.002805049088359046, "grad_norm": 0.38643860816955566, "learning_rate": 3.3333333333333333e-06, "loss": 0.0816, "step": 4 }, { "epoch": 0.0035063113604488078, "grad_norm": 0.44642364978790283, "learning_rate": 4.166666666666667e-06, "loss": 0.08, "step": 5 }, { "epoch": 0.004207573632538569, "grad_norm": 0.44730836153030396, "learning_rate": 5e-06, "loss": 0.0791, "step": 6 }, { "epoch": 0.004908835904628331, "grad_norm": 0.931324303150177, "learning_rate": 5.833333333333334e-06, "loss": 0.2667, "step": 7 }, { "epoch": 0.005610098176718092, "grad_norm": 0.43420666456222534, "learning_rate": 6.666666666666667e-06, "loss": 0.078, "step": 8 }, { "epoch": 0.006311360448807854, "grad_norm": 0.4643440544605255, "learning_rate": 7.5e-06, "loss": 0.0778, "step": 9 }, { "epoch": 0.0070126227208976155, "grad_norm": 1.0165714025497437, "learning_rate": 8.333333333333334e-06, "loss": 0.2721, "step": 10 }, { "epoch": 0.007713884992987377, "grad_norm": 0.6277392506599426, "learning_rate": 9.166666666666666e-06, "loss": 0.0726, "step": 11 }, { "epoch": 0.008415147265077139, "grad_norm": 0.6145909428596497, "learning_rate": 1e-05, "loss": 0.0734, "step": 12 }, { "epoch": 0.0091164095371669, "grad_norm": 0.6616089940071106, "learning_rate": 1.0833333333333334e-05, "loss": 0.0714, "step": 13 }, { "epoch": 0.009817671809256662, "grad_norm": 0.7784654498100281, "learning_rate": 1.1666666666666668e-05, "loss": 0.0657, "step": 14 }, { "epoch": 0.010518934081346423, "grad_norm": 2.3128066062927246, "learning_rate": 1.25e-05, "loss": 0.2782, "step": 15 }, { "epoch": 0.011220196353436185, "grad_norm": 14.616716384887695, "learning_rate": 1.3333333333333333e-05, "loss": 0.6532, "step": 16 }, { "epoch": 0.011921458625525946, "grad_norm": 1.3609784841537476, "learning_rate": 1.4166666666666668e-05, "loss": 0.0479, "step": 17 }, { "epoch": 0.012622720897615708, "grad_norm": 1.1924632787704468, "learning_rate": 1.5e-05, "loss": 0.0464, "step": 18 }, { "epoch": 0.01332398316970547, "grad_norm": 22.30014991760254, "learning_rate": 1.5833333333333333e-05, "loss": 0.7659, "step": 19 }, { "epoch": 0.014025245441795231, "grad_norm": 13.758585929870605, "learning_rate": 1.6666666666666667e-05, "loss": 0.4593, "step": 20 }, { "epoch": 0.014726507713884993, "grad_norm": 1.1601301431655884, "learning_rate": 1.75e-05, "loss": 0.0419, "step": 21 }, { "epoch": 0.015427769985974754, "grad_norm": 1.1238924264907837, "learning_rate": 1.8333333333333333e-05, "loss": 0.0384, "step": 22 }, { "epoch": 0.016129032258064516, "grad_norm": 1.0541298389434814, "learning_rate": 1.9166666666666667e-05, "loss": 0.0305, "step": 23 }, { "epoch": 0.016830294530154277, "grad_norm": 1.1610890626907349, "learning_rate": 2e-05, "loss": 0.026, "step": 24 }, { "epoch": 0.01753155680224404, "grad_norm": 0.5723670125007629, "learning_rate": 2.0833333333333336e-05, "loss": 0.0169, "step": 25 }, { "epoch": 0.0182328190743338, "grad_norm": 0.3416072428226471, "learning_rate": 2.1666666666666667e-05, "loss": 0.0103, "step": 26 }, { "epoch": 0.018934081346423562, "grad_norm": 19.442529678344727, "learning_rate": 2.25e-05, "loss": 0.811, "step": 27 }, { "epoch": 0.019635343618513323, "grad_norm": 14.161864280700684, "learning_rate": 2.3333333333333336e-05, "loss": 0.963, "step": 28 }, { "epoch": 0.020336605890603085, "grad_norm": 0.4935978055000305, "learning_rate": 2.4166666666666667e-05, "loss": 0.0166, "step": 29 }, { "epoch": 0.021037868162692847, "grad_norm": 0.5946357250213623, "learning_rate": 2.5e-05, "loss": 0.0184, "step": 30 }, { "epoch": 0.021739130434782608, "grad_norm": 0.514384925365448, "learning_rate": 2.5833333333333336e-05, "loss": 0.0154, "step": 31 }, { "epoch": 0.02244039270687237, "grad_norm": 0.49021121859550476, "learning_rate": 2.6666666666666667e-05, "loss": 0.0143, "step": 32 }, { "epoch": 0.02314165497896213, "grad_norm": 0.3502137064933777, "learning_rate": 2.7500000000000004e-05, "loss": 0.0085, "step": 33 }, { "epoch": 0.023842917251051893, "grad_norm": 0.19762448966503143, "learning_rate": 2.8333333333333335e-05, "loss": 0.0068, "step": 34 }, { "epoch": 0.024544179523141654, "grad_norm": 12.221427917480469, "learning_rate": 2.916666666666667e-05, "loss": 1.0152, "step": 35 }, { "epoch": 0.025245441795231416, "grad_norm": 20.549226760864258, "learning_rate": 3e-05, "loss": 1.1504, "step": 36 }, { "epoch": 0.025946704067321177, "grad_norm": 23.738576889038086, "learning_rate": 3.0833333333333335e-05, "loss": 0.8281, "step": 37 }, { "epoch": 0.02664796633941094, "grad_norm": 16.651905059814453, "learning_rate": 3.1666666666666666e-05, "loss": 1.0854, "step": 38 }, { "epoch": 0.0273492286115007, "grad_norm": 2.4389359951019287, "learning_rate": 3.2500000000000004e-05, "loss": 0.2921, "step": 39 }, { "epoch": 0.028050490883590462, "grad_norm": 0.4685153663158417, "learning_rate": 3.3333333333333335e-05, "loss": 0.0574, "step": 40 }, { "epoch": 0.028751753155680224, "grad_norm": 0.44632649421691895, "learning_rate": 3.4166666666666666e-05, "loss": 0.0638, "step": 41 }, { "epoch": 0.029453015427769985, "grad_norm": 0.402079701423645, "learning_rate": 3.5e-05, "loss": 0.0573, "step": 42 }, { "epoch": 0.030154277699859747, "grad_norm": 1.6201813220977783, "learning_rate": 3.5833333333333335e-05, "loss": 0.3043, "step": 43 }, { "epoch": 0.030855539971949508, "grad_norm": 0.42095673084259033, "learning_rate": 3.6666666666666666e-05, "loss": 0.0601, "step": 44 }, { "epoch": 0.03155680224403927, "grad_norm": 0.39913901686668396, "learning_rate": 3.7500000000000003e-05, "loss": 0.061, "step": 45 }, { "epoch": 0.03225806451612903, "grad_norm": 0.4014030992984772, "learning_rate": 3.8333333333333334e-05, "loss": 0.0607, "step": 46 }, { "epoch": 0.03295932678821879, "grad_norm": 2.2400283813476562, "learning_rate": 3.9166666666666665e-05, "loss": 0.2999, "step": 47 }, { "epoch": 0.033660589060308554, "grad_norm": 3.3916993141174316, "learning_rate": 4e-05, "loss": 0.5654, "step": 48 }, { "epoch": 0.034361851332398316, "grad_norm": 0.4089498221874237, "learning_rate": 4.0833333333333334e-05, "loss": 0.0579, "step": 49 }, { "epoch": 0.03506311360448808, "grad_norm": 2.3870582580566406, "learning_rate": 4.166666666666667e-05, "loss": 0.5222, "step": 50 }, { "epoch": 0.03576437587657784, "grad_norm": 0.3163868486881256, "learning_rate": 4.25e-05, "loss": 0.0704, "step": 51 }, { "epoch": 0.0364656381486676, "grad_norm": 0.3164120018482208, "learning_rate": 4.3333333333333334e-05, "loss": 0.0692, "step": 52 }, { "epoch": 0.03716690042075736, "grad_norm": 0.32624387741088867, "learning_rate": 4.4166666666666665e-05, "loss": 0.0624, "step": 53 }, { "epoch": 0.037868162692847124, "grad_norm": 0.3106810748577118, "learning_rate": 4.5e-05, "loss": 0.065, "step": 54 }, { "epoch": 0.038569424964936885, "grad_norm": 0.7320694923400879, "learning_rate": 4.5833333333333334e-05, "loss": 0.2863, "step": 55 }, { "epoch": 0.03927068723702665, "grad_norm": 0.3112699091434479, "learning_rate": 4.666666666666667e-05, "loss": 0.0664, "step": 56 }, { "epoch": 0.03997194950911641, "grad_norm": 0.29146918654441833, "learning_rate": 4.75e-05, "loss": 0.0696, "step": 57 }, { "epoch": 0.04067321178120617, "grad_norm": 0.29670488834381104, "learning_rate": 4.8333333333333334e-05, "loss": 0.0678, "step": 58 }, { "epoch": 0.04137447405329593, "grad_norm": 5.181276321411133, "learning_rate": 4.9166666666666665e-05, "loss": 0.7196, "step": 59 }, { "epoch": 0.04207573632538569, "grad_norm": 0.3253697454929352, "learning_rate": 5e-05, "loss": 0.0595, "step": 60 }, { "epoch": 0.042776998597475455, "grad_norm": 0.33001095056533813, "learning_rate": 4.999999753185356e-05, "loss": 0.066, "step": 61 }, { "epoch": 0.043478260869565216, "grad_norm": 1.127942681312561, "learning_rate": 4.999999012741473e-05, "loss": 0.2924, "step": 62 }, { "epoch": 0.04417952314165498, "grad_norm": 1.0445268154144287, "learning_rate": 4.999997778668497e-05, "loss": 0.2847, "step": 63 }, { "epoch": 0.04488078541374474, "grad_norm": 0.7399264574050903, "learning_rate": 4.999996050966671e-05, "loss": 0.2821, "step": 64 }, { "epoch": 0.0455820476858345, "grad_norm": 0.3060811460018158, "learning_rate": 4.999993829636337e-05, "loss": 0.0705, "step": 65 }, { "epoch": 0.04628330995792426, "grad_norm": 1.2458640336990356, "learning_rate": 4.9999911146779336e-05, "loss": 0.4791, "step": 66 }, { "epoch": 0.046984572230014024, "grad_norm": 0.30813613533973694, "learning_rate": 4.999987906091996e-05, "loss": 0.0767, "step": 67 }, { "epoch": 0.047685834502103785, "grad_norm": 4.0801825523376465, "learning_rate": 4.999984203879159e-05, "loss": 0.9128, "step": 68 }, { "epoch": 0.04838709677419355, "grad_norm": 0.421724408864975, "learning_rate": 4.9999800080401535e-05, "loss": 0.2617, "step": 69 }, { "epoch": 0.04908835904628331, "grad_norm": 0.4040254056453705, "learning_rate": 4.9999753185758066e-05, "loss": 0.0856, "step": 70 }, { "epoch": 0.04978962131837307, "grad_norm": 0.35313764214515686, "learning_rate": 4.9999701354870455e-05, "loss": 0.2566, "step": 71 }, { "epoch": 0.05049088359046283, "grad_norm": 0.3563598096370697, "learning_rate": 4.999964458774893e-05, "loss": 0.0897, "step": 72 }, { "epoch": 0.05119214586255259, "grad_norm": 0.36064594984054565, "learning_rate": 4.9999582884404704e-05, "loss": 0.2594, "step": 73 }, { "epoch": 0.051893408134642355, "grad_norm": 0.3209397494792938, "learning_rate": 4.9999516244849956e-05, "loss": 0.0875, "step": 74 }, { "epoch": 0.052594670406732116, "grad_norm": 0.8883861899375916, "learning_rate": 4.999944466909785e-05, "loss": 0.4294, "step": 75 }, { "epoch": 0.05329593267882188, "grad_norm": 0.36159881949424744, "learning_rate": 4.999936815716251e-05, "loss": 0.2589, "step": 76 }, { "epoch": 0.05399719495091164, "grad_norm": 0.3405170142650604, "learning_rate": 4.9999286709059055e-05, "loss": 0.2587, "step": 77 }, { "epoch": 0.0546984572230014, "grad_norm": 0.27104246616363525, "learning_rate": 4.999920032480356e-05, "loss": 0.0897, "step": 78 }, { "epoch": 0.05539971949509116, "grad_norm": 0.322324275970459, "learning_rate": 4.9999109004413074e-05, "loss": 0.0896, "step": 79 }, { "epoch": 0.056100981767180924, "grad_norm": 0.34943217039108276, "learning_rate": 4.999901274790565e-05, "loss": 0.252, "step": 80 }, { "epoch": 0.056802244039270686, "grad_norm": 0.3287108838558197, "learning_rate": 4.999891155530026e-05, "loss": 0.0913, "step": 81 }, { "epoch": 0.05750350631136045, "grad_norm": 0.3263850510120392, "learning_rate": 4.9998805426616926e-05, "loss": 0.2544, "step": 82 }, { "epoch": 0.05820476858345021, "grad_norm": 0.3294674754142761, "learning_rate": 4.9998694361876575e-05, "loss": 0.2483, "step": 83 }, { "epoch": 0.05890603085553997, "grad_norm": 0.2775851786136627, "learning_rate": 4.999857836110114e-05, "loss": 0.09, "step": 84 }, { "epoch": 0.05960729312762973, "grad_norm": 0.3607025146484375, "learning_rate": 4.999845742431355e-05, "loss": 0.2578, "step": 85 }, { "epoch": 0.06030855539971949, "grad_norm": 0.34455248713493347, "learning_rate": 4.999833155153765e-05, "loss": 0.0893, "step": 86 }, { "epoch": 0.061009817671809255, "grad_norm": 0.36242982745170593, "learning_rate": 4.9998200742798304e-05, "loss": 0.0884, "step": 87 }, { "epoch": 0.061711079943899017, "grad_norm": 0.3750418424606323, "learning_rate": 4.9998064998121344e-05, "loss": 0.2584, "step": 88 }, { "epoch": 0.06241234221598878, "grad_norm": 0.27058157324790955, "learning_rate": 4.999792431753359e-05, "loss": 0.0868, "step": 89 }, { "epoch": 0.06311360448807854, "grad_norm": 0.4057469666004181, "learning_rate": 4.999777870106278e-05, "loss": 0.0795, "step": 90 }, { "epoch": 0.0638148667601683, "grad_norm": 0.26330703496932983, "learning_rate": 4.9997628148737705e-05, "loss": 0.084, "step": 91 }, { "epoch": 0.06451612903225806, "grad_norm": 0.25789445638656616, "learning_rate": 4.9997472660588074e-05, "loss": 0.0826, "step": 92 }, { "epoch": 0.06521739130434782, "grad_norm": 0.25533944368362427, "learning_rate": 4.999731223664459e-05, "loss": 0.0814, "step": 93 }, { "epoch": 0.06591865357643759, "grad_norm": 0.2601306736469269, "learning_rate": 4.9997146876938925e-05, "loss": 0.0721, "step": 94 }, { "epoch": 0.06661991584852735, "grad_norm": 0.44854968786239624, "learning_rate": 4.999697658150374e-05, "loss": 0.2703, "step": 95 }, { "epoch": 0.06732117812061711, "grad_norm": 0.25064802169799805, "learning_rate": 4.999680135037265e-05, "loss": 0.0682, "step": 96 }, { "epoch": 0.06802244039270687, "grad_norm": 0.2412257194519043, "learning_rate": 4.999662118358026e-05, "loss": 0.0649, "step": 97 }, { "epoch": 0.06872370266479663, "grad_norm": 0.285335898399353, "learning_rate": 4.999643608116215e-05, "loss": 0.0622, "step": 98 }, { "epoch": 0.0694249649368864, "grad_norm": 0.400689959526062, "learning_rate": 4.999624604315486e-05, "loss": 0.0547, "step": 99 }, { "epoch": 0.07012622720897616, "grad_norm": 1.262488603591919, "learning_rate": 4.9996051069595906e-05, "loss": 0.0432, "step": 100 }, { "epoch": 0.07082748948106592, "grad_norm": 0.7657336592674255, "learning_rate": 4.99958511605238e-05, "loss": 0.0342, "step": 101 }, { "epoch": 0.07152875175315568, "grad_norm": 1.4300060272216797, "learning_rate": 4.9995646315978016e-05, "loss": 0.0304, "step": 102 }, { "epoch": 0.07223001402524544, "grad_norm": 2.1169662475585938, "learning_rate": 4.999543653599899e-05, "loss": 0.0351, "step": 103 }, { "epoch": 0.0729312762973352, "grad_norm": 0.34646227955818176, "learning_rate": 4.999522182062815e-05, "loss": 0.0509, "step": 104 }, { "epoch": 0.07363253856942496, "grad_norm": 0.32244250178337097, "learning_rate": 4.999500216990789e-05, "loss": 0.0591, "step": 105 }, { "epoch": 0.07433380084151472, "grad_norm": 0.34062477946281433, "learning_rate": 4.999477758388158e-05, "loss": 0.0423, "step": 106 }, { "epoch": 0.07503506311360449, "grad_norm": 0.9004980325698853, "learning_rate": 4.999454806259356e-05, "loss": 0.2873, "step": 107 }, { "epoch": 0.07573632538569425, "grad_norm": 4.096434116363525, "learning_rate": 4.999431360608915e-05, "loss": 0.3359, "step": 108 }, { "epoch": 0.07643758765778401, "grad_norm": 1.5060309171676636, "learning_rate": 4.9994074214414666e-05, "loss": 0.305, "step": 109 }, { "epoch": 0.07713884992987377, "grad_norm": 0.396414190530777, "learning_rate": 4.999382988761735e-05, "loss": 0.0603, "step": 110 }, { "epoch": 0.07784011220196353, "grad_norm": 0.7888501286506653, "learning_rate": 4.999358062574545e-05, "loss": 0.2883, "step": 111 }, { "epoch": 0.0785413744740533, "grad_norm": 0.32773908972740173, "learning_rate": 4.9993326428848186e-05, "loss": 0.0574, "step": 112 }, { "epoch": 0.07924263674614306, "grad_norm": 0.850200891494751, "learning_rate": 4.999306729697576e-05, "loss": 0.2965, "step": 113 }, { "epoch": 0.07994389901823282, "grad_norm": 1.1642791032791138, "learning_rate": 4.999280323017932e-05, "loss": 0.2948, "step": 114 }, { "epoch": 0.08064516129032258, "grad_norm": 0.2936810553073883, "learning_rate": 4.999253422851101e-05, "loss": 0.0697, "step": 115 }, { "epoch": 0.08134642356241234, "grad_norm": 0.6018584966659546, "learning_rate": 4.999226029202396e-05, "loss": 0.2776, "step": 116 }, { "epoch": 0.0820476858345021, "grad_norm": 0.28296959400177, "learning_rate": 4.999198142077225e-05, "loss": 0.0647, "step": 117 }, { "epoch": 0.08274894810659186, "grad_norm": 0.27370065450668335, "learning_rate": 4.999169761481093e-05, "loss": 0.0735, "step": 118 }, { "epoch": 0.08345021037868162, "grad_norm": 0.274480402469635, "learning_rate": 4.999140887419606e-05, "loss": 0.0744, "step": 119 }, { "epoch": 0.08415147265077139, "grad_norm": 11.641546249389648, "learning_rate": 4.999111519898464e-05, "loss": 0.749, "step": 120 }, { "epoch": 0.08485273492286115, "grad_norm": 0.26808515191078186, "learning_rate": 4.999081658923466e-05, "loss": 0.0754, "step": 121 }, { "epoch": 0.08555399719495091, "grad_norm": 0.2665489912033081, "learning_rate": 4.999051304500508e-05, "loss": 0.0772, "step": 122 }, { "epoch": 0.08625525946704067, "grad_norm": 0.2707854211330414, "learning_rate": 4.999020456635583e-05, "loss": 0.0775, "step": 123 }, { "epoch": 0.08695652173913043, "grad_norm": 0.47864794731140137, "learning_rate": 4.9989891153347844e-05, "loss": 0.2615, "step": 124 }, { "epoch": 0.0876577840112202, "grad_norm": 0.2902073860168457, "learning_rate": 4.998957280604297e-05, "loss": 0.0711, "step": 125 }, { "epoch": 0.08835904628330996, "grad_norm": 0.49670839309692383, "learning_rate": 4.998924952450409e-05, "loss": 0.2575, "step": 126 }, { "epoch": 0.08906030855539972, "grad_norm": 0.2947511374950409, "learning_rate": 4.9988921308795025e-05, "loss": 0.0727, "step": 127 }, { "epoch": 0.08976157082748948, "grad_norm": 0.48412907123565674, "learning_rate": 4.99885881589806e-05, "loss": 0.2629, "step": 128 }, { "epoch": 0.09046283309957924, "grad_norm": 0.4479179382324219, "learning_rate": 4.998825007512657e-05, "loss": 0.2606, "step": 129 }, { "epoch": 0.091164095371669, "grad_norm": 1.0980526208877563, "learning_rate": 4.998790705729971e-05, "loss": 0.4548, "step": 130 }, { "epoch": 0.09186535764375876, "grad_norm": 0.4329683184623718, "learning_rate": 4.998755910556773e-05, "loss": 0.2604, "step": 131 }, { "epoch": 0.09256661991584852, "grad_norm": 0.27008187770843506, "learning_rate": 4.9987206219999364e-05, "loss": 0.0816, "step": 132 }, { "epoch": 0.09326788218793829, "grad_norm": 0.43291333317756653, "learning_rate": 4.9986848400664255e-05, "loss": 0.2626, "step": 133 }, { "epoch": 0.09396914446002805, "grad_norm": 4.712279319763184, "learning_rate": 4.998648564763308e-05, "loss": 0.5275, "step": 134 }, { "epoch": 0.09467040673211781, "grad_norm": 0.3583643436431885, "learning_rate": 4.998611796097746e-05, "loss": 0.0825, "step": 135 }, { "epoch": 0.09537166900420757, "grad_norm": 0.39033272862434387, "learning_rate": 4.998574534076998e-05, "loss": 0.2593, "step": 136 }, { "epoch": 0.09607293127629733, "grad_norm": 0.2935941517353058, "learning_rate": 4.9985367787084245e-05, "loss": 0.0886, "step": 137 }, { "epoch": 0.0967741935483871, "grad_norm": 0.5468927025794983, "learning_rate": 4.998498529999478e-05, "loss": 0.0926, "step": 138 }, { "epoch": 0.09747545582047686, "grad_norm": 0.5334767699241638, "learning_rate": 4.998459787957711e-05, "loss": 0.0907, "step": 139 }, { "epoch": 0.09817671809256662, "grad_norm": 0.2960275709629059, "learning_rate": 4.9984205525907736e-05, "loss": 0.0902, "step": 140 }, { "epoch": 0.09887798036465638, "grad_norm": 0.2929839789867401, "learning_rate": 4.998380823906413e-05, "loss": 0.0892, "step": 141 }, { "epoch": 0.09957924263674614, "grad_norm": 0.449198454618454, "learning_rate": 4.998340601912473e-05, "loss": 0.2555, "step": 142 }, { "epoch": 0.1002805049088359, "grad_norm": 0.468272864818573, "learning_rate": 4.998299886616896e-05, "loss": 0.0845, "step": 143 }, { "epoch": 0.10098176718092566, "grad_norm": 0.3350108563899994, "learning_rate": 4.998258678027722e-05, "loss": 0.0846, "step": 144 }, { "epoch": 0.10168302945301542, "grad_norm": 3.8650717735290527, "learning_rate": 4.998216976153087e-05, "loss": 0.3372, "step": 145 }, { "epoch": 0.10238429172510519, "grad_norm": 0.3744242489337921, "learning_rate": 4.9981747810012246e-05, "loss": 0.2593, "step": 146 }, { "epoch": 0.10308555399719495, "grad_norm": 0.4089118242263794, "learning_rate": 4.998132092580468e-05, "loss": 0.0815, "step": 147 }, { "epoch": 0.10378681626928471, "grad_norm": 0.37165239453315735, "learning_rate": 4.9980889108992437e-05, "loss": 0.2541, "step": 148 }, { "epoch": 0.10448807854137447, "grad_norm": 0.3906348645687103, "learning_rate": 4.998045235966079e-05, "loss": 0.2583, "step": 149 }, { "epoch": 0.10518934081346423, "grad_norm": 0.27961039543151855, "learning_rate": 4.998001067789599e-05, "loss": 0.0869, "step": 150 }, { "epoch": 0.105890603085554, "grad_norm": 0.34451207518577576, "learning_rate": 4.9979564063785226e-05, "loss": 0.0845, "step": 151 }, { "epoch": 0.10659186535764376, "grad_norm": 0.3160581886768341, "learning_rate": 4.99791125174167e-05, "loss": 0.0829, "step": 152 }, { "epoch": 0.10729312762973352, "grad_norm": 0.3014097213745117, "learning_rate": 4.997865603887956e-05, "loss": 0.0815, "step": 153 }, { "epoch": 0.10799438990182328, "grad_norm": 0.28286251425743103, "learning_rate": 4.997819462826394e-05, "loss": 0.085, "step": 154 }, { "epoch": 0.10869565217391304, "grad_norm": 0.3890722692012787, "learning_rate": 4.997772828566095e-05, "loss": 0.2519, "step": 155 }, { "epoch": 0.1093969144460028, "grad_norm": 0.393470823764801, "learning_rate": 4.997725701116267e-05, "loss": 0.2581, "step": 156 }, { "epoch": 0.11009817671809256, "grad_norm": 0.30160367488861084, "learning_rate": 4.9976780804862156e-05, "loss": 0.0792, "step": 157 }, { "epoch": 0.11079943899018233, "grad_norm": 1.0258678197860718, "learning_rate": 4.997629966685343e-05, "loss": 0.442, "step": 158 }, { "epoch": 0.11150070126227209, "grad_norm": 0.32134032249450684, "learning_rate": 4.997581359723149e-05, "loss": 0.0722, "step": 159 }, { "epoch": 0.11220196353436185, "grad_norm": 0.42971935868263245, "learning_rate": 4.9975322596092324e-05, "loss": 0.2623, "step": 160 }, { "epoch": 0.11290322580645161, "grad_norm": 1.0330092906951904, "learning_rate": 4.997482666353287e-05, "loss": 0.4518, "step": 161 }, { "epoch": 0.11360448807854137, "grad_norm": 0.2668275535106659, "learning_rate": 4.997432579965106e-05, "loss": 0.0833, "step": 162 }, { "epoch": 0.11430575035063113, "grad_norm": 0.3018666207790375, "learning_rate": 4.9973820004545776e-05, "loss": 0.079, "step": 163 }, { "epoch": 0.1150070126227209, "grad_norm": 0.37382772564888, "learning_rate": 4.9973309278316896e-05, "loss": 0.0732, "step": 164 }, { "epoch": 0.11570827489481066, "grad_norm": 0.27213433384895325, "learning_rate": 4.997279362106527e-05, "loss": 0.0773, "step": 165 }, { "epoch": 0.11640953716690042, "grad_norm": 0.46830257773399353, "learning_rate": 4.997227303289271e-05, "loss": 0.2617, "step": 166 }, { "epoch": 0.11711079943899018, "grad_norm": 1.0025969743728638, "learning_rate": 4.9971747513902014e-05, "loss": 0.4407, "step": 167 }, { "epoch": 0.11781206171107994, "grad_norm": 0.2614319920539856, "learning_rate": 4.997121706419693e-05, "loss": 0.0827, "step": 168 }, { "epoch": 0.1185133239831697, "grad_norm": 0.39839813113212585, "learning_rate": 4.997068168388221e-05, "loss": 0.2448, "step": 169 }, { "epoch": 0.11921458625525946, "grad_norm": 0.3157074451446533, "learning_rate": 4.997014137306357e-05, "loss": 0.0722, "step": 170 }, { "epoch": 0.11991584852734923, "grad_norm": 0.30587008595466614, "learning_rate": 4.996959613184767e-05, "loss": 0.0715, "step": 171 }, { "epoch": 0.12061711079943899, "grad_norm": 0.27153757214546204, "learning_rate": 4.9969045960342197e-05, "loss": 0.0772, "step": 172 }, { "epoch": 0.12131837307152875, "grad_norm": 0.25602760910987854, "learning_rate": 4.996849085865578e-05, "loss": 0.0758, "step": 173 }, { "epoch": 0.12201963534361851, "grad_norm": 0.39778053760528564, "learning_rate": 4.9967930826898e-05, "loss": 0.2585, "step": 174 }, { "epoch": 0.12272089761570827, "grad_norm": 0.2721147835254669, "learning_rate": 4.9967365865179466e-05, "loss": 0.0756, "step": 175 }, { "epoch": 0.12342215988779803, "grad_norm": 0.26313072443008423, "learning_rate": 4.996679597361171e-05, "loss": 0.0822, "step": 176 }, { "epoch": 0.1241234221598878, "grad_norm": 0.4407467246055603, "learning_rate": 4.996622115230727e-05, "loss": 0.2571, "step": 177 }, { "epoch": 0.12482468443197756, "grad_norm": 0.4349636435508728, "learning_rate": 4.996564140137965e-05, "loss": 0.263, "step": 178 }, { "epoch": 0.12552594670406733, "grad_norm": 9.540214538574219, "learning_rate": 4.9965056720943304e-05, "loss": 0.8505, "step": 179 }, { "epoch": 0.12622720897615708, "grad_norm": 1.0185797214508057, "learning_rate": 4.996446711111369e-05, "loss": 0.4354, "step": 180 }, { "epoch": 0.12692847124824685, "grad_norm": 0.26561102271080017, "learning_rate": 4.996387257200723e-05, "loss": 0.0835, "step": 181 }, { "epoch": 0.1276297335203366, "grad_norm": 6.932794570922852, "learning_rate": 4.9963273103741305e-05, "loss": 0.7936, "step": 182 }, { "epoch": 0.12833099579242638, "grad_norm": 0.2710464596748352, "learning_rate": 4.9962668706434295e-05, "loss": 0.0863, "step": 183 }, { "epoch": 0.12903225806451613, "grad_norm": 0.2774156332015991, "learning_rate": 4.996205938020553e-05, "loss": 0.088, "step": 184 }, { "epoch": 0.1297335203366059, "grad_norm": 0.343814492225647, "learning_rate": 4.996144512517533e-05, "loss": 0.0786, "step": 185 }, { "epoch": 0.13043478260869565, "grad_norm": 0.2816801369190216, "learning_rate": 4.996082594146497e-05, "loss": 0.0904, "step": 186 }, { "epoch": 0.13113604488078542, "grad_norm": 0.3439931869506836, "learning_rate": 4.996020182919672e-05, "loss": 0.2502, "step": 187 }, { "epoch": 0.13183730715287517, "grad_norm": 0.2968119978904724, "learning_rate": 4.995957278849381e-05, "loss": 0.0851, "step": 188 }, { "epoch": 0.13253856942496495, "grad_norm": 0.3382357060909271, "learning_rate": 4.995893881948044e-05, "loss": 0.088, "step": 189 }, { "epoch": 0.1332398316970547, "grad_norm": 0.3458051383495331, "learning_rate": 4.9958299922281785e-05, "loss": 0.251, "step": 190 }, { "epoch": 0.13394109396914447, "grad_norm": 0.3315119743347168, "learning_rate": 4.995765609702401e-05, "loss": 0.2489, "step": 191 }, { "epoch": 0.13464235624123422, "grad_norm": 0.3358020782470703, "learning_rate": 4.995700734383423e-05, "loss": 0.2566, "step": 192 }, { "epoch": 0.135343618513324, "grad_norm": 0.8695023655891418, "learning_rate": 4.995635366284054e-05, "loss": 0.4152, "step": 193 }, { "epoch": 0.13604488078541374, "grad_norm": 0.28649893403053284, "learning_rate": 4.995569505417201e-05, "loss": 0.0924, "step": 194 }, { "epoch": 0.13674614305750352, "grad_norm": 0.32338714599609375, "learning_rate": 4.99550315179587e-05, "loss": 0.0897, "step": 195 }, { "epoch": 0.13744740532959326, "grad_norm": 0.40431204438209534, "learning_rate": 4.995436305433161e-05, "loss": 0.0887, "step": 196 }, { "epoch": 0.13814866760168304, "grad_norm": 0.8727932572364807, "learning_rate": 4.995368966342273e-05, "loss": 0.4125, "step": 197 }, { "epoch": 0.1388499298737728, "grad_norm": 0.291513055562973, "learning_rate": 4.995301134536503e-05, "loss": 0.0938, "step": 198 }, { "epoch": 0.13955119214586256, "grad_norm": 0.29052549600601196, "learning_rate": 4.995232810029243e-05, "loss": 0.0939, "step": 199 }, { "epoch": 0.1402524544179523, "grad_norm": 0.3701472878456116, "learning_rate": 4.995163992833986e-05, "loss": 0.0923, "step": 200 }, { "epoch": 0.14095371669004209, "grad_norm": 0.34452569484710693, "learning_rate": 4.995094682964319e-05, "loss": 0.0886, "step": 201 }, { "epoch": 0.14165497896213183, "grad_norm": 0.32716313004493713, "learning_rate": 4.995024880433926e-05, "loss": 0.257, "step": 202 }, { "epoch": 0.1423562412342216, "grad_norm": 0.3228546380996704, "learning_rate": 4.9949545852565926e-05, "loss": 0.0881, "step": 203 }, { "epoch": 0.14305750350631136, "grad_norm": 0.3638657033443451, "learning_rate": 4.994883797446196e-05, "loss": 0.0835, "step": 204 }, { "epoch": 0.14375876577840113, "grad_norm": 0.30177637934684753, "learning_rate": 4.994812517016715e-05, "loss": 0.0856, "step": 205 }, { "epoch": 0.14446002805049088, "grad_norm": 0.27158689498901367, "learning_rate": 4.9947407439822225e-05, "loss": 0.0884, "step": 206 }, { "epoch": 0.14516129032258066, "grad_norm": 0.2709910571575165, "learning_rate": 4.994668478356892e-05, "loss": 0.0799, "step": 207 }, { "epoch": 0.1458625525946704, "grad_norm": 0.2734312117099762, "learning_rate": 4.9945957201549906e-05, "loss": 0.08, "step": 208 }, { "epoch": 0.14656381486676018, "grad_norm": 0.2667778432369232, "learning_rate": 4.994522469390887e-05, "loss": 0.0859, "step": 209 }, { "epoch": 0.14726507713884993, "grad_norm": 0.2585766613483429, "learning_rate": 4.994448726079042e-05, "loss": 0.0763, "step": 210 }, { "epoch": 0.1479663394109397, "grad_norm": 0.26246362924575806, "learning_rate": 4.9943744902340175e-05, "loss": 0.0613, "step": 211 }, { "epoch": 0.14866760168302945, "grad_norm": 0.24721889197826385, "learning_rate": 4.9942997618704724e-05, "loss": 0.0731, "step": 212 }, { "epoch": 0.14936886395511922, "grad_norm": 0.25076860189437866, "learning_rate": 4.994224541003161e-05, "loss": 0.0794, "step": 213 }, { "epoch": 0.15007012622720897, "grad_norm": 0.24934066832065582, "learning_rate": 4.9941488276469355e-05, "loss": 0.0779, "step": 214 }, { "epoch": 0.15077138849929875, "grad_norm": 0.24150732159614563, "learning_rate": 4.994072621816746e-05, "loss": 0.0757, "step": 215 }, { "epoch": 0.1514726507713885, "grad_norm": 0.45080387592315674, "learning_rate": 4.99399592352764e-05, "loss": 0.2756, "step": 216 }, { "epoch": 0.15217391304347827, "grad_norm": 0.2079201489686966, "learning_rate": 4.993918732794761e-05, "loss": 0.0583, "step": 217 }, { "epoch": 0.15287517531556802, "grad_norm": 0.21744827926158905, "learning_rate": 4.993841049633351e-05, "loss": 0.0649, "step": 218 }, { "epoch": 0.1535764375876578, "grad_norm": 0.525833010673523, "learning_rate": 4.993762874058748e-05, "loss": 0.2641, "step": 219 }, { "epoch": 0.15427769985974754, "grad_norm": 0.5066496133804321, "learning_rate": 4.993684206086388e-05, "loss": 0.2839, "step": 220 }, { "epoch": 0.15497896213183732, "grad_norm": 0.509026288986206, "learning_rate": 4.993605045731804e-05, "loss": 0.2732, "step": 221 }, { "epoch": 0.15568022440392706, "grad_norm": 0.2329457700252533, "learning_rate": 4.993525393010628e-05, "loss": 0.0702, "step": 222 }, { "epoch": 0.15638148667601684, "grad_norm": 0.20849530398845673, "learning_rate": 4.993445247938585e-05, "loss": 0.0617, "step": 223 }, { "epoch": 0.1570827489481066, "grad_norm": 0.21080382168293, "learning_rate": 4.9933646105315027e-05, "loss": 0.0624, "step": 224 }, { "epoch": 0.15778401122019636, "grad_norm": 0.1671798676252365, "learning_rate": 4.9932834808052996e-05, "loss": 0.0449, "step": 225 }, { "epoch": 0.1584852734922861, "grad_norm": 0.20914548635482788, "learning_rate": 4.9932018587759975e-05, "loss": 0.0609, "step": 226 }, { "epoch": 0.15918653576437589, "grad_norm": 0.15776701271533966, "learning_rate": 4.9931197444597124e-05, "loss": 0.044, "step": 227 }, { "epoch": 0.15988779803646563, "grad_norm": 19.427234649658203, "learning_rate": 4.993037137872657e-05, "loss": 1.4031, "step": 228 }, { "epoch": 0.1605890603085554, "grad_norm": 1.2697529792785645, "learning_rate": 4.992954039031143e-05, "loss": 0.4941, "step": 229 }, { "epoch": 0.16129032258064516, "grad_norm": 0.15186026692390442, "learning_rate": 4.992870447951578e-05, "loss": 0.0364, "step": 230 }, { "epoch": 0.16199158485273493, "grad_norm": 0.20261384546756744, "learning_rate": 4.992786364650467e-05, "loss": 0.0597, "step": 231 }, { "epoch": 0.16269284712482468, "grad_norm": 0.2043345421552658, "learning_rate": 4.9927017891444136e-05, "loss": 0.0602, "step": 232 }, { "epoch": 0.16339410939691446, "grad_norm": 0.5600248575210571, "learning_rate": 4.992616721450116e-05, "loss": 0.2909, "step": 233 }, { "epoch": 0.1640953716690042, "grad_norm": 9.573511123657227, "learning_rate": 4.992531161584371e-05, "loss": 0.9617, "step": 234 }, { "epoch": 0.16479663394109398, "grad_norm": 0.5033115744590759, "learning_rate": 4.992445109564073e-05, "loss": 0.287, "step": 235 }, { "epoch": 0.16549789621318373, "grad_norm": 0.5206232070922852, "learning_rate": 4.992358565406214e-05, "loss": 0.2608, "step": 236 }, { "epoch": 0.1661991584852735, "grad_norm": 1.113784670829773, "learning_rate": 4.992271529127881e-05, "loss": 0.4774, "step": 237 }, { "epoch": 0.16690042075736325, "grad_norm": 0.5042740702629089, "learning_rate": 4.9921840007462594e-05, "loss": 0.2508, "step": 238 }, { "epoch": 0.16760168302945302, "grad_norm": 0.4459116756916046, "learning_rate": 4.9920959802786324e-05, "loss": 0.277, "step": 239 }, { "epoch": 0.16830294530154277, "grad_norm": 0.2361891269683838, "learning_rate": 4.99200746774238e-05, "loss": 0.0768, "step": 240 }, { "epoch": 0.16900420757363255, "grad_norm": 5.365720748901367, "learning_rate": 4.991918463154979e-05, "loss": 0.662, "step": 241 }, { "epoch": 0.1697054698457223, "grad_norm": 0.3727276921272278, "learning_rate": 4.9918289665340026e-05, "loss": 0.2554, "step": 242 }, { "epoch": 0.17040673211781207, "grad_norm": 0.24112652242183685, "learning_rate": 4.991738977897123e-05, "loss": 0.0757, "step": 243 }, { "epoch": 0.17110799438990182, "grad_norm": 0.24592800438404083, "learning_rate": 4.9916484972621084e-05, "loss": 0.0838, "step": 244 }, { "epoch": 0.1718092566619916, "grad_norm": 0.3961750268936157, "learning_rate": 4.991557524646825e-05, "loss": 0.2584, "step": 245 }, { "epoch": 0.17251051893408134, "grad_norm": 0.2484811544418335, "learning_rate": 4.991466060069234e-05, "loss": 0.0853, "step": 246 }, { "epoch": 0.17321178120617112, "grad_norm": 0.8793482184410095, "learning_rate": 4.9913741035473965e-05, "loss": 0.4267, "step": 247 }, { "epoch": 0.17391304347826086, "grad_norm": 0.3405586779117584, "learning_rate": 4.991281655099469e-05, "loss": 0.2484, "step": 248 }, { "epoch": 0.17461430575035064, "grad_norm": 0.3241943120956421, "learning_rate": 4.991188714743706e-05, "loss": 0.2585, "step": 249 }, { "epoch": 0.1753155680224404, "grad_norm": 0.31021037697792053, "learning_rate": 4.991095282498458e-05, "loss": 0.2558, "step": 250 }, { "epoch": 0.17601683029453016, "grad_norm": 0.2590515613555908, "learning_rate": 4.991001358382174e-05, "loss": 0.09, "step": 251 }, { "epoch": 0.1767180925666199, "grad_norm": 0.8347843885421753, "learning_rate": 4.990906942413399e-05, "loss": 0.4105, "step": 252 }, { "epoch": 0.1774193548387097, "grad_norm": 0.32771119475364685, "learning_rate": 4.990812034610776e-05, "loss": 0.25, "step": 253 }, { "epoch": 0.17812061711079943, "grad_norm": 0.3103218674659729, "learning_rate": 4.990716634993045e-05, "loss": 0.0895, "step": 254 }, { "epoch": 0.1788218793828892, "grad_norm": 0.28891319036483765, "learning_rate": 4.9906207435790414e-05, "loss": 0.2502, "step": 255 }, { "epoch": 0.17952314165497896, "grad_norm": 1.2832095623016357, "learning_rate": 4.9905243603877004e-05, "loss": 0.2596, "step": 256 }, { "epoch": 0.18022440392706873, "grad_norm": 0.30304932594299316, "learning_rate": 4.990427485438053e-05, "loss": 0.2499, "step": 257 }, { "epoch": 0.18092566619915848, "grad_norm": 0.27757903933525085, "learning_rate": 4.990330118749227e-05, "loss": 0.0963, "step": 258 }, { "epoch": 0.18162692847124826, "grad_norm": 0.2811563313007355, "learning_rate": 4.990232260340447e-05, "loss": 0.0974, "step": 259 }, { "epoch": 0.182328190743338, "grad_norm": 0.37628060579299927, "learning_rate": 4.9901339102310375e-05, "loss": 0.1024, "step": 260 }, { "epoch": 0.18302945301542778, "grad_norm": 0.28913789987564087, "learning_rate": 4.990035068440415e-05, "loss": 0.0991, "step": 261 }, { "epoch": 0.18373071528751753, "grad_norm": 0.2867695391178131, "learning_rate": 4.989935734988098e-05, "loss": 0.0984, "step": 262 }, { "epoch": 0.1844319775596073, "grad_norm": 0.28791242837905884, "learning_rate": 4.989835909893698e-05, "loss": 0.0992, "step": 263 }, { "epoch": 0.18513323983169705, "grad_norm": 0.36766406893730164, "learning_rate": 4.9897355931769294e-05, "loss": 0.1037, "step": 264 }, { "epoch": 0.18583450210378682, "grad_norm": 0.30257436633110046, "learning_rate": 4.9896347848575964e-05, "loss": 0.25, "step": 265 }, { "epoch": 0.18653576437587657, "grad_norm": 0.32402658462524414, "learning_rate": 4.9895334849556044e-05, "loss": 0.0989, "step": 266 }, { "epoch": 0.18723702664796635, "grad_norm": 0.35825371742248535, "learning_rate": 4.989431693490957e-05, "loss": 0.1011, "step": 267 }, { "epoch": 0.1879382889200561, "grad_norm": 0.2973780930042267, "learning_rate": 4.989329410483751e-05, "loss": 0.0958, "step": 268 }, { "epoch": 0.18863955119214587, "grad_norm": 0.30316290259361267, "learning_rate": 4.989226635954183e-05, "loss": 0.0954, "step": 269 }, { "epoch": 0.18934081346423562, "grad_norm": 0.27011147141456604, "learning_rate": 4.989123369922547e-05, "loss": 0.0945, "step": 270 }, { "epoch": 0.1900420757363254, "grad_norm": 0.28861624002456665, "learning_rate": 4.9890196124092313e-05, "loss": 0.0924, "step": 271 }, { "epoch": 0.19074333800841514, "grad_norm": 0.26316800713539124, "learning_rate": 4.988915363434725e-05, "loss": 0.0921, "step": 272 }, { "epoch": 0.19144460028050492, "grad_norm": 0.30640730261802673, "learning_rate": 4.988810623019611e-05, "loss": 0.2537, "step": 273 }, { "epoch": 0.19214586255259467, "grad_norm": 1.8029754161834717, "learning_rate": 4.98870539118457e-05, "loss": 0.3145, "step": 274 }, { "epoch": 0.19284712482468444, "grad_norm": 0.255055695772171, "learning_rate": 4.9885996679503815e-05, "loss": 0.0898, "step": 275 }, { "epoch": 0.1935483870967742, "grad_norm": 0.25440922379493713, "learning_rate": 4.98849345333792e-05, "loss": 0.0896, "step": 276 }, { "epoch": 0.19424964936886396, "grad_norm": 0.26189056038856506, "learning_rate": 4.988386747368158e-05, "loss": 0.0871, "step": 277 }, { "epoch": 0.1949509116409537, "grad_norm": 0.2509978711605072, "learning_rate": 4.988279550062165e-05, "loss": 0.089, "step": 278 }, { "epoch": 0.1956521739130435, "grad_norm": 0.3035840690135956, "learning_rate": 4.988171861441106e-05, "loss": 0.2543, "step": 279 }, { "epoch": 0.19635343618513323, "grad_norm": 0.2571934163570404, "learning_rate": 4.988063681526246e-05, "loss": 0.0853, "step": 280 }, { "epoch": 0.197054698457223, "grad_norm": 0.3193231523036957, "learning_rate": 4.9879550103389444e-05, "loss": 0.253, "step": 281 }, { "epoch": 0.19775596072931276, "grad_norm": 0.2549659311771393, "learning_rate": 4.987845847900658e-05, "loss": 0.084, "step": 282 }, { "epoch": 0.19845722300140253, "grad_norm": 0.24674060940742493, "learning_rate": 4.987736194232943e-05, "loss": 0.0868, "step": 283 }, { "epoch": 0.19915848527349228, "grad_norm": 0.24441787600517273, "learning_rate": 4.987626049357449e-05, "loss": 0.0866, "step": 284 }, { "epoch": 0.19985974754558206, "grad_norm": 0.2557724118232727, "learning_rate": 4.9875154132959243e-05, "loss": 0.0834, "step": 285 }, { "epoch": 0.2005610098176718, "grad_norm": 0.24466341733932495, "learning_rate": 4.987404286070216e-05, "loss": 0.0817, "step": 286 }, { "epoch": 0.20126227208976158, "grad_norm": 0.2396153211593628, "learning_rate": 4.987292667702263e-05, "loss": 0.085, "step": 287 }, { "epoch": 0.20196353436185133, "grad_norm": 0.33054494857788086, "learning_rate": 4.9871805582141085e-05, "loss": 0.2586, "step": 288 }, { "epoch": 0.2026647966339411, "grad_norm": 0.24127113819122314, "learning_rate": 4.987067957627886e-05, "loss": 0.08, "step": 289 }, { "epoch": 0.20336605890603085, "grad_norm": 0.2591036856174469, "learning_rate": 4.98695486596583e-05, "loss": 0.0753, "step": 290 }, { "epoch": 0.20406732117812063, "grad_norm": 0.3395358920097351, "learning_rate": 4.98684128325027e-05, "loss": 0.2623, "step": 291 }, { "epoch": 0.20476858345021037, "grad_norm": 0.23429878056049347, "learning_rate": 4.9867272095036324e-05, "loss": 0.0772, "step": 292 }, { "epoch": 0.20546984572230015, "grad_norm": 0.23134566843509674, "learning_rate": 4.986612644748442e-05, "loss": 0.076, "step": 293 }, { "epoch": 0.2061711079943899, "grad_norm": 0.3670242428779602, "learning_rate": 4.986497589007321e-05, "loss": 0.2655, "step": 294 }, { "epoch": 0.20687237026647967, "grad_norm": 0.2599996030330658, "learning_rate": 4.986382042302985e-05, "loss": 0.065, "step": 295 }, { "epoch": 0.20757363253856942, "grad_norm": 0.2329290360212326, "learning_rate": 4.986266004658251e-05, "loss": 0.0804, "step": 296 }, { "epoch": 0.2082748948106592, "grad_norm": 0.3797686994075775, "learning_rate": 4.9861494760960306e-05, "loss": 0.2608, "step": 297 }, { "epoch": 0.20897615708274894, "grad_norm": 0.9227772951126099, "learning_rate": 4.98603245663933e-05, "loss": 0.4545, "step": 298 }, { "epoch": 0.20967741935483872, "grad_norm": 0.3717903792858124, "learning_rate": 4.9859149463112586e-05, "loss": 0.2661, "step": 299 }, { "epoch": 0.21037868162692847, "grad_norm": 0.23223714530467987, "learning_rate": 4.9857969451350164e-05, "loss": 0.0684, "step": 300 }, { "epoch": 0.21107994389901824, "grad_norm": 0.23738601803779602, "learning_rate": 4.985678453133904e-05, "loss": 0.0552, "step": 301 }, { "epoch": 0.211781206171108, "grad_norm": 0.2283981442451477, "learning_rate": 4.985559470331317e-05, "loss": 0.0736, "step": 302 }, { "epoch": 0.21248246844319776, "grad_norm": 0.2304806262254715, "learning_rate": 4.9854399967507506e-05, "loss": 0.0793, "step": 303 }, { "epoch": 0.2131837307152875, "grad_norm": 0.37785378098487854, "learning_rate": 4.985320032415792e-05, "loss": 0.2698, "step": 304 }, { "epoch": 0.2138849929873773, "grad_norm": 0.3872238099575043, "learning_rate": 4.9851995773501315e-05, "loss": 0.2611, "step": 305 }, { "epoch": 0.21458625525946703, "grad_norm": 0.21849776804447174, "learning_rate": 4.9850786315775525e-05, "loss": 0.066, "step": 306 }, { "epoch": 0.2152875175315568, "grad_norm": 0.3847895860671997, "learning_rate": 4.984957195121934e-05, "loss": 0.2698, "step": 307 }, { "epoch": 0.21598877980364656, "grad_norm": 0.22245678305625916, "learning_rate": 4.984835268007255e-05, "loss": 0.0725, "step": 308 }, { "epoch": 0.21669004207573633, "grad_norm": 0.2230721116065979, "learning_rate": 4.9847128502575916e-05, "loss": 0.0715, "step": 309 }, { "epoch": 0.21739130434782608, "grad_norm": 0.4115176498889923, "learning_rate": 4.984589941897113e-05, "loss": 0.2625, "step": 310 }, { "epoch": 0.21809256661991586, "grad_norm": 0.22501502931118011, "learning_rate": 4.9844665429500895e-05, "loss": 0.0705, "step": 311 }, { "epoch": 0.2187938288920056, "grad_norm": 0.4025037884712219, "learning_rate": 4.984342653440886e-05, "loss": 0.2638, "step": 312 }, { "epoch": 0.21949509116409538, "grad_norm": 0.40614446997642517, "learning_rate": 4.9842182733939635e-05, "loss": 0.2638, "step": 313 }, { "epoch": 0.22019635343618513, "grad_norm": 0.23159518837928772, "learning_rate": 4.984093402833883e-05, "loss": 0.0786, "step": 314 }, { "epoch": 0.2208976157082749, "grad_norm": 0.2191554307937622, "learning_rate": 4.983968041785298e-05, "loss": 0.0713, "step": 315 }, { "epoch": 0.22159887798036465, "grad_norm": 0.40401139855384827, "learning_rate": 4.9838421902729644e-05, "loss": 0.2621, "step": 316 }, { "epoch": 0.22230014025245443, "grad_norm": 0.21113920211791992, "learning_rate": 4.98371584832173e-05, "loss": 0.0641, "step": 317 }, { "epoch": 0.22300140252454417, "grad_norm": 0.20844130218029022, "learning_rate": 4.98358901595654e-05, "loss": 0.0638, "step": 318 }, { "epoch": 0.22370266479663395, "grad_norm": 0.21600013971328735, "learning_rate": 4.983461693202439e-05, "loss": 0.0651, "step": 319 }, { "epoch": 0.2244039270687237, "grad_norm": 0.21861328184604645, "learning_rate": 4.983333880084568e-05, "loss": 0.0707, "step": 320 }, { "epoch": 0.22510518934081347, "grad_norm": 0.3780987858772278, "learning_rate": 4.983205576628163e-05, "loss": 0.2675, "step": 321 }, { "epoch": 0.22580645161290322, "grad_norm": 0.21367427706718445, "learning_rate": 4.9830767828585566e-05, "loss": 0.07, "step": 322 }, { "epoch": 0.226507713884993, "grad_norm": 0.21772611141204834, "learning_rate": 4.9829474988011813e-05, "loss": 0.0701, "step": 323 }, { "epoch": 0.22720897615708274, "grad_norm": 0.23117288947105408, "learning_rate": 4.982817724481564e-05, "loss": 0.0778, "step": 324 }, { "epoch": 0.22791023842917252, "grad_norm": 0.21202294528484344, "learning_rate": 4.982687459925328e-05, "loss": 0.0686, "step": 325 }, { "epoch": 0.22861150070126227, "grad_norm": 0.42791709303855896, "learning_rate": 4.9825567051581944e-05, "loss": 0.2674, "step": 326 }, { "epoch": 0.22931276297335204, "grad_norm": 0.40409332513809204, "learning_rate": 4.9824254602059815e-05, "loss": 0.2727, "step": 327 }, { "epoch": 0.2300140252454418, "grad_norm": 0.19239529967308044, "learning_rate": 4.982293725094604e-05, "loss": 0.0586, "step": 328 }, { "epoch": 0.23071528751753156, "grad_norm": 0.22719323635101318, "learning_rate": 4.982161499850073e-05, "loss": 0.0762, "step": 329 }, { "epoch": 0.2314165497896213, "grad_norm": 0.22618328034877777, "learning_rate": 4.9820287844984956e-05, "loss": 0.075, "step": 330 }, { "epoch": 0.2321178120617111, "grad_norm": 0.20985504984855652, "learning_rate": 4.981895579066078e-05, "loss": 0.0669, "step": 331 }, { "epoch": 0.23281907433380084, "grad_norm": 0.22742348909378052, "learning_rate": 4.981761883579122e-05, "loss": 0.0744, "step": 332 }, { "epoch": 0.2335203366058906, "grad_norm": 8.938209533691406, "learning_rate": 4.981627698064025e-05, "loss": 0.9939, "step": 333 }, { "epoch": 0.23422159887798036, "grad_norm": 0.2256510853767395, "learning_rate": 4.9814930225472824e-05, "loss": 0.073, "step": 334 }, { "epoch": 0.23492286115007013, "grad_norm": 0.20603115856647491, "learning_rate": 4.981357857055486e-05, "loss": 0.0653, "step": 335 }, { "epoch": 0.23562412342215988, "grad_norm": 1.0092064142227173, "learning_rate": 4.9812222016153254e-05, "loss": 0.4713, "step": 336 }, { "epoch": 0.23632538569424966, "grad_norm": 0.44663774967193604, "learning_rate": 4.981086056253584e-05, "loss": 0.2662, "step": 337 }, { "epoch": 0.2370266479663394, "grad_norm": 0.20599833130836487, "learning_rate": 4.9809494209971467e-05, "loss": 0.0662, "step": 338 }, { "epoch": 0.23772791023842918, "grad_norm": 0.20768706500530243, "learning_rate": 4.980812295872991e-05, "loss": 0.0672, "step": 339 }, { "epoch": 0.23842917251051893, "grad_norm": 0.46793609857559204, "learning_rate": 4.980674680908192e-05, "loss": 0.2605, "step": 340 }, { "epoch": 0.2391304347826087, "grad_norm": 6.021070957183838, "learning_rate": 4.980536576129921e-05, "loss": 0.7834, "step": 341 }, { "epoch": 0.23983169705469845, "grad_norm": 0.22994506359100342, "learning_rate": 4.9803979815654505e-05, "loss": 0.0769, "step": 342 }, { "epoch": 0.24053295932678823, "grad_norm": 0.2292703539133072, "learning_rate": 4.9802588972421434e-05, "loss": 0.0767, "step": 343 }, { "epoch": 0.24123422159887797, "grad_norm": 0.23139742016792297, "learning_rate": 4.980119323187463e-05, "loss": 0.0782, "step": 344 }, { "epoch": 0.24193548387096775, "grad_norm": 0.23098242282867432, "learning_rate": 4.979979259428968e-05, "loss": 0.0782, "step": 345 }, { "epoch": 0.2426367461430575, "grad_norm": 0.2215109020471573, "learning_rate": 4.979838705994315e-05, "loss": 0.071, "step": 346 }, { "epoch": 0.24333800841514727, "grad_norm": 0.2182546854019165, "learning_rate": 4.979697662911256e-05, "loss": 0.0709, "step": 347 }, { "epoch": 0.24403927068723702, "grad_norm": 0.2707327902317047, "learning_rate": 4.97955613020764e-05, "loss": 0.0743, "step": 348 }, { "epoch": 0.2447405329593268, "grad_norm": 0.6338597536087036, "learning_rate": 4.9794141079114135e-05, "loss": 0.259, "step": 349 }, { "epoch": 0.24544179523141654, "grad_norm": 0.39071857929229736, "learning_rate": 4.9792715960506187e-05, "loss": 0.2591, "step": 350 }, { "epoch": 0.24614305750350632, "grad_norm": 0.23533998429775238, "learning_rate": 4.979128594653395e-05, "loss": 0.072, "step": 351 }, { "epoch": 0.24684431977559607, "grad_norm": 0.22338388860225677, "learning_rate": 4.978985103747977e-05, "loss": 0.0652, "step": 352 }, { "epoch": 0.24754558204768584, "grad_norm": 0.4078010618686676, "learning_rate": 4.9788411233627e-05, "loss": 0.2547, "step": 353 }, { "epoch": 0.2482468443197756, "grad_norm": 4.14848518371582, "learning_rate": 4.97869665352599e-05, "loss": 0.4587, "step": 354 }, { "epoch": 0.24894810659186536, "grad_norm": 0.4267268180847168, "learning_rate": 4.9785516942663756e-05, "loss": 0.2574, "step": 355 }, { "epoch": 0.2496493688639551, "grad_norm": 2.1621205806732178, "learning_rate": 4.9784062456124775e-05, "loss": 0.4076, "step": 356 }, { "epoch": 0.2503506311360449, "grad_norm": 0.4625076353549957, "learning_rate": 4.978260307593015e-05, "loss": 0.2505, "step": 357 }, { "epoch": 0.25105189340813466, "grad_norm": 0.3201969265937805, "learning_rate": 4.978113880236805e-05, "loss": 0.079, "step": 358 }, { "epoch": 0.2517531556802244, "grad_norm": 0.6193740367889404, "learning_rate": 4.977966963572758e-05, "loss": 0.2691, "step": 359 }, { "epoch": 0.25245441795231416, "grad_norm": 1.1117448806762695, "learning_rate": 4.977819557629884e-05, "loss": 0.2502, "step": 360 }, { "epoch": 0.25315568022440393, "grad_norm": 0.41331925988197327, "learning_rate": 4.977671662437288e-05, "loss": 0.0935, "step": 361 }, { "epoch": 0.2538569424964937, "grad_norm": 0.6518324017524719, "learning_rate": 4.9775232780241735e-05, "loss": 0.1044, "step": 362 }, { "epoch": 0.25455820476858343, "grad_norm": 0.5056829452514648, "learning_rate": 4.977374404419837e-05, "loss": 0.0942, "step": 363 }, { "epoch": 0.2552594670406732, "grad_norm": 0.3945491909980774, "learning_rate": 4.977225041653676e-05, "loss": 0.0872, "step": 364 }, { "epoch": 0.255960729312763, "grad_norm": 0.41435596346855164, "learning_rate": 4.9770751897551816e-05, "loss": 0.2528, "step": 365 }, { "epoch": 0.25666199158485276, "grad_norm": 1.0752402544021606, "learning_rate": 4.976924848753942e-05, "loss": 0.2716, "step": 366 }, { "epoch": 0.2573632538569425, "grad_norm": 2.242830753326416, "learning_rate": 4.976774018679643e-05, "loss": 0.0993, "step": 367 }, { "epoch": 0.25806451612903225, "grad_norm": 0.3835015594959259, "learning_rate": 4.976622699562066e-05, "loss": 0.2506, "step": 368 }, { "epoch": 0.258765778401122, "grad_norm": 0.2619876265525818, "learning_rate": 4.976470891431089e-05, "loss": 0.0863, "step": 369 }, { "epoch": 0.2594670406732118, "grad_norm": 0.5994099378585815, "learning_rate": 4.976318594316687e-05, "loss": 0.2409, "step": 370 }, { "epoch": 0.2601683029453015, "grad_norm": 0.27995193004608154, "learning_rate": 4.9761658082489307e-05, "loss": 0.0758, "step": 371 }, { "epoch": 0.2608695652173913, "grad_norm": 0.36166948080062866, "learning_rate": 4.976012533257989e-05, "loss": 0.2551, "step": 372 }, { "epoch": 0.2615708274894811, "grad_norm": 6.7222676277160645, "learning_rate": 4.9758587693741254e-05, "loss": 0.8995, "step": 373 }, { "epoch": 0.26227208976157085, "grad_norm": 0.929538369178772, "learning_rate": 4.9757045166277014e-05, "loss": 0.4456, "step": 374 }, { "epoch": 0.26297335203366057, "grad_norm": 0.30816730856895447, "learning_rate": 4.9755497750491744e-05, "loss": 0.083, "step": 375 }, { "epoch": 0.26367461430575034, "grad_norm": 0.316251277923584, "learning_rate": 4.975394544669098e-05, "loss": 0.078, "step": 376 }, { "epoch": 0.2643758765778401, "grad_norm": 0.2523244023323059, "learning_rate": 4.975238825518123e-05, "loss": 0.0869, "step": 377 }, { "epoch": 0.2650771388499299, "grad_norm": 0.3928658366203308, "learning_rate": 4.975082617626996e-05, "loss": 0.0837, "step": 378 }, { "epoch": 0.2657784011220196, "grad_norm": 0.2759401202201843, "learning_rate": 4.974925921026561e-05, "loss": 0.083, "step": 379 }, { "epoch": 0.2664796633941094, "grad_norm": 0.3051683306694031, "learning_rate": 4.9747687357477586e-05, "loss": 0.0776, "step": 380 }, { "epoch": 0.26718092566619916, "grad_norm": 0.3596304655075073, "learning_rate": 4.974611061821625e-05, "loss": 0.2573, "step": 381 }, { "epoch": 0.26788218793828894, "grad_norm": 0.3551197946071625, "learning_rate": 4.974452899279292e-05, "loss": 0.2593, "step": 382 }, { "epoch": 0.26858345021037866, "grad_norm": 0.2501370906829834, "learning_rate": 4.9742942481519897e-05, "loss": 0.0845, "step": 383 }, { "epoch": 0.26928471248246844, "grad_norm": 0.617230236530304, "learning_rate": 4.974135108471044e-05, "loss": 0.2402, "step": 384 }, { "epoch": 0.2699859747545582, "grad_norm": 0.4218927025794983, "learning_rate": 4.973975480267879e-05, "loss": 0.2622, "step": 385 }, { "epoch": 0.270687237026648, "grad_norm": 0.3798485994338989, "learning_rate": 4.973815363574011e-05, "loss": 0.2532, "step": 386 }, { "epoch": 0.2713884992987377, "grad_norm": 0.5771559476852417, "learning_rate": 4.973654758421056e-05, "loss": 0.087, "step": 387 }, { "epoch": 0.2720897615708275, "grad_norm": 0.2532389163970947, "learning_rate": 4.973493664840726e-05, "loss": 0.0845, "step": 388 }, { "epoch": 0.27279102384291726, "grad_norm": 3.2845327854156494, "learning_rate": 4.97333208286483e-05, "loss": 0.4361, "step": 389 }, { "epoch": 0.27349228611500703, "grad_norm": 3.773503303527832, "learning_rate": 4.9731700125252724e-05, "loss": 0.6089, "step": 390 }, { "epoch": 0.27419354838709675, "grad_norm": 0.2509194016456604, "learning_rate": 4.973007453854053e-05, "loss": 0.0856, "step": 391 }, { "epoch": 0.27489481065918653, "grad_norm": 0.2801031470298767, "learning_rate": 4.9728444068832706e-05, "loss": 0.0824, "step": 392 }, { "epoch": 0.2755960729312763, "grad_norm": 0.3518745005130768, "learning_rate": 4.972680871645119e-05, "loss": 0.258, "step": 393 }, { "epoch": 0.2762973352033661, "grad_norm": 2.767435312271118, "learning_rate": 4.9725168481718874e-05, "loss": 0.3348, "step": 394 }, { "epoch": 0.2769985974754558, "grad_norm": 0.9225994944572449, "learning_rate": 4.972352336495964e-05, "loss": 0.2329, "step": 395 }, { "epoch": 0.2776998597475456, "grad_norm": 0.4242871105670929, "learning_rate": 4.9721873366498304e-05, "loss": 0.0931, "step": 396 }, { "epoch": 0.27840112201963535, "grad_norm": 0.2771666944026947, "learning_rate": 4.972021848666068e-05, "loss": 0.0913, "step": 397 }, { "epoch": 0.2791023842917251, "grad_norm": 2.1785690784454346, "learning_rate": 4.9718558725773513e-05, "loss": 0.2248, "step": 398 }, { "epoch": 0.27980364656381485, "grad_norm": 0.5499383807182312, "learning_rate": 4.971689408416452e-05, "loss": 0.109, "step": 399 }, { "epoch": 0.2805049088359046, "grad_norm": 0.6320145130157471, "learning_rate": 4.97152245621624e-05, "loss": 0.1086, "step": 400 }, { "epoch": 0.2812061711079944, "grad_norm": 0.30803006887435913, "learning_rate": 4.97135501600968e-05, "loss": 0.2564, "step": 401 }, { "epoch": 0.28190743338008417, "grad_norm": 0.30871251225471497, "learning_rate": 4.971187087829834e-05, "loss": 0.2545, "step": 402 }, { "epoch": 0.2826086956521739, "grad_norm": 0.4196130633354187, "learning_rate": 4.971018671709858e-05, "loss": 0.1006, "step": 403 }, { "epoch": 0.28330995792426367, "grad_norm": 0.5244584083557129, "learning_rate": 4.970849767683007e-05, "loss": 0.1066, "step": 404 }, { "epoch": 0.28401122019635344, "grad_norm": 0.2978798747062683, "learning_rate": 4.9706803757826315e-05, "loss": 0.0927, "step": 405 }, { "epoch": 0.2847124824684432, "grad_norm": 0.2841836214065552, "learning_rate": 4.970510496042178e-05, "loss": 0.0916, "step": 406 }, { "epoch": 0.28541374474053294, "grad_norm": 0.4206264615058899, "learning_rate": 4.97034012849519e-05, "loss": 0.2519, "step": 407 }, { "epoch": 0.2861150070126227, "grad_norm": 0.3436738848686218, "learning_rate": 4.970169273175307e-05, "loss": 0.2548, "step": 408 }, { "epoch": 0.2868162692847125, "grad_norm": 0.3259830176830292, "learning_rate": 4.969997930116264e-05, "loss": 0.0914, "step": 409 }, { "epoch": 0.28751753155680226, "grad_norm": 0.32724055647850037, "learning_rate": 4.969826099351892e-05, "loss": 0.2572, "step": 410 }, { "epoch": 0.288218793828892, "grad_norm": 1.8583519458770752, "learning_rate": 4.969653780916121e-05, "loss": 0.2758, "step": 411 }, { "epoch": 0.28892005610098176, "grad_norm": 0.39576396346092224, "learning_rate": 4.9694809748429756e-05, "loss": 0.0848, "step": 412 }, { "epoch": 0.28962131837307153, "grad_norm": 0.29500314593315125, "learning_rate": 4.969307681166576e-05, "loss": 0.087, "step": 413 }, { "epoch": 0.2903225806451613, "grad_norm": 0.366472452878952, "learning_rate": 4.969133899921139e-05, "loss": 0.2521, "step": 414 }, { "epoch": 0.29102384291725103, "grad_norm": 0.2539809048175812, "learning_rate": 4.968959631140978e-05, "loss": 0.0873, "step": 415 }, { "epoch": 0.2917251051893408, "grad_norm": 0.2875874638557434, "learning_rate": 4.9687848748605036e-05, "loss": 0.0862, "step": 416 }, { "epoch": 0.2924263674614306, "grad_norm": 0.3355823755264282, "learning_rate": 4.968609631114222e-05, "loss": 0.256, "step": 417 }, { "epoch": 0.29312762973352036, "grad_norm": 0.3311085104942322, "learning_rate": 4.9684338999367336e-05, "loss": 0.2559, "step": 418 }, { "epoch": 0.2938288920056101, "grad_norm": 0.34592464566230774, "learning_rate": 4.968257681362738e-05, "loss": 0.2557, "step": 419 }, { "epoch": 0.29453015427769985, "grad_norm": 1.417022705078125, "learning_rate": 4.9680809754270295e-05, "loss": 0.6043, "step": 420 }, { "epoch": 0.2952314165497896, "grad_norm": 0.2510850429534912, "learning_rate": 4.9679037821645e-05, "loss": 0.0862, "step": 421 }, { "epoch": 0.2959326788218794, "grad_norm": 0.28335535526275635, "learning_rate": 4.967726101610135e-05, "loss": 0.0853, "step": 422 }, { "epoch": 0.2966339410939691, "grad_norm": 0.2789595425128937, "learning_rate": 4.967547933799019e-05, "loss": 0.0846, "step": 423 }, { "epoch": 0.2973352033660589, "grad_norm": 0.8539701104164124, "learning_rate": 4.9673692787663314e-05, "loss": 0.4262, "step": 424 }, { "epoch": 0.2980364656381487, "grad_norm": 0.3776937425136566, "learning_rate": 4.967190136547347e-05, "loss": 0.2549, "step": 425 }, { "epoch": 0.29873772791023845, "grad_norm": 0.3433000445365906, "learning_rate": 4.9670105071774385e-05, "loss": 0.2555, "step": 426 }, { "epoch": 0.29943899018232817, "grad_norm": 4.10491943359375, "learning_rate": 4.9668303906920747e-05, "loss": 0.5343, "step": 427 }, { "epoch": 0.30014025245441794, "grad_norm": 0.3174687922000885, "learning_rate": 4.966649787126818e-05, "loss": 0.2556, "step": 428 }, { "epoch": 0.3008415147265077, "grad_norm": 0.26853328943252563, "learning_rate": 4.9664686965173316e-05, "loss": 0.0901, "step": 429 }, { "epoch": 0.3015427769985975, "grad_norm": 0.3341194987297058, "learning_rate": 4.9662871188993694e-05, "loss": 0.0938, "step": 430 }, { "epoch": 0.3022440392706872, "grad_norm": 0.33745110034942627, "learning_rate": 4.9661050543087854e-05, "loss": 0.094, "step": 431 }, { "epoch": 0.302945301542777, "grad_norm": 0.8763027191162109, "learning_rate": 4.965922502781529e-05, "loss": 0.4051, "step": 432 }, { "epoch": 0.30364656381486677, "grad_norm": 0.30836260318756104, "learning_rate": 4.9657394643536446e-05, "loss": 0.252, "step": 433 }, { "epoch": 0.30434782608695654, "grad_norm": 0.3050583302974701, "learning_rate": 4.965555939061274e-05, "loss": 0.2512, "step": 434 }, { "epoch": 0.30504908835904626, "grad_norm": 0.35423755645751953, "learning_rate": 4.965371926940655e-05, "loss": 0.2435, "step": 435 }, { "epoch": 0.30575035063113604, "grad_norm": 0.4966770112514496, "learning_rate": 4.96518742802812e-05, "loss": 0.104, "step": 436 }, { "epoch": 0.3064516129032258, "grad_norm": 0.4316244423389435, "learning_rate": 4.965002442360098e-05, "loss": 0.1051, "step": 437 }, { "epoch": 0.3071528751753156, "grad_norm": 0.33840543031692505, "learning_rate": 4.964816969973117e-05, "loss": 0.0992, "step": 438 }, { "epoch": 0.3078541374474053, "grad_norm": 0.4919021427631378, "learning_rate": 4.9646310109037975e-05, "loss": 0.103, "step": 439 }, { "epoch": 0.3085553997194951, "grad_norm": 0.32021504640579224, "learning_rate": 4.964444565188857e-05, "loss": 0.0987, "step": 440 }, { "epoch": 0.30925666199158486, "grad_norm": 0.4201069474220276, "learning_rate": 4.96425763286511e-05, "loss": 0.0976, "step": 441 }, { "epoch": 0.30995792426367463, "grad_norm": 0.29684045910835266, "learning_rate": 4.964070213969467e-05, "loss": 0.2519, "step": 442 }, { "epoch": 0.31065918653576435, "grad_norm": 0.32813337445259094, "learning_rate": 4.963882308538934e-05, "loss": 0.0932, "step": 443 }, { "epoch": 0.31136044880785413, "grad_norm": 0.4086003303527832, "learning_rate": 4.963693916610614e-05, "loss": 0.2367, "step": 444 }, { "epoch": 0.3120617110799439, "grad_norm": 0.2998374402523041, "learning_rate": 4.9635050382217036e-05, "loss": 0.0876, "step": 445 }, { "epoch": 0.3127629733520337, "grad_norm": 0.2779095470905304, "learning_rate": 4.963315673409498e-05, "loss": 0.0864, "step": 446 }, { "epoch": 0.3134642356241234, "grad_norm": 0.2594299614429474, "learning_rate": 4.963125822211388e-05, "loss": 0.0888, "step": 447 }, { "epoch": 0.3141654978962132, "grad_norm": 0.25463730096817017, "learning_rate": 4.962935484664859e-05, "loss": 0.0875, "step": 448 }, { "epoch": 0.31486676016830295, "grad_norm": 0.8506668210029602, "learning_rate": 4.962744660807495e-05, "loss": 0.4204, "step": 449 }, { "epoch": 0.3155680224403927, "grad_norm": 0.2667442560195923, "learning_rate": 4.962553350676973e-05, "loss": 0.08, "step": 450 }, { "epoch": 0.31626928471248245, "grad_norm": 0.25296568870544434, "learning_rate": 4.962361554311068e-05, "loss": 0.0794, "step": 451 }, { "epoch": 0.3169705469845722, "grad_norm": 0.25738415122032166, "learning_rate": 4.962169271747651e-05, "loss": 0.079, "step": 452 }, { "epoch": 0.317671809256662, "grad_norm": 0.25537872314453125, "learning_rate": 4.961976503024688e-05, "loss": 0.0778, "step": 453 }, { "epoch": 0.31837307152875177, "grad_norm": 0.3663192689418793, "learning_rate": 4.961783248180242e-05, "loss": 0.2532, "step": 454 }, { "epoch": 0.3190743338008415, "grad_norm": 0.24685700237751007, "learning_rate": 4.9615895072524716e-05, "loss": 0.0824, "step": 455 }, { "epoch": 0.31977559607293127, "grad_norm": 0.2456488311290741, "learning_rate": 4.9613952802796304e-05, "loss": 0.0736, "step": 456 }, { "epoch": 0.32047685834502104, "grad_norm": 6.093946933746338, "learning_rate": 4.9612005673000696e-05, "loss": 0.6581, "step": 457 }, { "epoch": 0.3211781206171108, "grad_norm": 0.23248949646949768, "learning_rate": 4.961005368352235e-05, "loss": 0.0636, "step": 458 }, { "epoch": 0.32187938288920054, "grad_norm": 0.3815716803073883, "learning_rate": 4.96080968347467e-05, "loss": 0.2642, "step": 459 }, { "epoch": 0.3225806451612903, "grad_norm": 0.35341301560401917, "learning_rate": 4.960613512706011e-05, "loss": 0.2586, "step": 460 }, { "epoch": 0.3232819074333801, "grad_norm": 0.2432442605495453, "learning_rate": 4.960416856084994e-05, "loss": 0.0798, "step": 461 }, { "epoch": 0.32398316970546986, "grad_norm": 0.23733285069465637, "learning_rate": 4.960219713650449e-05, "loss": 0.0649, "step": 462 }, { "epoch": 0.3246844319775596, "grad_norm": 0.3932836353778839, "learning_rate": 4.960022085441302e-05, "loss": 0.2545, "step": 463 }, { "epoch": 0.32538569424964936, "grad_norm": 0.24230723083019257, "learning_rate": 4.959823971496574e-05, "loss": 0.08, "step": 464 }, { "epoch": 0.32608695652173914, "grad_norm": 0.2423563152551651, "learning_rate": 4.959625371855384e-05, "loss": 0.0802, "step": 465 }, { "epoch": 0.3267882187938289, "grad_norm": 6.302280426025391, "learning_rate": 4.959426286556946e-05, "loss": 0.9551, "step": 466 }, { "epoch": 0.32748948106591863, "grad_norm": 0.24419474601745605, "learning_rate": 4.95922671564057e-05, "loss": 0.08, "step": 467 }, { "epoch": 0.3281907433380084, "grad_norm": 0.24518023431301117, "learning_rate": 4.95902665914566e-05, "loss": 0.075, "step": 468 }, { "epoch": 0.3288920056100982, "grad_norm": 0.435700923204422, "learning_rate": 4.9588261171117184e-05, "loss": 0.2595, "step": 469 }, { "epoch": 0.32959326788218796, "grad_norm": 0.23336763679981232, "learning_rate": 4.958625089578343e-05, "loss": 0.073, "step": 470 }, { "epoch": 0.3302945301542777, "grad_norm": 0.23896564543247223, "learning_rate": 4.958423576585226e-05, "loss": 0.0802, "step": 471 }, { "epoch": 0.33099579242636745, "grad_norm": 0.3903424143791199, "learning_rate": 4.958221578172157e-05, "loss": 0.2502, "step": 472 }, { "epoch": 0.3316970546984572, "grad_norm": 0.5267605185508728, "learning_rate": 4.958019094379023e-05, "loss": 0.253, "step": 473 }, { "epoch": 0.332398316970547, "grad_norm": 0.26232609152793884, "learning_rate": 4.957816125245801e-05, "loss": 0.0695, "step": 474 }, { "epoch": 0.3330995792426367, "grad_norm": 0.4005662798881531, "learning_rate": 4.9576126708125696e-05, "loss": 0.2446, "step": 475 }, { "epoch": 0.3338008415147265, "grad_norm": 0.2572593092918396, "learning_rate": 4.9574087311195014e-05, "loss": 0.0702, "step": 476 }, { "epoch": 0.3345021037868163, "grad_norm": 0.2594940960407257, "learning_rate": 4.9572043062068647e-05, "loss": 0.0702, "step": 477 }, { "epoch": 0.33520336605890605, "grad_norm": 0.2391783595085144, "learning_rate": 4.956999396115023e-05, "loss": 0.0819, "step": 478 }, { "epoch": 0.33590462833099577, "grad_norm": 0.24152472615242004, "learning_rate": 4.956794000884436e-05, "loss": 0.0741, "step": 479 }, { "epoch": 0.33660589060308554, "grad_norm": 0.37559774518013, "learning_rate": 4.9565881205556594e-05, "loss": 0.2677, "step": 480 }, { "epoch": 0.3373071528751753, "grad_norm": 0.22898507118225098, "learning_rate": 4.956381755169345e-05, "loss": 0.073, "step": 481 }, { "epoch": 0.3380084151472651, "grad_norm": 0.3705040514469147, "learning_rate": 4.9561749047662406e-05, "loss": 0.2655, "step": 482 }, { "epoch": 0.3387096774193548, "grad_norm": 0.23721250891685486, "learning_rate": 4.955967569387188e-05, "loss": 0.0736, "step": 483 }, { "epoch": 0.3394109396914446, "grad_norm": 0.37000712752342224, "learning_rate": 4.955759749073126e-05, "loss": 0.2664, "step": 484 }, { "epoch": 0.34011220196353437, "grad_norm": 0.2349139004945755, "learning_rate": 4.9555514438650894e-05, "loss": 0.0736, "step": 485 }, { "epoch": 0.34081346423562414, "grad_norm": 0.23758095502853394, "learning_rate": 4.9553426538042094e-05, "loss": 0.0802, "step": 486 }, { "epoch": 0.34151472650771386, "grad_norm": 0.3750643730163574, "learning_rate": 4.95513337893171e-05, "loss": 0.2679, "step": 487 }, { "epoch": 0.34221598877980364, "grad_norm": 0.23759862780570984, "learning_rate": 4.954923619288915e-05, "loss": 0.0793, "step": 488 }, { "epoch": 0.3429172510518934, "grad_norm": 0.9699450135231018, "learning_rate": 4.95471337491724e-05, "loss": 0.4602, "step": 489 }, { "epoch": 0.3436185133239832, "grad_norm": 0.23618046939373016, "learning_rate": 4.954502645858199e-05, "loss": 0.0666, "step": 490 }, { "epoch": 0.3443197755960729, "grad_norm": 1.5510258674621582, "learning_rate": 4.9542914321534e-05, "loss": 0.6455, "step": 491 }, { "epoch": 0.3450210378681627, "grad_norm": 0.3733394742012024, "learning_rate": 4.954079733844549e-05, "loss": 0.2668, "step": 492 }, { "epoch": 0.34572230014025246, "grad_norm": 0.23936617374420166, "learning_rate": 4.9538675509734445e-05, "loss": 0.0815, "step": 493 }, { "epoch": 0.34642356241234223, "grad_norm": 0.38999930024147034, "learning_rate": 4.953654883581984e-05, "loss": 0.2604, "step": 494 }, { "epoch": 0.34712482468443195, "grad_norm": 0.2408713698387146, "learning_rate": 4.953441731712158e-05, "loss": 0.0825, "step": 495 }, { "epoch": 0.34782608695652173, "grad_norm": 0.3959548771381378, "learning_rate": 4.953228095406054e-05, "loss": 0.251, "step": 496 }, { "epoch": 0.3485273492286115, "grad_norm": 0.37747806310653687, "learning_rate": 4.9530139747058545e-05, "loss": 0.2584, "step": 497 }, { "epoch": 0.3492286115007013, "grad_norm": 0.24324338138103485, "learning_rate": 4.9527993696538386e-05, "loss": 0.084, "step": 498 }, { "epoch": 0.349929873772791, "grad_norm": 0.3429815173149109, "learning_rate": 4.9525842802923796e-05, "loss": 0.2611, "step": 499 }, { "epoch": 0.3506311360448808, "grad_norm": 0.24990200996398926, "learning_rate": 4.952368706663948e-05, "loss": 0.0713, "step": 500 }, { "epoch": 0.35133239831697055, "grad_norm": 1.3953312635421753, "learning_rate": 4.95215264881111e-05, "loss": 0.6065, "step": 501 }, { "epoch": 0.3520336605890603, "grad_norm": 0.2545188367366791, "learning_rate": 4.9519361067765244e-05, "loss": 0.0804, "step": 502 }, { "epoch": 0.35273492286115005, "grad_norm": 0.24843181669712067, "learning_rate": 4.95171908060295e-05, "loss": 0.0864, "step": 503 }, { "epoch": 0.3534361851332398, "grad_norm": 0.24827539920806885, "learning_rate": 4.951501570333237e-05, "loss": 0.0805, "step": 504 }, { "epoch": 0.3541374474053296, "grad_norm": 0.26023411750793457, "learning_rate": 4.9512835760103345e-05, "loss": 0.075, "step": 505 }, { "epoch": 0.3548387096774194, "grad_norm": 0.323438435792923, "learning_rate": 4.951065097677285e-05, "loss": 0.2564, "step": 506 }, { "epoch": 0.3555399719495091, "grad_norm": 0.2497410774230957, "learning_rate": 4.9508461353772284e-05, "loss": 0.0808, "step": 507 }, { "epoch": 0.35624123422159887, "grad_norm": 0.35721510648727417, "learning_rate": 4.950626689153399e-05, "loss": 0.2444, "step": 508 }, { "epoch": 0.35694249649368864, "grad_norm": 0.2562461793422699, "learning_rate": 4.950406759049127e-05, "loss": 0.0729, "step": 509 }, { "epoch": 0.3576437587657784, "grad_norm": 0.27770695090293884, "learning_rate": 4.9501863451078356e-05, "loss": 0.0669, "step": 510 }, { "epoch": 0.35834502103786814, "grad_norm": 0.25719892978668213, "learning_rate": 4.9499654473730495e-05, "loss": 0.0722, "step": 511 }, { "epoch": 0.3590462833099579, "grad_norm": 0.2502214312553406, "learning_rate": 4.949744065888383e-05, "loss": 0.0864, "step": 512 }, { "epoch": 0.3597475455820477, "grad_norm": 0.25199466943740845, "learning_rate": 4.9495222006975486e-05, "loss": 0.0713, "step": 513 }, { "epoch": 0.36044880785413747, "grad_norm": 0.24408985674381256, "learning_rate": 4.9492998518443554e-05, "loss": 0.0785, "step": 514 }, { "epoch": 0.3611500701262272, "grad_norm": 0.23611286282539368, "learning_rate": 4.949077019372704e-05, "loss": 0.0765, "step": 515 }, { "epoch": 0.36185133239831696, "grad_norm": 0.3320540487766266, "learning_rate": 4.948853703326596e-05, "loss": 0.2577, "step": 516 }, { "epoch": 0.36255259467040674, "grad_norm": 0.9060179591178894, "learning_rate": 4.948629903750123e-05, "loss": 0.4178, "step": 517 }, { "epoch": 0.3632538569424965, "grad_norm": 0.22528652846813202, "learning_rate": 4.9484056206874756e-05, "loss": 0.0592, "step": 518 }, { "epoch": 0.36395511921458623, "grad_norm": 6.064340114593506, "learning_rate": 4.948180854182939e-05, "loss": 0.7961, "step": 519 }, { "epoch": 0.364656381486676, "grad_norm": 0.24764154851436615, "learning_rate": 4.9479556042808936e-05, "loss": 0.0842, "step": 520 }, { "epoch": 0.3653576437587658, "grad_norm": 0.366883784532547, "learning_rate": 4.947729871025816e-05, "loss": 0.2428, "step": 521 }, { "epoch": 0.36605890603085556, "grad_norm": 0.24858447909355164, "learning_rate": 4.9475036544622766e-05, "loss": 0.0848, "step": 522 }, { "epoch": 0.3667601683029453, "grad_norm": 5.854316711425781, "learning_rate": 4.947276954634943e-05, "loss": 0.7868, "step": 523 }, { "epoch": 0.36746143057503505, "grad_norm": 0.2413279414176941, "learning_rate": 4.9470497715885764e-05, "loss": 0.0782, "step": 524 }, { "epoch": 0.36816269284712483, "grad_norm": 0.3332425057888031, "learning_rate": 4.946822105368037e-05, "loss": 0.2618, "step": 525 }, { "epoch": 0.3688639551192146, "grad_norm": 0.24769669771194458, "learning_rate": 4.9465939560182736e-05, "loss": 0.0861, "step": 526 }, { "epoch": 0.3695652173913043, "grad_norm": 0.24813514947891235, "learning_rate": 4.9463653235843385e-05, "loss": 0.0716, "step": 527 }, { "epoch": 0.3702664796633941, "grad_norm": 0.8456823825836182, "learning_rate": 4.946136208111375e-05, "loss": 0.4291, "step": 528 }, { "epoch": 0.3709677419354839, "grad_norm": 0.24801166355609894, "learning_rate": 4.94590660964462e-05, "loss": 0.0801, "step": 529 }, { "epoch": 0.37166900420757365, "grad_norm": 4.597894191741943, "learning_rate": 4.9456765282294114e-05, "loss": 0.529, "step": 530 }, { "epoch": 0.37237026647966337, "grad_norm": 0.32557475566864014, "learning_rate": 4.945445963911177e-05, "loss": 0.2515, "step": 531 }, { "epoch": 0.37307152875175315, "grad_norm": 0.2610253393650055, "learning_rate": 4.945214916735442e-05, "loss": 0.0839, "step": 532 }, { "epoch": 0.3737727910238429, "grad_norm": 0.2529914975166321, "learning_rate": 4.944983386747828e-05, "loss": 0.0896, "step": 533 }, { "epoch": 0.3744740532959327, "grad_norm": 0.3177853226661682, "learning_rate": 4.944751373994051e-05, "loss": 0.2481, "step": 534 }, { "epoch": 0.3751753155680224, "grad_norm": 0.8122949600219727, "learning_rate": 4.9445188785199216e-05, "loss": 0.4199, "step": 535 }, { "epoch": 0.3758765778401122, "grad_norm": 0.2638053297996521, "learning_rate": 4.944285900371347e-05, "loss": 0.0856, "step": 536 }, { "epoch": 0.37657784011220197, "grad_norm": 0.28154072165489197, "learning_rate": 4.944052439594329e-05, "loss": 0.0806, "step": 537 }, { "epoch": 0.37727910238429174, "grad_norm": 2.6858110427856445, "learning_rate": 4.9438184962349646e-05, "loss": 0.5112, "step": 538 }, { "epoch": 0.37798036465638146, "grad_norm": 0.7946494817733765, "learning_rate": 4.9435840703394463e-05, "loss": 0.4142, "step": 539 }, { "epoch": 0.37868162692847124, "grad_norm": 0.29052111506462097, "learning_rate": 4.943349161954062e-05, "loss": 0.2527, "step": 540 }, { "epoch": 0.379382889200561, "grad_norm": 0.263764888048172, "learning_rate": 4.943113771125195e-05, "loss": 0.094, "step": 541 }, { "epoch": 0.3800841514726508, "grad_norm": 0.280325710773468, "learning_rate": 4.942877897899324e-05, "loss": 0.2473, "step": 542 }, { "epoch": 0.3807854137447405, "grad_norm": 2.1489508152008057, "learning_rate": 4.942641542323022e-05, "loss": 0.4251, "step": 543 }, { "epoch": 0.3814866760168303, "grad_norm": 0.2721588909626007, "learning_rate": 4.942404704442958e-05, "loss": 0.0966, "step": 544 }, { "epoch": 0.38218793828892006, "grad_norm": 0.35811668634414673, "learning_rate": 4.942167384305895e-05, "loss": 0.0992, "step": 545 }, { "epoch": 0.38288920056100983, "grad_norm": 1.2233481407165527, "learning_rate": 4.9419295819586934e-05, "loss": 0.5505, "step": 546 }, { "epoch": 0.38359046283309955, "grad_norm": 0.7455750703811646, "learning_rate": 4.941691297448308e-05, "loss": 0.393, "step": 547 }, { "epoch": 0.38429172510518933, "grad_norm": 0.7228630781173706, "learning_rate": 4.9414525308217876e-05, "loss": 0.3854, "step": 548 }, { "epoch": 0.3849929873772791, "grad_norm": 0.4493880271911621, "learning_rate": 4.941213282126278e-05, "loss": 0.1147, "step": 549 }, { "epoch": 0.3856942496493689, "grad_norm": 0.2854968011379242, "learning_rate": 4.940973551409018e-05, "loss": 0.2544, "step": 550 }, { "epoch": 0.3863955119214586, "grad_norm": 0.5284735560417175, "learning_rate": 4.9407333387173435e-05, "loss": 0.118, "step": 551 }, { "epoch": 0.3870967741935484, "grad_norm": 0.6891767382621765, "learning_rate": 4.9404926440986857e-05, "loss": 0.3865, "step": 552 }, { "epoch": 0.38779803646563815, "grad_norm": 0.3001798987388611, "learning_rate": 4.9402514676005684e-05, "loss": 0.1047, "step": 553 }, { "epoch": 0.3884992987377279, "grad_norm": 0.3024451434612274, "learning_rate": 4.940009809270614e-05, "loss": 0.105, "step": 554 }, { "epoch": 0.38920056100981765, "grad_norm": 0.5323725342750549, "learning_rate": 4.939767669156537e-05, "loss": 0.1214, "step": 555 }, { "epoch": 0.3899018232819074, "grad_norm": 0.7071083784103394, "learning_rate": 4.93952504730615e-05, "loss": 0.3758, "step": 556 }, { "epoch": 0.3906030855539972, "grad_norm": 0.9557965397834778, "learning_rate": 4.939281943767358e-05, "loss": 0.2072, "step": 557 }, { "epoch": 0.391304347826087, "grad_norm": 0.23597246408462524, "learning_rate": 4.939038358588162e-05, "loss": 0.2408, "step": 558 }, { "epoch": 0.3920056100981767, "grad_norm": 0.22535587847232819, "learning_rate": 4.938794291816659e-05, "loss": 0.2414, "step": 559 }, { "epoch": 0.39270687237026647, "grad_norm": 0.973599910736084, "learning_rate": 4.9385497435010394e-05, "loss": 0.1838, "step": 560 }, { "epoch": 0.39340813464235624, "grad_norm": 0.24448958039283752, "learning_rate": 4.938304713689591e-05, "loss": 0.2508, "step": 561 }, { "epoch": 0.394109396914446, "grad_norm": 0.24042154848575592, "learning_rate": 4.938059202430695e-05, "loss": 0.2481, "step": 562 }, { "epoch": 0.39481065918653574, "grad_norm": 0.2282569259405136, "learning_rate": 4.937813209772827e-05, "loss": 0.2334, "step": 563 }, { "epoch": 0.3955119214586255, "grad_norm": 0.33488380908966064, "learning_rate": 4.937566735764559e-05, "loss": 0.1135, "step": 564 }, { "epoch": 0.3962131837307153, "grad_norm": 0.22197645902633667, "learning_rate": 4.937319780454559e-05, "loss": 0.2359, "step": 565 }, { "epoch": 0.39691444600280507, "grad_norm": 0.43901681900024414, "learning_rate": 4.937072343891588e-05, "loss": 0.1249, "step": 566 }, { "epoch": 0.3976157082748948, "grad_norm": 0.6939346790313721, "learning_rate": 4.9368244261245025e-05, "loss": 0.1476, "step": 567 }, { "epoch": 0.39831697054698456, "grad_norm": 0.2490689903497696, "learning_rate": 4.936576027202254e-05, "loss": 0.2512, "step": 568 }, { "epoch": 0.39901823281907434, "grad_norm": 0.43641090393066406, "learning_rate": 4.93632714717389e-05, "loss": 0.1233, "step": 569 }, { "epoch": 0.3997194950911641, "grad_norm": 0.4251331090927124, "learning_rate": 4.936077786088552e-05, "loss": 0.1213, "step": 570 }, { "epoch": 0.40042075736325383, "grad_norm": 0.2264283299446106, "learning_rate": 4.935827943995477e-05, "loss": 0.2447, "step": 571 }, { "epoch": 0.4011220196353436, "grad_norm": 0.40848588943481445, "learning_rate": 4.935577620943996e-05, "loss": 0.1182, "step": 572 }, { "epoch": 0.4018232819074334, "grad_norm": 0.3230941891670227, "learning_rate": 4.935326816983537e-05, "loss": 0.1096, "step": 573 }, { "epoch": 0.40252454417952316, "grad_norm": 0.44627073407173157, "learning_rate": 4.935075532163621e-05, "loss": 0.1146, "step": 574 }, { "epoch": 0.4032258064516129, "grad_norm": 0.3144536018371582, "learning_rate": 4.934823766533864e-05, "loss": 0.107, "step": 575 }, { "epoch": 0.40392706872370265, "grad_norm": 0.34260332584381104, "learning_rate": 4.934571520143978e-05, "loss": 0.1041, "step": 576 }, { "epoch": 0.40462833099579243, "grad_norm": 0.3950185477733612, "learning_rate": 4.9343187930437696e-05, "loss": 0.1029, "step": 577 }, { "epoch": 0.4053295932678822, "grad_norm": 0.31233471632003784, "learning_rate": 4.9340655852831406e-05, "loss": 0.0978, "step": 578 }, { "epoch": 0.4060308555399719, "grad_norm": 0.3319379389286041, "learning_rate": 4.933811896912087e-05, "loss": 0.0927, "step": 579 }, { "epoch": 0.4067321178120617, "grad_norm": 0.27647536993026733, "learning_rate": 4.933557727980699e-05, "loss": 0.0954, "step": 580 }, { "epoch": 0.4074333800841515, "grad_norm": 0.8266716599464417, "learning_rate": 4.933303078539164e-05, "loss": 0.4131, "step": 581 }, { "epoch": 0.40813464235624125, "grad_norm": 0.28029704093933105, "learning_rate": 4.9330479486377635e-05, "loss": 0.0881, "step": 582 }, { "epoch": 0.40883590462833097, "grad_norm": 0.27097228169441223, "learning_rate": 4.9327923383268716e-05, "loss": 0.0853, "step": 583 }, { "epoch": 0.40953716690042075, "grad_norm": 0.2647046744823456, "learning_rate": 4.9325362476569606e-05, "loss": 0.0845, "step": 584 }, { "epoch": 0.4102384291725105, "grad_norm": 0.26458799839019775, "learning_rate": 4.932279676678594e-05, "loss": 0.0834, "step": 585 }, { "epoch": 0.4109396914446003, "grad_norm": 0.26074841618537903, "learning_rate": 4.932022625442434e-05, "loss": 0.0816, "step": 586 }, { "epoch": 0.41164095371669, "grad_norm": 0.2572190463542938, "learning_rate": 4.931765093999235e-05, "loss": 0.08, "step": 587 }, { "epoch": 0.4123422159887798, "grad_norm": 3.1966333389282227, "learning_rate": 4.931507082399847e-05, "loss": 0.5002, "step": 588 }, { "epoch": 0.41304347826086957, "grad_norm": 0.36195218563079834, "learning_rate": 4.931248590695216e-05, "loss": 0.2506, "step": 589 }, { "epoch": 0.41374474053295934, "grad_norm": 0.9274333119392395, "learning_rate": 4.93098961893638e-05, "loss": 0.4368, "step": 590 }, { "epoch": 0.41444600280504906, "grad_norm": 0.24635867774486542, "learning_rate": 4.930730167174474e-05, "loss": 0.0777, "step": 591 }, { "epoch": 0.41514726507713884, "grad_norm": 0.25121134519577026, "learning_rate": 4.930470235460728e-05, "loss": 0.0834, "step": 592 }, { "epoch": 0.4158485273492286, "grad_norm": 0.25119951367378235, "learning_rate": 4.930209823846464e-05, "loss": 0.0784, "step": 593 }, { "epoch": 0.4165497896213184, "grad_norm": 0.24903064966201782, "learning_rate": 4.929948932383104e-05, "loss": 0.0826, "step": 594 }, { "epoch": 0.4172510518934081, "grad_norm": 0.24783915281295776, "learning_rate": 4.929687561122158e-05, "loss": 0.0825, "step": 595 }, { "epoch": 0.4179523141654979, "grad_norm": 0.2473335564136505, "learning_rate": 4.9294257101152365e-05, "loss": 0.0817, "step": 596 }, { "epoch": 0.41865357643758766, "grad_norm": 0.247135192155838, "learning_rate": 4.9291633794140406e-05, "loss": 0.0813, "step": 597 }, { "epoch": 0.41935483870967744, "grad_norm": 0.9803175330162048, "learning_rate": 4.9289005690703695e-05, "loss": 0.4528, "step": 598 }, { "epoch": 0.42005610098176716, "grad_norm": 0.24079735577106476, "learning_rate": 4.928637279136115e-05, "loss": 0.076, "step": 599 }, { "epoch": 0.42075736325385693, "grad_norm": 0.385967493057251, "learning_rate": 4.928373509663264e-05, "loss": 0.2651, "step": 600 }, { "epoch": 0.4214586255259467, "grad_norm": 0.4192684292793274, "learning_rate": 4.928109260703899e-05, "loss": 0.2594, "step": 601 }, { "epoch": 0.4221598877980365, "grad_norm": 0.2461949586868286, "learning_rate": 4.927844532310195e-05, "loss": 0.0805, "step": 602 }, { "epoch": 0.4228611500701262, "grad_norm": 0.9534791707992554, "learning_rate": 4.9275793245344246e-05, "loss": 0.4338, "step": 603 }, { "epoch": 0.423562412342216, "grad_norm": 0.2455025017261505, "learning_rate": 4.927313637428953e-05, "loss": 0.0773, "step": 604 }, { "epoch": 0.42426367461430575, "grad_norm": 2.624152898788452, "learning_rate": 4.9270474710462394e-05, "loss": 0.4565, "step": 605 }, { "epoch": 0.42496493688639553, "grad_norm": 0.24251051247119904, "learning_rate": 4.92678082543884e-05, "loss": 0.0767, "step": 606 }, { "epoch": 0.42566619915848525, "grad_norm": 0.246474027633667, "learning_rate": 4.926513700659404e-05, "loss": 0.0798, "step": 607 }, { "epoch": 0.426367461430575, "grad_norm": 0.329734742641449, "learning_rate": 4.9262460967606775e-05, "loss": 0.2596, "step": 608 }, { "epoch": 0.4270687237026648, "grad_norm": 0.3410508632659912, "learning_rate": 4.925978013795496e-05, "loss": 0.2544, "step": 609 }, { "epoch": 0.4277699859747546, "grad_norm": 0.2491086721420288, "learning_rate": 4.925709451816795e-05, "loss": 0.0813, "step": 610 }, { "epoch": 0.4284712482468443, "grad_norm": 0.25808578729629517, "learning_rate": 4.9254404108776023e-05, "loss": 0.0767, "step": 611 }, { "epoch": 0.42917251051893407, "grad_norm": 0.24720081686973572, "learning_rate": 4.9251708910310405e-05, "loss": 0.081, "step": 612 }, { "epoch": 0.42987377279102384, "grad_norm": 0.2578522562980652, "learning_rate": 4.924900892330326e-05, "loss": 0.0779, "step": 613 }, { "epoch": 0.4305750350631136, "grad_norm": 0.24603991210460663, "learning_rate": 4.924630414828773e-05, "loss": 0.0855, "step": 614 }, { "epoch": 0.43127629733520334, "grad_norm": 0.25102120637893677, "learning_rate": 4.9243594585797836e-05, "loss": 0.0816, "step": 615 }, { "epoch": 0.4319775596072931, "grad_norm": 0.2449728548526764, "learning_rate": 4.924088023636863e-05, "loss": 0.085, "step": 616 }, { "epoch": 0.4326788218793829, "grad_norm": 0.2444487363100052, "learning_rate": 4.923816110053603e-05, "loss": 0.0846, "step": 617 }, { "epoch": 0.43338008415147267, "grad_norm": 1.7872462272644043, "learning_rate": 4.923543717883695e-05, "loss": 0.3597, "step": 618 }, { "epoch": 0.4340813464235624, "grad_norm": 0.3473317623138428, "learning_rate": 4.923270847180923e-05, "loss": 0.2542, "step": 619 }, { "epoch": 0.43478260869565216, "grad_norm": 0.24856089055538177, "learning_rate": 4.922997497999166e-05, "loss": 0.081, "step": 620 }, { "epoch": 0.43548387096774194, "grad_norm": 0.24300028383731842, "learning_rate": 4.922723670392396e-05, "loss": 0.0843, "step": 621 }, { "epoch": 0.4361851332398317, "grad_norm": 0.2543219029903412, "learning_rate": 4.9224493644146835e-05, "loss": 0.0808, "step": 622 }, { "epoch": 0.43688639551192143, "grad_norm": 0.24961449205875397, "learning_rate": 4.922174580120188e-05, "loss": 0.0807, "step": 623 }, { "epoch": 0.4375876577840112, "grad_norm": 0.24836528301239014, "learning_rate": 4.921899317563168e-05, "loss": 0.0813, "step": 624 }, { "epoch": 0.438288920056101, "grad_norm": 0.3522462844848633, "learning_rate": 4.9216235767979736e-05, "loss": 0.2623, "step": 625 }, { "epoch": 0.43899018232819076, "grad_norm": 0.3753505051136017, "learning_rate": 4.921347357879049e-05, "loss": 0.255, "step": 626 }, { "epoch": 0.4396914446002805, "grad_norm": 0.24231280386447906, "learning_rate": 4.921070660860937e-05, "loss": 0.0833, "step": 627 }, { "epoch": 0.44039270687237025, "grad_norm": 0.9075778722763062, "learning_rate": 4.9207934857982705e-05, "loss": 0.4412, "step": 628 }, { "epoch": 0.44109396914446003, "grad_norm": 0.3492523729801178, "learning_rate": 4.9205158327457773e-05, "loss": 0.2546, "step": 629 }, { "epoch": 0.4417952314165498, "grad_norm": 0.3650231957435608, "learning_rate": 4.920237701758282e-05, "loss": 0.2676, "step": 630 }, { "epoch": 0.4424964936886395, "grad_norm": 0.24332167208194733, "learning_rate": 4.9199590928907005e-05, "loss": 0.0839, "step": 631 }, { "epoch": 0.4431977559607293, "grad_norm": 0.34359535574913025, "learning_rate": 4.919680006198046e-05, "loss": 0.2558, "step": 632 }, { "epoch": 0.4438990182328191, "grad_norm": 0.8826388120651245, "learning_rate": 4.919400441735424e-05, "loss": 0.4245, "step": 633 }, { "epoch": 0.44460028050490885, "grad_norm": 0.24471056461334229, "learning_rate": 4.9191203995580347e-05, "loss": 0.0857, "step": 634 }, { "epoch": 0.44530154277699857, "grad_norm": 0.33600544929504395, "learning_rate": 4.918839879721173e-05, "loss": 0.2564, "step": 635 }, { "epoch": 0.44600280504908835, "grad_norm": 0.2523267865180969, "learning_rate": 4.9185588822802295e-05, "loss": 0.0831, "step": 636 }, { "epoch": 0.4467040673211781, "grad_norm": 0.2475283145904541, "learning_rate": 4.918277407290686e-05, "loss": 0.0874, "step": 637 }, { "epoch": 0.4474053295932679, "grad_norm": 0.25848421454429626, "learning_rate": 4.91799545480812e-05, "loss": 0.0839, "step": 638 }, { "epoch": 0.4481065918653576, "grad_norm": 0.2578701674938202, "learning_rate": 4.917713024888204e-05, "loss": 0.0849, "step": 639 }, { "epoch": 0.4488078541374474, "grad_norm": 0.25654157996177673, "learning_rate": 4.917430117586705e-05, "loss": 0.0843, "step": 640 }, { "epoch": 0.44950911640953717, "grad_norm": 0.31858590245246887, "learning_rate": 4.917146732959482e-05, "loss": 0.2541, "step": 641 }, { "epoch": 0.45021037868162694, "grad_norm": 0.32731345295906067, "learning_rate": 4.916862871062492e-05, "loss": 0.2555, "step": 642 }, { "epoch": 0.45091164095371666, "grad_norm": 0.33012130856513977, "learning_rate": 4.916578531951782e-05, "loss": 0.2531, "step": 643 }, { "epoch": 0.45161290322580644, "grad_norm": 0.319018691778183, "learning_rate": 4.9162937156834965e-05, "loss": 0.2464, "step": 644 }, { "epoch": 0.4523141654978962, "grad_norm": 0.3184910714626312, "learning_rate": 4.916008422313872e-05, "loss": 0.2516, "step": 645 }, { "epoch": 0.453015427769986, "grad_norm": 0.31947338581085205, "learning_rate": 4.91572265189924e-05, "loss": 0.2479, "step": 646 }, { "epoch": 0.4537166900420757, "grad_norm": 0.35082271695137024, "learning_rate": 4.915436404496028e-05, "loss": 0.2494, "step": 647 }, { "epoch": 0.4544179523141655, "grad_norm": 0.24971771240234375, "learning_rate": 4.915149680160755e-05, "loss": 0.0875, "step": 648 }, { "epoch": 0.45511921458625526, "grad_norm": 0.25460556149482727, "learning_rate": 4.914862478950034e-05, "loss": 0.0839, "step": 649 }, { "epoch": 0.45582047685834504, "grad_norm": 0.329667329788208, "learning_rate": 4.914574800920576e-05, "loss": 0.2495, "step": 650 }, { "epoch": 0.45652173913043476, "grad_norm": 0.3202732801437378, "learning_rate": 4.914286646129181e-05, "loss": 0.2488, "step": 651 }, { "epoch": 0.45722300140252453, "grad_norm": 0.25581812858581543, "learning_rate": 4.913998014632748e-05, "loss": 0.0843, "step": 652 }, { "epoch": 0.4579242636746143, "grad_norm": 0.32283952832221985, "learning_rate": 4.913708906488266e-05, "loss": 0.2485, "step": 653 }, { "epoch": 0.4586255259467041, "grad_norm": 0.253989577293396, "learning_rate": 4.9134193217528194e-05, "loss": 0.0881, "step": 654 }, { "epoch": 0.4593267882187938, "grad_norm": 0.2547711133956909, "learning_rate": 4.913129260483589e-05, "loss": 0.0889, "step": 655 }, { "epoch": 0.4600280504908836, "grad_norm": 0.3162064254283905, "learning_rate": 4.912838722737847e-05, "loss": 0.2512, "step": 656 }, { "epoch": 0.46072931276297335, "grad_norm": 0.2539234459400177, "learning_rate": 4.9125477085729614e-05, "loss": 0.0885, "step": 657 }, { "epoch": 0.46143057503506313, "grad_norm": 0.25389134883880615, "learning_rate": 4.9122562180463926e-05, "loss": 0.0836, "step": 658 }, { "epoch": 0.46213183730715285, "grad_norm": 0.2547670900821686, "learning_rate": 4.911964251215695e-05, "loss": 0.0886, "step": 659 }, { "epoch": 0.4628330995792426, "grad_norm": 0.3330293595790863, "learning_rate": 4.91167180813852e-05, "loss": 0.2531, "step": 660 }, { "epoch": 0.4635343618513324, "grad_norm": 0.25107240676879883, "learning_rate": 4.9113788888726095e-05, "loss": 0.0828, "step": 661 }, { "epoch": 0.4642356241234222, "grad_norm": 0.31728997826576233, "learning_rate": 4.911085493475802e-05, "loss": 0.2519, "step": 662 }, { "epoch": 0.4649368863955119, "grad_norm": 0.24644207954406738, "learning_rate": 4.910791622006028e-05, "loss": 0.0806, "step": 663 }, { "epoch": 0.46563814866760167, "grad_norm": 0.34919828176498413, "learning_rate": 4.910497274521314e-05, "loss": 0.2456, "step": 664 }, { "epoch": 0.46633941093969145, "grad_norm": 0.2523018419742584, "learning_rate": 4.9102024510797775e-05, "loss": 0.0865, "step": 665 }, { "epoch": 0.4670406732117812, "grad_norm": 0.33324116468429565, "learning_rate": 4.909907151739633e-05, "loss": 0.258, "step": 666 }, { "epoch": 0.46774193548387094, "grad_norm": 0.34726446866989136, "learning_rate": 4.909611376559189e-05, "loss": 0.2622, "step": 667 }, { "epoch": 0.4684431977559607, "grad_norm": 0.2511356472969055, "learning_rate": 4.909315125596845e-05, "loss": 0.0811, "step": 668 }, { "epoch": 0.4691444600280505, "grad_norm": 0.3595215082168579, "learning_rate": 4.9090183989110974e-05, "loss": 0.2443, "step": 669 }, { "epoch": 0.46984572230014027, "grad_norm": 0.3181239068508148, "learning_rate": 4.908721196560535e-05, "loss": 0.2529, "step": 670 }, { "epoch": 0.47054698457223, "grad_norm": 0.3520553410053253, "learning_rate": 4.908423518603841e-05, "loss": 0.243, "step": 671 }, { "epoch": 0.47124824684431976, "grad_norm": 0.25563710927963257, "learning_rate": 4.908125365099792e-05, "loss": 0.0869, "step": 672 }, { "epoch": 0.47194950911640954, "grad_norm": 4.106253623962402, "learning_rate": 4.907826736107259e-05, "loss": 0.8695, "step": 673 }, { "epoch": 0.4726507713884993, "grad_norm": 0.25791627168655396, "learning_rate": 4.9075276316852076e-05, "loss": 0.0882, "step": 674 }, { "epoch": 0.47335203366058903, "grad_norm": 3.987487316131592, "learning_rate": 4.907228051892696e-05, "loss": 0.6726, "step": 675 }, { "epoch": 0.4740532959326788, "grad_norm": 0.8213611841201782, "learning_rate": 4.906927996788876e-05, "loss": 0.4184, "step": 676 }, { "epoch": 0.4747545582047686, "grad_norm": 0.3162400424480438, "learning_rate": 4.906627466432995e-05, "loss": 0.2521, "step": 677 }, { "epoch": 0.47545582047685836, "grad_norm": 0.3017626404762268, "learning_rate": 4.906326460884393e-05, "loss": 0.2515, "step": 678 }, { "epoch": 0.4761570827489481, "grad_norm": 0.30746015906333923, "learning_rate": 4.906024980202505e-05, "loss": 0.2417, "step": 679 }, { "epoch": 0.47685834502103785, "grad_norm": 0.2688722312450409, "learning_rate": 4.9057230244468566e-05, "loss": 0.0947, "step": 680 }, { "epoch": 0.47755960729312763, "grad_norm": 0.27959832549095154, "learning_rate": 4.905420593677072e-05, "loss": 0.0923, "step": 681 }, { "epoch": 0.4782608695652174, "grad_norm": 0.2875322699546814, "learning_rate": 4.905117687952864e-05, "loss": 0.0911, "step": 682 }, { "epoch": 0.4789621318373071, "grad_norm": 0.267642617225647, "learning_rate": 4.9048143073340436e-05, "loss": 0.2452, "step": 683 }, { "epoch": 0.4796633941093969, "grad_norm": 0.28645002841949463, "learning_rate": 4.904510451880513e-05, "loss": 0.0934, "step": 684 }, { "epoch": 0.4803646563814867, "grad_norm": 0.26284071803092957, "learning_rate": 4.9042061216522705e-05, "loss": 0.2493, "step": 685 }, { "epoch": 0.48106591865357645, "grad_norm": 0.27408137917518616, "learning_rate": 4.903901316709406e-05, "loss": 0.0969, "step": 686 }, { "epoch": 0.48176718092566617, "grad_norm": 1.6145774126052856, "learning_rate": 4.903596037112102e-05, "loss": 0.2646, "step": 687 }, { "epoch": 0.48246844319775595, "grad_norm": 0.27304908633232117, "learning_rate": 4.903290282920638e-05, "loss": 0.0968, "step": 688 }, { "epoch": 0.4831697054698457, "grad_norm": 0.2999502718448639, "learning_rate": 4.9029840541953864e-05, "loss": 0.0971, "step": 689 }, { "epoch": 0.4838709677419355, "grad_norm": 0.26187384128570557, "learning_rate": 4.9026773509968115e-05, "loss": 0.245, "step": 690 }, { "epoch": 0.4845722300140252, "grad_norm": 0.25527021288871765, "learning_rate": 4.902370173385473e-05, "loss": 0.244, "step": 691 }, { "epoch": 0.485273492286115, "grad_norm": 0.7412816882133484, "learning_rate": 4.902062521422022e-05, "loss": 0.3939, "step": 692 }, { "epoch": 0.48597475455820477, "grad_norm": 0.3135283589363098, "learning_rate": 4.901754395167207e-05, "loss": 0.1009, "step": 693 }, { "epoch": 0.48667601683029454, "grad_norm": 0.2817038297653198, "learning_rate": 4.901445794681867e-05, "loss": 0.0992, "step": 694 }, { "epoch": 0.48737727910238426, "grad_norm": 0.3084399402141571, "learning_rate": 4.901136720026936e-05, "loss": 0.0996, "step": 695 }, { "epoch": 0.48807854137447404, "grad_norm": 0.3122844398021698, "learning_rate": 4.900827171263441e-05, "loss": 0.1004, "step": 696 }, { "epoch": 0.4887798036465638, "grad_norm": 0.2823612689971924, "learning_rate": 4.9005171484525035e-05, "loss": 0.1, "step": 697 }, { "epoch": 0.4894810659186536, "grad_norm": 0.3083200454711914, "learning_rate": 4.9002066516553377e-05, "loss": 0.0996, "step": 698 }, { "epoch": 0.4901823281907433, "grad_norm": 0.30182570219039917, "learning_rate": 4.8998956809332516e-05, "loss": 0.0981, "step": 699 }, { "epoch": 0.4908835904628331, "grad_norm": 0.30629318952560425, "learning_rate": 4.8995842363476465e-05, "loss": 0.0991, "step": 700 }, { "epoch": 0.49158485273492286, "grad_norm": 1.5551071166992188, "learning_rate": 4.899272317960019e-05, "loss": 0.3907, "step": 701 }, { "epoch": 0.49228611500701264, "grad_norm": 0.2764781713485718, "learning_rate": 4.898959925831956e-05, "loss": 0.098, "step": 702 }, { "epoch": 0.49298737727910236, "grad_norm": 0.2718639373779297, "learning_rate": 4.898647060025142e-05, "loss": 0.0966, "step": 703 }, { "epoch": 0.49368863955119213, "grad_norm": 0.31221479177474976, "learning_rate": 4.898333720601352e-05, "loss": 0.098, "step": 704 }, { "epoch": 0.4943899018232819, "grad_norm": 0.27180320024490356, "learning_rate": 4.898019907622454e-05, "loss": 0.2483, "step": 705 }, { "epoch": 0.4950911640953717, "grad_norm": 1.1813615560531616, "learning_rate": 4.8977056211504136e-05, "loss": 0.2603, "step": 706 }, { "epoch": 0.4957924263674614, "grad_norm": 0.32602459192276, "learning_rate": 4.897390861247285e-05, "loss": 0.0962, "step": 707 }, { "epoch": 0.4964936886395512, "grad_norm": 0.27008339762687683, "learning_rate": 4.8970756279752196e-05, "loss": 0.0965, "step": 708 }, { "epoch": 0.49719495091164095, "grad_norm": 0.2991069555282593, "learning_rate": 4.896759921396459e-05, "loss": 0.0967, "step": 709 }, { "epoch": 0.49789621318373073, "grad_norm": 0.270218163728714, "learning_rate": 4.896443741573341e-05, "loss": 0.0965, "step": 710 }, { "epoch": 0.49859747545582045, "grad_norm": 0.2691139578819275, "learning_rate": 4.8961270885682965e-05, "loss": 0.0963, "step": 711 }, { "epoch": 0.4992987377279102, "grad_norm": 0.2977120578289032, "learning_rate": 4.895809962443849e-05, "loss": 0.0963, "step": 712 }, { "epoch": 0.5, "grad_norm": 0.26946088671684265, "learning_rate": 4.895492363262614e-05, "loss": 0.2487, "step": 713 }, { "epoch": 0.5007012622720898, "grad_norm": 0.2672184109687805, "learning_rate": 4.895174291087304e-05, "loss": 0.2501, "step": 714 }, { "epoch": 0.5014025245441796, "grad_norm": 0.2610655426979065, "learning_rate": 4.894855745980722e-05, "loss": 0.094, "step": 715 }, { "epoch": 0.5021037868162693, "grad_norm": 0.2764429748058319, "learning_rate": 4.8945367280057645e-05, "loss": 0.2455, "step": 716 }, { "epoch": 0.5028050490883591, "grad_norm": 0.2727576792240143, "learning_rate": 4.894217237225423e-05, "loss": 0.2522, "step": 717 }, { "epoch": 0.5035063113604488, "grad_norm": 0.26200541853904724, "learning_rate": 4.893897273702782e-05, "loss": 0.0942, "step": 718 }, { "epoch": 0.5042075736325385, "grad_norm": 0.26024267077445984, "learning_rate": 4.893576837501019e-05, "loss": 0.0931, "step": 719 }, { "epoch": 0.5049088359046283, "grad_norm": 0.27946773171424866, "learning_rate": 4.893255928683403e-05, "loss": 0.2533, "step": 720 }, { "epoch": 0.5056100981767181, "grad_norm": 0.29563286900520325, "learning_rate": 4.892934547313299e-05, "loss": 0.2546, "step": 721 }, { "epoch": 0.5063113604488079, "grad_norm": 0.26053547859191895, "learning_rate": 4.892612693454165e-05, "loss": 0.0934, "step": 722 }, { "epoch": 0.5070126227208976, "grad_norm": 0.2800077497959137, "learning_rate": 4.89229036716955e-05, "loss": 0.0917, "step": 723 }, { "epoch": 0.5077138849929874, "grad_norm": 0.2852308750152588, "learning_rate": 4.891967568523099e-05, "loss": 0.2523, "step": 724 }, { "epoch": 0.5084151472650772, "grad_norm": 0.296840101480484, "learning_rate": 4.891644297578549e-05, "loss": 0.0931, "step": 725 }, { "epoch": 0.5091164095371669, "grad_norm": 0.2543898820877075, "learning_rate": 4.8913205543997307e-05, "loss": 0.0913, "step": 726 }, { "epoch": 0.5098176718092566, "grad_norm": 0.2533358633518219, "learning_rate": 4.890996339050568e-05, "loss": 0.0907, "step": 727 }, { "epoch": 0.5105189340813464, "grad_norm": 0.2525581419467926, "learning_rate": 4.8906716515950754e-05, "loss": 0.0903, "step": 728 }, { "epoch": 0.5112201963534362, "grad_norm": 0.7845492959022522, "learning_rate": 4.890346492097366e-05, "loss": 0.4137, "step": 729 }, { "epoch": 0.511921458625526, "grad_norm": 0.25061750411987305, "learning_rate": 4.890020860621641e-05, "loss": 0.0895, "step": 730 }, { "epoch": 0.5126227208976157, "grad_norm": 0.2625655233860016, "learning_rate": 4.889694757232198e-05, "loss": 0.087, "step": 731 }, { "epoch": 0.5133239831697055, "grad_norm": 0.3053722679615021, "learning_rate": 4.8893681819934267e-05, "loss": 0.2531, "step": 732 }, { "epoch": 0.5140252454417953, "grad_norm": 0.30602261424064636, "learning_rate": 4.889041134969809e-05, "loss": 0.2553, "step": 733 }, { "epoch": 0.514726507713885, "grad_norm": 0.31956321001052856, "learning_rate": 4.8887136162259224e-05, "loss": 0.2492, "step": 734 }, { "epoch": 0.5154277699859747, "grad_norm": 0.2636028230190277, "learning_rate": 4.888385625826435e-05, "loss": 0.0868, "step": 735 }, { "epoch": 0.5161290322580645, "grad_norm": 0.83425372838974, "learning_rate": 4.888057163836108e-05, "loss": 0.4256, "step": 736 }, { "epoch": 0.5168302945301543, "grad_norm": 2.0387136936187744, "learning_rate": 4.8877282303197996e-05, "loss": 0.4702, "step": 737 }, { "epoch": 0.517531556802244, "grad_norm": 0.2613907754421234, "learning_rate": 4.887398825342456e-05, "loss": 0.0859, "step": 738 }, { "epoch": 0.5182328190743338, "grad_norm": 0.2498553842306137, "learning_rate": 4.88706894896912e-05, "loss": 0.0892, "step": 739 }, { "epoch": 0.5189340813464236, "grad_norm": 0.7991546988487244, "learning_rate": 4.886738601264925e-05, "loss": 0.4116, "step": 740 }, { "epoch": 0.5196353436185134, "grad_norm": 0.26637622714042664, "learning_rate": 4.8864077822951e-05, "loss": 0.0871, "step": 741 }, { "epoch": 0.520336605890603, "grad_norm": 0.2523796856403351, "learning_rate": 4.8860764921249655e-05, "loss": 0.0903, "step": 742 }, { "epoch": 0.5210378681626928, "grad_norm": 0.25157052278518677, "learning_rate": 4.885744730819935e-05, "loss": 0.0905, "step": 743 }, { "epoch": 0.5217391304347826, "grad_norm": 0.25450435280799866, "learning_rate": 4.885412498445514e-05, "loss": 0.091, "step": 744 }, { "epoch": 0.5224403927068724, "grad_norm": 0.2564050853252411, "learning_rate": 4.885079795067305e-05, "loss": 0.0916, "step": 745 }, { "epoch": 0.5231416549789621, "grad_norm": 0.2707728445529938, "learning_rate": 4.884746620751e-05, "loss": 0.089, "step": 746 }, { "epoch": 0.5238429172510519, "grad_norm": 0.2830233573913574, "learning_rate": 4.8844129755623836e-05, "loss": 0.0891, "step": 747 }, { "epoch": 0.5245441795231417, "grad_norm": 0.2902025580406189, "learning_rate": 4.8840788595673354e-05, "loss": 0.2504, "step": 748 }, { "epoch": 0.5252454417952315, "grad_norm": 0.2779942750930786, "learning_rate": 4.8837442728318274e-05, "loss": 0.089, "step": 749 }, { "epoch": 0.5259467040673211, "grad_norm": 0.30391693115234375, "learning_rate": 4.883409215421924e-05, "loss": 0.2534, "step": 750 }, { "epoch": 0.5266479663394109, "grad_norm": 0.30364367365837097, "learning_rate": 4.883073687403783e-05, "loss": 0.2553, "step": 751 }, { "epoch": 0.5273492286115007, "grad_norm": 0.2620435953140259, "learning_rate": 4.8827376888436555e-05, "loss": 0.0869, "step": 752 }, { "epoch": 0.5280504908835905, "grad_norm": 0.2643459141254425, "learning_rate": 4.8824012198078836e-05, "loss": 0.0873, "step": 753 }, { "epoch": 0.5287517531556802, "grad_norm": 0.32410210371017456, "learning_rate": 4.8820642803629054e-05, "loss": 0.2531, "step": 754 }, { "epoch": 0.52945301542777, "grad_norm": 0.30094748735427856, "learning_rate": 4.88172687057525e-05, "loss": 0.2558, "step": 755 }, { "epoch": 0.5301542776998598, "grad_norm": 0.25741755962371826, "learning_rate": 4.881388990511537e-05, "loss": 0.0859, "step": 756 }, { "epoch": 0.5308555399719496, "grad_norm": 0.8122571706771851, "learning_rate": 4.881050640238485e-05, "loss": 0.424, "step": 757 }, { "epoch": 0.5315568022440392, "grad_norm": 0.24859191477298737, "learning_rate": 4.8807118198229e-05, "loss": 0.0884, "step": 758 }, { "epoch": 0.532258064516129, "grad_norm": 0.24971440434455872, "learning_rate": 4.880372529331681e-05, "loss": 0.0887, "step": 759 }, { "epoch": 0.5329593267882188, "grad_norm": 0.24795487523078918, "learning_rate": 4.8800327688318246e-05, "loss": 0.0886, "step": 760 }, { "epoch": 0.5336605890603086, "grad_norm": 0.24936164915561676, "learning_rate": 4.879692538390416e-05, "loss": 0.0882, "step": 761 }, { "epoch": 0.5343618513323983, "grad_norm": 0.27788427472114563, "learning_rate": 4.879351838074634e-05, "loss": 0.0819, "step": 762 }, { "epoch": 0.5350631136044881, "grad_norm": 0.2479659467935562, "learning_rate": 4.87901066795175e-05, "loss": 0.0878, "step": 763 }, { "epoch": 0.5357643758765779, "grad_norm": 0.2536613941192627, "learning_rate": 4.8786690280891295e-05, "loss": 0.0838, "step": 764 }, { "epoch": 0.5364656381486677, "grad_norm": 0.25151169300079346, "learning_rate": 4.878326918554229e-05, "loss": 0.0829, "step": 765 }, { "epoch": 0.5371669004207573, "grad_norm": 0.25654929876327515, "learning_rate": 4.8779843394146e-05, "loss": 0.0769, "step": 766 }, { "epoch": 0.5378681626928471, "grad_norm": 0.242430180311203, "learning_rate": 4.877641290737884e-05, "loss": 0.0852, "step": 767 }, { "epoch": 0.5385694249649369, "grad_norm": 0.2447461634874344, "learning_rate": 4.877297772591817e-05, "loss": 0.0807, "step": 768 }, { "epoch": 0.5392706872370266, "grad_norm": 0.2588830292224884, "learning_rate": 4.876953785044228e-05, "loss": 0.0691, "step": 769 }, { "epoch": 0.5399719495091164, "grad_norm": 0.35162994265556335, "learning_rate": 4.8766093281630366e-05, "loss": 0.2569, "step": 770 }, { "epoch": 0.5406732117812062, "grad_norm": 0.24198296666145325, "learning_rate": 4.876264402016257e-05, "loss": 0.0839, "step": 771 }, { "epoch": 0.541374474053296, "grad_norm": 0.3604099452495575, "learning_rate": 4.875919006671995e-05, "loss": 0.2581, "step": 772 }, { "epoch": 0.5420757363253857, "grad_norm": 0.23156067728996277, "learning_rate": 4.8755731421984506e-05, "loss": 0.0695, "step": 773 }, { "epoch": 0.5427769985974754, "grad_norm": 0.23768606781959534, "learning_rate": 4.875226808663915e-05, "loss": 0.0807, "step": 774 }, { "epoch": 0.5434782608695652, "grad_norm": 0.9231970906257629, "learning_rate": 4.8748800061367716e-05, "loss": 0.4483, "step": 775 }, { "epoch": 0.544179523141655, "grad_norm": 0.36346161365509033, "learning_rate": 4.8745327346854974e-05, "loss": 0.2644, "step": 776 }, { "epoch": 0.5448807854137447, "grad_norm": 0.23724210262298584, "learning_rate": 4.874184994378662e-05, "loss": 0.0801, "step": 777 }, { "epoch": 0.5455820476858345, "grad_norm": 0.39254194498062134, "learning_rate": 4.8738367852849276e-05, "loss": 0.2547, "step": 778 }, { "epoch": 0.5462833099579243, "grad_norm": 0.2310318797826767, "learning_rate": 4.8734881074730486e-05, "loss": 0.0746, "step": 779 }, { "epoch": 0.5469845722300141, "grad_norm": 0.23677921295166016, "learning_rate": 4.8731389610118715e-05, "loss": 0.0804, "step": 780 }, { "epoch": 0.5476858345021038, "grad_norm": 0.38721612095832825, "learning_rate": 4.872789345970335e-05, "loss": 0.2591, "step": 781 }, { "epoch": 0.5483870967741935, "grad_norm": 0.23847006261348724, "learning_rate": 4.872439262417474e-05, "loss": 0.0809, "step": 782 }, { "epoch": 0.5490883590462833, "grad_norm": 0.3854837119579315, "learning_rate": 4.872088710422411e-05, "loss": 0.2602, "step": 783 }, { "epoch": 0.5497896213183731, "grad_norm": 0.23665423691272736, "learning_rate": 4.8717376900543633e-05, "loss": 0.0794, "step": 784 }, { "epoch": 0.5504908835904628, "grad_norm": 0.238959401845932, "learning_rate": 4.8713862013826414e-05, "loss": 0.0798, "step": 785 }, { "epoch": 0.5511921458625526, "grad_norm": 0.4133860170841217, "learning_rate": 4.871034244476645e-05, "loss": 0.256, "step": 786 }, { "epoch": 0.5518934081346424, "grad_norm": 0.23792241513729095, "learning_rate": 4.8706818194058726e-05, "loss": 0.0796, "step": 787 }, { "epoch": 0.5525946704067322, "grad_norm": 0.22479191422462463, "learning_rate": 4.870328926239907e-05, "loss": 0.0725, "step": 788 }, { "epoch": 0.5532959326788219, "grad_norm": 0.21571429073810577, "learning_rate": 4.86997556504843e-05, "loss": 0.0644, "step": 789 }, { "epoch": 0.5539971949509116, "grad_norm": 0.22238442301750183, "learning_rate": 4.869621735901213e-05, "loss": 0.0708, "step": 790 }, { "epoch": 0.5546984572230014, "grad_norm": 0.39497771859169006, "learning_rate": 4.869267438868119e-05, "loss": 0.2702, "step": 791 }, { "epoch": 0.5553997194950911, "grad_norm": 0.9795714616775513, "learning_rate": 4.868912674019106e-05, "loss": 0.4595, "step": 792 }, { "epoch": 0.5561009817671809, "grad_norm": 0.23683913052082062, "learning_rate": 4.8685574414242224e-05, "loss": 0.0782, "step": 793 }, { "epoch": 0.5568022440392707, "grad_norm": 0.23791305720806122, "learning_rate": 4.868201741153609e-05, "loss": 0.0783, "step": 794 }, { "epoch": 0.5575035063113605, "grad_norm": 0.23569625616073608, "learning_rate": 4.867845573277501e-05, "loss": 0.0778, "step": 795 }, { "epoch": 0.5582047685834503, "grad_norm": 0.22259075939655304, "learning_rate": 4.8674889378662224e-05, "loss": 0.0707, "step": 796 }, { "epoch": 0.55890603085554, "grad_norm": 0.21249957382678986, "learning_rate": 4.867131834990192e-05, "loss": 0.064, "step": 797 }, { "epoch": 0.5596072931276297, "grad_norm": 0.22923441231250763, "learning_rate": 4.866774264719921e-05, "loss": 0.0713, "step": 798 }, { "epoch": 0.5603085553997195, "grad_norm": 0.2281545102596283, "learning_rate": 4.866416227126013e-05, "loss": 0.0707, "step": 799 }, { "epoch": 0.5610098176718092, "grad_norm": 0.21980874240398407, "learning_rate": 4.86605772227916e-05, "loss": 0.0686, "step": 800 }, { "epoch": 0.561711079943899, "grad_norm": 0.2183590829372406, "learning_rate": 4.865698750250153e-05, "loss": 0.0678, "step": 801 }, { "epoch": 0.5624123422159888, "grad_norm": 0.45009830594062805, "learning_rate": 4.8653393111098696e-05, "loss": 0.2667, "step": 802 }, { "epoch": 0.5631136044880786, "grad_norm": 0.22247673571109772, "learning_rate": 4.864979404929283e-05, "loss": 0.0676, "step": 803 }, { "epoch": 0.5638148667601683, "grad_norm": 0.18470662832260132, "learning_rate": 4.864619031779456e-05, "loss": 0.0502, "step": 804 }, { "epoch": 0.5645161290322581, "grad_norm": 0.2177731692790985, "learning_rate": 4.8642581917315454e-05, "loss": 0.0651, "step": 805 }, { "epoch": 0.5652173913043478, "grad_norm": 0.2371188998222351, "learning_rate": 4.863896884856799e-05, "loss": 0.0732, "step": 806 }, { "epoch": 0.5659186535764376, "grad_norm": 0.2155594527721405, "learning_rate": 4.8635351112265596e-05, "loss": 0.0639, "step": 807 }, { "epoch": 0.5666199158485273, "grad_norm": 0.2365003079175949, "learning_rate": 4.8631728709122574e-05, "loss": 0.0702, "step": 808 }, { "epoch": 0.5673211781206171, "grad_norm": 0.2383153736591339, "learning_rate": 4.862810163985418e-05, "loss": 0.0692, "step": 809 }, { "epoch": 0.5680224403927069, "grad_norm": 0.23824547231197357, "learning_rate": 4.8624469905176606e-05, "loss": 0.0686, "step": 810 }, { "epoch": 0.5687237026647967, "grad_norm": 0.5625584125518799, "learning_rate": 4.862083350580692e-05, "loss": 0.2804, "step": 811 }, { "epoch": 0.5694249649368864, "grad_norm": 0.24158518016338348, "learning_rate": 4.861719244246315e-05, "loss": 0.0654, "step": 812 }, { "epoch": 0.5701262272089762, "grad_norm": 0.19111515581607819, "learning_rate": 4.861354671586422e-05, "loss": 0.0498, "step": 813 }, { "epoch": 0.5708274894810659, "grad_norm": 0.596274733543396, "learning_rate": 4.860989632672999e-05, "loss": 0.2928, "step": 814 }, { "epoch": 0.5715287517531557, "grad_norm": 2.1724631786346436, "learning_rate": 4.860624127578124e-05, "loss": 0.7562, "step": 815 }, { "epoch": 0.5722300140252454, "grad_norm": 0.242104172706604, "learning_rate": 4.860258156373964e-05, "loss": 0.0641, "step": 816 }, { "epoch": 0.5729312762973352, "grad_norm": 1.4252182245254517, "learning_rate": 4.8598917191327856e-05, "loss": 0.5337, "step": 817 }, { "epoch": 0.573632538569425, "grad_norm": 0.21480168402194977, "learning_rate": 4.859524815926938e-05, "loss": 0.0574, "step": 818 }, { "epoch": 0.5743338008415148, "grad_norm": 0.2356814593076706, "learning_rate": 4.859157446828868e-05, "loss": 0.066, "step": 819 }, { "epoch": 0.5750350631136045, "grad_norm": 0.2367323487997055, "learning_rate": 4.858789611911115e-05, "loss": 0.0671, "step": 820 }, { "epoch": 0.5757363253856943, "grad_norm": 0.2119174599647522, "learning_rate": 4.858421311246306e-05, "loss": 0.0598, "step": 821 }, { "epoch": 0.576437587657784, "grad_norm": 0.1697162538766861, "learning_rate": 4.8580525449071656e-05, "loss": 0.0442, "step": 822 }, { "epoch": 0.5771388499298737, "grad_norm": 0.23543579876422882, "learning_rate": 4.8576833129665046e-05, "loss": 0.0677, "step": 823 }, { "epoch": 0.5778401122019635, "grad_norm": 1.3052358627319336, "learning_rate": 4.85731361549723e-05, "loss": 0.5079, "step": 824 }, { "epoch": 0.5785413744740533, "grad_norm": 0.21277135610580444, "learning_rate": 4.856943452572338e-05, "loss": 0.0595, "step": 825 }, { "epoch": 0.5792426367461431, "grad_norm": 0.2328493595123291, "learning_rate": 4.8565728242649194e-05, "loss": 0.068, "step": 826 }, { "epoch": 0.5799438990182328, "grad_norm": 0.23273180425167084, "learning_rate": 4.8562017306481545e-05, "loss": 0.0678, "step": 827 }, { "epoch": 0.5806451612903226, "grad_norm": 0.5117275714874268, "learning_rate": 4.855830171795317e-05, "loss": 0.2834, "step": 828 }, { "epoch": 0.5813464235624124, "grad_norm": 0.21251694858074188, "learning_rate": 4.8554581477797705e-05, "loss": 0.0602, "step": 829 }, { "epoch": 0.5820476858345021, "grad_norm": 0.2103821188211441, "learning_rate": 4.855085658674973e-05, "loss": 0.061, "step": 830 }, { "epoch": 0.5827489481065918, "grad_norm": 0.5188913941383362, "learning_rate": 4.8547127045544726e-05, "loss": 0.2758, "step": 831 }, { "epoch": 0.5834502103786816, "grad_norm": 0.20679424703121185, "learning_rate": 4.85433928549191e-05, "loss": 0.0603, "step": 832 }, { "epoch": 0.5841514726507714, "grad_norm": 0.21052111685276031, "learning_rate": 4.853965401561018e-05, "loss": 0.0599, "step": 833 }, { "epoch": 0.5848527349228612, "grad_norm": 0.23384740948677063, "learning_rate": 4.853591052835619e-05, "loss": 0.0697, "step": 834 }, { "epoch": 0.5855539971949509, "grad_norm": 0.5713041424751282, "learning_rate": 4.8532162393896306e-05, "loss": 0.2748, "step": 835 }, { "epoch": 0.5862552594670407, "grad_norm": 0.48402369022369385, "learning_rate": 4.852840961297059e-05, "loss": 0.279, "step": 836 }, { "epoch": 0.5869565217391305, "grad_norm": 0.46432554721832275, "learning_rate": 4.8524652186320044e-05, "loss": 0.2774, "step": 837 }, { "epoch": 0.5876577840112202, "grad_norm": 6.828824520111084, "learning_rate": 4.852089011468657e-05, "loss": 0.9759, "step": 838 }, { "epoch": 0.5883590462833099, "grad_norm": 1.1183139085769653, "learning_rate": 4.8517123398813e-05, "loss": 0.4791, "step": 839 }, { "epoch": 0.5890603085553997, "grad_norm": 0.2336907982826233, "learning_rate": 4.851335203944308e-05, "loss": 0.0719, "step": 840 }, { "epoch": 0.5897615708274895, "grad_norm": 0.2329155057668686, "learning_rate": 4.850957603732147e-05, "loss": 0.0751, "step": 841 }, { "epoch": 0.5904628330995793, "grad_norm": 0.4334680140018463, "learning_rate": 4.850579539319376e-05, "loss": 0.2675, "step": 842 }, { "epoch": 0.591164095371669, "grad_norm": 0.2172098606824875, "learning_rate": 4.8502010107806415e-05, "loss": 0.0689, "step": 843 }, { "epoch": 0.5918653576437588, "grad_norm": 0.3796505331993103, "learning_rate": 4.849822018190687e-05, "loss": 0.2681, "step": 844 }, { "epoch": 0.5925666199158486, "grad_norm": 0.23645739257335663, "learning_rate": 4.849442561624344e-05, "loss": 0.0781, "step": 845 }, { "epoch": 0.5932678821879382, "grad_norm": 0.23531079292297363, "learning_rate": 4.8490626411565385e-05, "loss": 0.0789, "step": 846 }, { "epoch": 0.593969144460028, "grad_norm": 0.2364100068807602, "learning_rate": 4.8486822568622845e-05, "loss": 0.0795, "step": 847 }, { "epoch": 0.5946704067321178, "grad_norm": 0.22449754178524017, "learning_rate": 4.848301408816691e-05, "loss": 0.0725, "step": 848 }, { "epoch": 0.5953716690042076, "grad_norm": 0.9234817028045654, "learning_rate": 4.8479200970949566e-05, "loss": 0.4529, "step": 849 }, { "epoch": 0.5960729312762973, "grad_norm": 0.2194729745388031, "learning_rate": 4.847538321772372e-05, "loss": 0.0661, "step": 850 }, { "epoch": 0.5967741935483871, "grad_norm": 3.910118579864502, "learning_rate": 4.847156082924319e-05, "loss": 0.5946, "step": 851 }, { "epoch": 0.5974754558204769, "grad_norm": 0.38631001114845276, "learning_rate": 4.846773380626272e-05, "loss": 0.2519, "step": 852 }, { "epoch": 0.5981767180925667, "grad_norm": 0.22495704889297485, "learning_rate": 4.8463902149537955e-05, "loss": 0.0747, "step": 853 }, { "epoch": 0.5988779803646563, "grad_norm": 2.862973690032959, "learning_rate": 4.846006585982547e-05, "loss": 0.4658, "step": 854 }, { "epoch": 0.5995792426367461, "grad_norm": 0.32377296686172485, "learning_rate": 4.845622493788273e-05, "loss": 0.2601, "step": 855 }, { "epoch": 0.6002805049088359, "grad_norm": 0.3215973377227783, "learning_rate": 4.845237938446815e-05, "loss": 0.2607, "step": 856 }, { "epoch": 0.6009817671809257, "grad_norm": 2.2663378715515137, "learning_rate": 4.8448529200341034e-05, "loss": 0.391, "step": 857 }, { "epoch": 0.6016830294530154, "grad_norm": 1.3367193937301636, "learning_rate": 4.844467438626161e-05, "loss": 0.5928, "step": 858 }, { "epoch": 0.6023842917251052, "grad_norm": 0.2972490191459656, "learning_rate": 4.8440814942991016e-05, "loss": 0.2501, "step": 859 }, { "epoch": 0.603085553997195, "grad_norm": 0.30776098370552063, "learning_rate": 4.8436950871291314e-05, "loss": 0.2568, "step": 860 }, { "epoch": 0.6037868162692848, "grad_norm": 0.2515738606452942, "learning_rate": 4.843308217192546e-05, "loss": 0.0909, "step": 861 }, { "epoch": 0.6044880785413744, "grad_norm": 0.25672638416290283, "learning_rate": 4.8429208845657334e-05, "loss": 0.0925, "step": 862 }, { "epoch": 0.6051893408134642, "grad_norm": 0.28160253167152405, "learning_rate": 4.842533089325174e-05, "loss": 0.0937, "step": 863 }, { "epoch": 0.605890603085554, "grad_norm": 0.2830201983451843, "learning_rate": 4.842144831547438e-05, "loss": 0.0944, "step": 864 }, { "epoch": 0.6065918653576438, "grad_norm": 0.2602722942829132, "learning_rate": 4.8417561113091884e-05, "loss": 0.094, "step": 865 }, { "epoch": 0.6072931276297335, "grad_norm": 0.7551720142364502, "learning_rate": 4.8413669286871766e-05, "loss": 0.4026, "step": 866 }, { "epoch": 0.6079943899018233, "grad_norm": 0.3841565251350403, "learning_rate": 4.84097728375825e-05, "loss": 0.098, "step": 867 }, { "epoch": 0.6086956521739131, "grad_norm": 0.25730663537979126, "learning_rate": 4.8405871765993433e-05, "loss": 0.2504, "step": 868 }, { "epoch": 0.6093969144460029, "grad_norm": 0.2916770577430725, "learning_rate": 4.840196607287484e-05, "loss": 0.0961, "step": 869 }, { "epoch": 0.6100981767180925, "grad_norm": 0.2611294388771057, "learning_rate": 4.839805575899791e-05, "loss": 0.2488, "step": 870 }, { "epoch": 0.6107994389901823, "grad_norm": 0.2870989441871643, "learning_rate": 4.8394140825134734e-05, "loss": 0.0953, "step": 871 }, { "epoch": 0.6115007012622721, "grad_norm": 0.2609158158302307, "learning_rate": 4.839022127205832e-05, "loss": 0.0945, "step": 872 }, { "epoch": 0.6122019635343618, "grad_norm": 0.2622205317020416, "learning_rate": 4.838629710054261e-05, "loss": 0.095, "step": 873 }, { "epoch": 0.6129032258064516, "grad_norm": 0.264212042093277, "learning_rate": 4.838236831136242e-05, "loss": 0.0952, "step": 874 }, { "epoch": 0.6136044880785414, "grad_norm": 0.2874182164669037, "learning_rate": 4.8378434905293504e-05, "loss": 0.0957, "step": 875 }, { "epoch": 0.6143057503506312, "grad_norm": 0.27731218934059143, "learning_rate": 4.837449688311251e-05, "loss": 0.0938, "step": 876 }, { "epoch": 0.615007012622721, "grad_norm": 1.4610146284103394, "learning_rate": 4.837055424559702e-05, "loss": 0.4155, "step": 877 }, { "epoch": 0.6157082748948106, "grad_norm": 0.2842497229576111, "learning_rate": 4.836660699352551e-05, "loss": 0.2473, "step": 878 }, { "epoch": 0.6164095371669004, "grad_norm": 0.3478318750858307, "learning_rate": 4.836265512767737e-05, "loss": 0.0941, "step": 879 }, { "epoch": 0.6171107994389902, "grad_norm": 1.0187608003616333, "learning_rate": 4.83586986488329e-05, "loss": 0.2432, "step": 880 }, { "epoch": 0.6178120617110799, "grad_norm": 0.280254065990448, "learning_rate": 4.8354737557773324e-05, "loss": 0.2511, "step": 881 }, { "epoch": 0.6185133239831697, "grad_norm": 0.3692956864833832, "learning_rate": 4.8350771855280756e-05, "loss": 0.0971, "step": 882 }, { "epoch": 0.6192145862552595, "grad_norm": 0.29070132970809937, "learning_rate": 4.834680154213823e-05, "loss": 0.0965, "step": 883 }, { "epoch": 0.6199158485273493, "grad_norm": 0.7519779205322266, "learning_rate": 4.8342826619129705e-05, "loss": 0.4062, "step": 884 }, { "epoch": 0.620617110799439, "grad_norm": 1.0148783922195435, "learning_rate": 4.8338847087040015e-05, "loss": 0.2265, "step": 885 }, { "epoch": 0.6213183730715287, "grad_norm": 0.2945418655872345, "learning_rate": 4.8334862946654945e-05, "loss": 0.0974, "step": 886 }, { "epoch": 0.6220196353436185, "grad_norm": 0.26288947463035583, "learning_rate": 4.8330874198761164e-05, "loss": 0.0954, "step": 887 }, { "epoch": 0.6227208976157083, "grad_norm": 0.2582798898220062, "learning_rate": 4.832688084414625e-05, "loss": 0.2474, "step": 888 }, { "epoch": 0.623422159887798, "grad_norm": 0.28370651602745056, "learning_rate": 4.832288288359871e-05, "loss": 0.2581, "step": 889 }, { "epoch": 0.6241234221598878, "grad_norm": 0.26648572087287903, "learning_rate": 4.831888031790793e-05, "loss": 0.0965, "step": 890 }, { "epoch": 0.6248246844319776, "grad_norm": 0.313006192445755, "learning_rate": 4.831487314786425e-05, "loss": 0.1007, "step": 891 }, { "epoch": 0.6255259467040674, "grad_norm": 0.26917025446891785, "learning_rate": 4.8310861374258864e-05, "loss": 0.097, "step": 892 }, { "epoch": 0.6262272089761571, "grad_norm": 0.317917138338089, "learning_rate": 4.830684499788393e-05, "loss": 0.1024, "step": 893 }, { "epoch": 0.6269284712482468, "grad_norm": 0.33523494005203247, "learning_rate": 4.830282401953246e-05, "loss": 0.103, "step": 894 }, { "epoch": 0.6276297335203366, "grad_norm": 0.26023709774017334, "learning_rate": 4.829879843999843e-05, "loss": 0.2508, "step": 895 }, { "epoch": 0.6283309957924264, "grad_norm": 0.2601926028728485, "learning_rate": 4.8294768260076685e-05, "loss": 0.2537, "step": 896 }, { "epoch": 0.6290322580645161, "grad_norm": 0.2997584939002991, "learning_rate": 4.829073348056298e-05, "loss": 0.0983, "step": 897 }, { "epoch": 0.6297335203366059, "grad_norm": 0.2975466251373291, "learning_rate": 4.828669410225402e-05, "loss": 0.0981, "step": 898 }, { "epoch": 0.6304347826086957, "grad_norm": 0.353564977645874, "learning_rate": 4.8282650125947356e-05, "loss": 0.1017, "step": 899 }, { "epoch": 0.6311360448807855, "grad_norm": 0.2881447672843933, "learning_rate": 4.827860155244149e-05, "loss": 0.2537, "step": 900 }, { "epoch": 0.6318373071528752, "grad_norm": 0.26734352111816406, "learning_rate": 4.8274548382535825e-05, "loss": 0.251, "step": 901 }, { "epoch": 0.6325385694249649, "grad_norm": 0.28219926357269287, "learning_rate": 4.827049061703066e-05, "loss": 0.2547, "step": 902 }, { "epoch": 0.6332398316970547, "grad_norm": 1.079776644706726, "learning_rate": 4.826642825672721e-05, "loss": 0.2386, "step": 903 }, { "epoch": 0.6339410939691444, "grad_norm": 0.25551146268844604, "learning_rate": 4.826236130242759e-05, "loss": 0.0928, "step": 904 }, { "epoch": 0.6346423562412342, "grad_norm": 0.2885552942752838, "learning_rate": 4.825828975493484e-05, "loss": 0.0958, "step": 905 }, { "epoch": 0.635343618513324, "grad_norm": 0.2920515239238739, "learning_rate": 4.825421361505288e-05, "loss": 0.0944, "step": 906 }, { "epoch": 0.6360448807854138, "grad_norm": 0.25699445605278015, "learning_rate": 4.8250132883586554e-05, "loss": 0.093, "step": 907 }, { "epoch": 0.6367461430575035, "grad_norm": 0.2725176513195038, "learning_rate": 4.824604756134161e-05, "loss": 0.2523, "step": 908 }, { "epoch": 0.6374474053295933, "grad_norm": 0.2540269196033478, "learning_rate": 4.8241957649124715e-05, "loss": 0.0921, "step": 909 }, { "epoch": 0.638148667601683, "grad_norm": 0.7521713376045227, "learning_rate": 4.823786314774341e-05, "loss": 0.4081, "step": 910 }, { "epoch": 0.6388499298737728, "grad_norm": 0.2819633483886719, "learning_rate": 4.823376405800617e-05, "loss": 0.2533, "step": 911 }, { "epoch": 0.6395511921458625, "grad_norm": 0.25473204255104065, "learning_rate": 4.8229660380722364e-05, "loss": 0.0922, "step": 912 }, { "epoch": 0.6402524544179523, "grad_norm": 0.2530384659767151, "learning_rate": 4.822555211670228e-05, "loss": 0.0919, "step": 913 }, { "epoch": 0.6409537166900421, "grad_norm": 0.29027360677719116, "learning_rate": 4.822143926675709e-05, "loss": 0.2508, "step": 914 }, { "epoch": 0.6416549789621319, "grad_norm": 0.7898009419441223, "learning_rate": 4.821732183169888e-05, "loss": 0.4107, "step": 915 }, { "epoch": 0.6423562412342216, "grad_norm": 0.33096322417259216, "learning_rate": 4.821319981234066e-05, "loss": 0.0944, "step": 916 }, { "epoch": 0.6430575035063114, "grad_norm": 0.2567290663719177, "learning_rate": 4.8209073209496325e-05, "loss": 0.093, "step": 917 }, { "epoch": 0.6437587657784011, "grad_norm": 0.25363409519195557, "learning_rate": 4.820494202398067e-05, "loss": 0.0922, "step": 918 }, { "epoch": 0.6444600280504909, "grad_norm": 0.27545562386512756, "learning_rate": 4.8200806256609415e-05, "loss": 0.0906, "step": 919 }, { "epoch": 0.6451612903225806, "grad_norm": 0.2539672255516052, "learning_rate": 4.8196665908199165e-05, "loss": 0.0921, "step": 920 }, { "epoch": 0.6458625525946704, "grad_norm": 0.27923259139060974, "learning_rate": 4.819252097956746e-05, "loss": 0.2516, "step": 921 }, { "epoch": 0.6465638148667602, "grad_norm": 0.25723204016685486, "learning_rate": 4.818837147153269e-05, "loss": 0.0924, "step": 922 }, { "epoch": 0.64726507713885, "grad_norm": 0.2763945460319519, "learning_rate": 4.818421738491421e-05, "loss": 0.0917, "step": 923 }, { "epoch": 0.6479663394109397, "grad_norm": 0.31166157126426697, "learning_rate": 4.818005872053224e-05, "loss": 0.0909, "step": 924 }, { "epoch": 0.6486676016830295, "grad_norm": 0.2828572690486908, "learning_rate": 4.8175895479207914e-05, "loss": 0.2516, "step": 925 }, { "epoch": 0.6493688639551192, "grad_norm": 0.29512614011764526, "learning_rate": 4.817172766176328e-05, "loss": 0.2555, "step": 926 }, { "epoch": 0.6500701262272089, "grad_norm": 0.2661251723766327, "learning_rate": 4.816755526902127e-05, "loss": 0.088, "step": 927 }, { "epoch": 0.6507713884992987, "grad_norm": 0.29981863498687744, "learning_rate": 4.816337830180574e-05, "loss": 0.2573, "step": 928 }, { "epoch": 0.6514726507713885, "grad_norm": 0.787373423576355, "learning_rate": 4.815919676094144e-05, "loss": 0.4171, "step": 929 }, { "epoch": 0.6521739130434783, "grad_norm": 0.287625789642334, "learning_rate": 4.815501064725401e-05, "loss": 0.0852, "step": 930 }, { "epoch": 0.652875175315568, "grad_norm": 0.2933789789676666, "learning_rate": 4.815081996157e-05, "loss": 0.0867, "step": 931 }, { "epoch": 0.6535764375876578, "grad_norm": 0.26444724202156067, "learning_rate": 4.81466247047169e-05, "loss": 0.0878, "step": 932 }, { "epoch": 0.6542776998597476, "grad_norm": 0.25699806213378906, "learning_rate": 4.8142424877523044e-05, "loss": 0.0854, "step": 933 }, { "epoch": 0.6549789621318373, "grad_norm": 0.8077728748321533, "learning_rate": 4.8138220480817704e-05, "loss": 0.417, "step": 934 }, { "epoch": 0.655680224403927, "grad_norm": 0.2708847224712372, "learning_rate": 4.813401151543104e-05, "loss": 0.0808, "step": 935 }, { "epoch": 0.6563814866760168, "grad_norm": 0.3041734993457794, "learning_rate": 4.812979798219412e-05, "loss": 0.2574, "step": 936 }, { "epoch": 0.6570827489481066, "grad_norm": 0.24633049964904785, "learning_rate": 4.812557988193893e-05, "loss": 0.0881, "step": 937 }, { "epoch": 0.6577840112201964, "grad_norm": 0.2721611261367798, "learning_rate": 4.812135721549832e-05, "loss": 0.0795, "step": 938 }, { "epoch": 0.6584852734922861, "grad_norm": 0.24537232518196106, "learning_rate": 4.8117129983706063e-05, "loss": 0.088, "step": 939 }, { "epoch": 0.6591865357643759, "grad_norm": 0.24516059458255768, "learning_rate": 4.811289818739685e-05, "loss": 0.0874, "step": 940 }, { "epoch": 0.6598877980364657, "grad_norm": 0.25002169609069824, "learning_rate": 4.810866182740624e-05, "loss": 0.0829, "step": 941 }, { "epoch": 0.6605890603085554, "grad_norm": 0.24670995771884918, "learning_rate": 4.8104420904570724e-05, "loss": 0.0815, "step": 942 }, { "epoch": 0.6612903225806451, "grad_norm": 0.24592629075050354, "learning_rate": 4.8100175419727664e-05, "loss": 0.0801, "step": 943 }, { "epoch": 0.6619915848527349, "grad_norm": 4.298756122589111, "learning_rate": 4.809592537371536e-05, "loss": 0.8479, "step": 944 }, { "epoch": 0.6626928471248247, "grad_norm": 0.24862012267112732, "learning_rate": 4.809167076737296e-05, "loss": 0.081, "step": 945 }, { "epoch": 0.6633941093969145, "grad_norm": 0.24551771581172943, "learning_rate": 4.8087411601540566e-05, "loss": 0.0806, "step": 946 }, { "epoch": 0.6640953716690042, "grad_norm": 3.545499801635742, "learning_rate": 4.8083147877059156e-05, "loss": 0.9457, "step": 947 }, { "epoch": 0.664796633941094, "grad_norm": 0.2573499083518982, "learning_rate": 4.80788795947706e-05, "loss": 0.0765, "step": 948 }, { "epoch": 0.6654978962131838, "grad_norm": 0.2394721508026123, "learning_rate": 4.807460675551769e-05, "loss": 0.0798, "step": 949 }, { "epoch": 0.6661991584852734, "grad_norm": 0.33367910981178284, "learning_rate": 4.807032936014409e-05, "loss": 0.2602, "step": 950 }, { "epoch": 0.6669004207573632, "grad_norm": 0.239417165517807, "learning_rate": 4.80660474094944e-05, "loss": 0.0851, "step": 951 }, { "epoch": 0.667601683029453, "grad_norm": 0.25660645961761475, "learning_rate": 4.8061760904414075e-05, "loss": 0.0769, "step": 952 }, { "epoch": 0.6683029453015428, "grad_norm": 0.24100619554519653, "learning_rate": 4.80574698457495e-05, "loss": 0.0795, "step": 953 }, { "epoch": 0.6690042075736325, "grad_norm": 0.3206450343132019, "learning_rate": 4.805317423434797e-05, "loss": 0.2577, "step": 954 }, { "epoch": 0.6697054698457223, "grad_norm": 0.25024205446243286, "learning_rate": 4.804887407105764e-05, "loss": 0.083, "step": 955 }, { "epoch": 0.6704067321178121, "grad_norm": 0.856813371181488, "learning_rate": 4.804456935672759e-05, "loss": 0.4381, "step": 956 }, { "epoch": 0.6711079943899019, "grad_norm": 0.3265904486179352, "learning_rate": 4.80402600922078e-05, "loss": 0.2598, "step": 957 }, { "epoch": 0.6718092566619915, "grad_norm": 0.2394275963306427, "learning_rate": 4.803594627834913e-05, "loss": 0.0851, "step": 958 }, { "epoch": 0.6725105189340813, "grad_norm": 0.23885951936244965, "learning_rate": 4.803162791600336e-05, "loss": 0.0848, "step": 959 }, { "epoch": 0.6732117812061711, "grad_norm": 0.24277149140834808, "learning_rate": 4.802730500602316e-05, "loss": 0.0804, "step": 960 }, { "epoch": 0.6739130434782609, "grad_norm": 0.23784933984279633, "learning_rate": 4.802297754926208e-05, "loss": 0.0848, "step": 961 }, { "epoch": 0.6746143057503506, "grad_norm": 0.3277089297771454, "learning_rate": 4.80186455465746e-05, "loss": 0.2607, "step": 962 }, { "epoch": 0.6753155680224404, "grad_norm": 0.2374608814716339, "learning_rate": 4.801430899881607e-05, "loss": 0.0793, "step": 963 }, { "epoch": 0.6760168302945302, "grad_norm": 2.069669008255005, "learning_rate": 4.8009967906842756e-05, "loss": 0.3661, "step": 964 }, { "epoch": 0.67671809256662, "grad_norm": 0.32493939995765686, "learning_rate": 4.800562227151182e-05, "loss": 0.2568, "step": 965 }, { "epoch": 0.6774193548387096, "grad_norm": 0.8489137291908264, "learning_rate": 4.8001272093681296e-05, "loss": 0.4353, "step": 966 }, { "epoch": 0.6781206171107994, "grad_norm": 0.24699875712394714, "learning_rate": 4.799691737421015e-05, "loss": 0.0826, "step": 967 }, { "epoch": 0.6788218793828892, "grad_norm": 0.2476251870393753, "learning_rate": 4.799255811395823e-05, "loss": 0.083, "step": 968 }, { "epoch": 0.679523141654979, "grad_norm": 0.31077897548675537, "learning_rate": 4.7988194313786275e-05, "loss": 0.2575, "step": 969 }, { "epoch": 0.6802244039270687, "grad_norm": 0.8317369222640991, "learning_rate": 4.798382597455591e-05, "loss": 0.4343, "step": 970 }, { "epoch": 0.6809256661991585, "grad_norm": 0.3065662086009979, "learning_rate": 4.79794530971297e-05, "loss": 0.2547, "step": 971 }, { "epoch": 0.6816269284712483, "grad_norm": 0.2590061128139496, "learning_rate": 4.797507568237105e-05, "loss": 0.0867, "step": 972 }, { "epoch": 0.6823281907433381, "grad_norm": 0.3046721816062927, "learning_rate": 4.7970693731144304e-05, "loss": 0.2566, "step": 973 }, { "epoch": 0.6830294530154277, "grad_norm": 0.30907079577445984, "learning_rate": 4.796630724431468e-05, "loss": 0.2549, "step": 974 }, { "epoch": 0.6837307152875175, "grad_norm": 0.2914944291114807, "learning_rate": 4.7961916222748296e-05, "loss": 0.2539, "step": 975 }, { "epoch": 0.6844319775596073, "grad_norm": 0.291054904460907, "learning_rate": 4.795752066731218e-05, "loss": 0.2508, "step": 976 }, { "epoch": 0.685133239831697, "grad_norm": 0.2474406510591507, "learning_rate": 4.7953120578874223e-05, "loss": 0.0895, "step": 977 }, { "epoch": 0.6858345021037868, "grad_norm": 0.2848660945892334, "learning_rate": 4.7948715958303246e-05, "loss": 0.0867, "step": 978 }, { "epoch": 0.6865357643758766, "grad_norm": 0.24811328947544098, "learning_rate": 4.794430680646893e-05, "loss": 0.0898, "step": 979 }, { "epoch": 0.6872370266479664, "grad_norm": 0.29956844449043274, "learning_rate": 4.79398931242419e-05, "loss": 0.2555, "step": 980 }, { "epoch": 0.6879382889200562, "grad_norm": 0.2671471834182739, "learning_rate": 4.79354749124936e-05, "loss": 0.0897, "step": 981 }, { "epoch": 0.6886395511921458, "grad_norm": 0.26499825716018677, "learning_rate": 4.793105217209646e-05, "loss": 0.0888, "step": 982 }, { "epoch": 0.6893408134642356, "grad_norm": 0.2855158746242523, "learning_rate": 4.7926624903923734e-05, "loss": 0.0851, "step": 983 }, { "epoch": 0.6900420757363254, "grad_norm": 0.2950212061405182, "learning_rate": 4.79221931088496e-05, "loss": 0.251, "step": 984 }, { "epoch": 0.6907433380084151, "grad_norm": 0.2984510660171509, "learning_rate": 4.7917756787749115e-05, "loss": 0.2542, "step": 985 }, { "epoch": 0.6914446002805049, "grad_norm": 0.3055588901042938, "learning_rate": 4.791331594149824e-05, "loss": 0.0814, "step": 986 }, { "epoch": 0.6921458625525947, "grad_norm": 0.24886322021484375, "learning_rate": 4.790887057097384e-05, "loss": 0.0899, "step": 987 }, { "epoch": 0.6928471248246845, "grad_norm": 0.2920214533805847, "learning_rate": 4.7904420677053645e-05, "loss": 0.254, "step": 988 }, { "epoch": 0.6935483870967742, "grad_norm": 0.2591300308704376, "learning_rate": 4.789996626061631e-05, "loss": 0.0867, "step": 989 }, { "epoch": 0.6942496493688639, "grad_norm": 0.2488069385290146, "learning_rate": 4.789550732254135e-05, "loss": 0.0891, "step": 990 }, { "epoch": 0.6949509116409537, "grad_norm": 0.27415943145751953, "learning_rate": 4.789104386370921e-05, "loss": 0.0827, "step": 991 }, { "epoch": 0.6956521739130435, "grad_norm": 0.3065854609012604, "learning_rate": 4.788657588500119e-05, "loss": 0.2537, "step": 992 }, { "epoch": 0.6963534361851332, "grad_norm": 0.2457040399312973, "learning_rate": 4.78821033872995e-05, "loss": 0.088, "step": 993 }, { "epoch": 0.697054698457223, "grad_norm": 0.2455568015575409, "learning_rate": 4.787762637148726e-05, "loss": 0.0875, "step": 994 }, { "epoch": 0.6977559607293128, "grad_norm": 0.3072669208049774, "learning_rate": 4.787314483844845e-05, "loss": 0.2508, "step": 995 }, { "epoch": 0.6984572230014026, "grad_norm": 0.2461680769920349, "learning_rate": 4.786865878906796e-05, "loss": 0.0827, "step": 996 }, { "epoch": 0.6991584852734923, "grad_norm": 0.3161550760269165, "learning_rate": 4.7864168224231563e-05, "loss": 0.2592, "step": 997 }, { "epoch": 0.699859747545582, "grad_norm": 0.3222694396972656, "learning_rate": 4.785967314482594e-05, "loss": 0.2534, "step": 998 }, { "epoch": 0.7005610098176718, "grad_norm": 0.24120812118053436, "learning_rate": 4.785517355173865e-05, "loss": 0.0806, "step": 999 }, { "epoch": 0.7012622720897616, "grad_norm": 2.707901954650879, "learning_rate": 4.7850669445858134e-05, "loss": 0.5639, "step": 1000 }, { "epoch": 0.7019635343618513, "grad_norm": 0.24249106645584106, "learning_rate": 4.784616082807374e-05, "loss": 0.0865, "step": 1001 }, { "epoch": 0.7026647966339411, "grad_norm": 0.2581387758255005, "learning_rate": 4.784164769927571e-05, "loss": 0.0783, "step": 1002 }, { "epoch": 0.7033660589060309, "grad_norm": 0.31591030955314636, "learning_rate": 4.783713006035516e-05, "loss": 0.2581, "step": 1003 }, { "epoch": 0.7040673211781207, "grad_norm": 0.24406127631664276, "learning_rate": 4.783260791220412e-05, "loss": 0.0876, "step": 1004 }, { "epoch": 0.7047685834502104, "grad_norm": 0.24357423186302185, "learning_rate": 4.782808125571547e-05, "loss": 0.0821, "step": 1005 }, { "epoch": 0.7054698457223001, "grad_norm": 0.2490423321723938, "learning_rate": 4.782355009178303e-05, "loss": 0.0823, "step": 1006 }, { "epoch": 0.7061711079943899, "grad_norm": 0.2553521394729614, "learning_rate": 4.781901442130148e-05, "loss": 0.0776, "step": 1007 }, { "epoch": 0.7068723702664796, "grad_norm": 0.2441233992576599, "learning_rate": 4.7814474245166385e-05, "loss": 0.0871, "step": 1008 }, { "epoch": 0.7075736325385694, "grad_norm": 0.24585826694965363, "learning_rate": 4.7809929564274224e-05, "loss": 0.0817, "step": 1009 }, { "epoch": 0.7082748948106592, "grad_norm": 0.3585468530654907, "learning_rate": 4.7805380379522355e-05, "loss": 0.2471, "step": 1010 }, { "epoch": 0.708976157082749, "grad_norm": 0.3170870840549469, "learning_rate": 4.7800826691809006e-05, "loss": 0.2504, "step": 1011 }, { "epoch": 0.7096774193548387, "grad_norm": 0.3265076279640198, "learning_rate": 4.779626850203331e-05, "loss": 0.2584, "step": 1012 }, { "epoch": 0.7103786816269285, "grad_norm": 2.1101510524749756, "learning_rate": 4.779170581109532e-05, "loss": 0.3847, "step": 1013 }, { "epoch": 0.7110799438990182, "grad_norm": 0.854012668132782, "learning_rate": 4.778713861989591e-05, "loss": 0.4302, "step": 1014 }, { "epoch": 0.711781206171108, "grad_norm": 0.33490678668022156, "learning_rate": 4.7782566929336895e-05, "loss": 0.2579, "step": 1015 }, { "epoch": 0.7124824684431977, "grad_norm": 0.24936442077159882, "learning_rate": 4.7777990740320973e-05, "loss": 0.083, "step": 1016 }, { "epoch": 0.7131837307152875, "grad_norm": 0.8109617233276367, "learning_rate": 4.77734100537517e-05, "loss": 0.4209, "step": 1017 }, { "epoch": 0.7138849929873773, "grad_norm": 0.29213276505470276, "learning_rate": 4.776882487053356e-05, "loss": 0.2526, "step": 1018 }, { "epoch": 0.7145862552594671, "grad_norm": 1.3293391466140747, "learning_rate": 4.776423519157189e-05, "loss": 0.5907, "step": 1019 }, { "epoch": 0.7152875175315568, "grad_norm": 0.27557218074798584, "learning_rate": 4.775964101777294e-05, "loss": 0.0817, "step": 1020 }, { "epoch": 0.7159887798036466, "grad_norm": 0.2990712523460388, "learning_rate": 4.7755042350043834e-05, "loss": 0.258, "step": 1021 }, { "epoch": 0.7166900420757363, "grad_norm": 0.2621557414531708, "learning_rate": 4.775043918929258e-05, "loss": 0.0875, "step": 1022 }, { "epoch": 0.717391304347826, "grad_norm": 0.30606698989868164, "learning_rate": 4.77458315364281e-05, "loss": 0.0821, "step": 1023 }, { "epoch": 0.7180925666199158, "grad_norm": 0.25450003147125244, "learning_rate": 4.774121939236016e-05, "loss": 0.0914, "step": 1024 }, { "epoch": 0.7187938288920056, "grad_norm": 0.7723038792610168, "learning_rate": 4.773660275799944e-05, "loss": 0.4086, "step": 1025 }, { "epoch": 0.7194950911640954, "grad_norm": 0.28056085109710693, "learning_rate": 4.7731981634257516e-05, "loss": 0.2539, "step": 1026 }, { "epoch": 0.7201963534361852, "grad_norm": 0.757085919380188, "learning_rate": 4.772735602204683e-05, "loss": 0.4066, "step": 1027 }, { "epoch": 0.7208976157082749, "grad_norm": 1.5208889245986938, "learning_rate": 4.7722725922280716e-05, "loss": 0.2843, "step": 1028 }, { "epoch": 0.7215988779803647, "grad_norm": 0.2938483953475952, "learning_rate": 4.771809133587339e-05, "loss": 0.0881, "step": 1029 }, { "epoch": 0.7223001402524544, "grad_norm": 0.30682387948036194, "learning_rate": 4.771345226373996e-05, "loss": 0.0907, "step": 1030 }, { "epoch": 0.7230014025245441, "grad_norm": 0.26441490650177, "learning_rate": 4.770880870679644e-05, "loss": 0.2486, "step": 1031 }, { "epoch": 0.7237026647966339, "grad_norm": 0.285538911819458, "learning_rate": 4.770416066595967e-05, "loss": 0.0944, "step": 1032 }, { "epoch": 0.7244039270687237, "grad_norm": 0.26655080914497375, "learning_rate": 4.7699508142147444e-05, "loss": 0.0961, "step": 1033 }, { "epoch": 0.7251051893408135, "grad_norm": 0.25600916147232056, "learning_rate": 4.769485113627841e-05, "loss": 0.2463, "step": 1034 }, { "epoch": 0.7258064516129032, "grad_norm": 0.2665409743785858, "learning_rate": 4.7690189649272075e-05, "loss": 0.0965, "step": 1035 }, { "epoch": 0.726507713884993, "grad_norm": 0.2623591125011444, "learning_rate": 4.7685523682048885e-05, "loss": 0.2489, "step": 1036 }, { "epoch": 0.7272089761570828, "grad_norm": 0.2989228665828705, "learning_rate": 4.768085323553014e-05, "loss": 0.0958, "step": 1037 }, { "epoch": 0.7279102384291725, "grad_norm": 0.2622661590576172, "learning_rate": 4.767617831063801e-05, "loss": 0.249, "step": 1038 }, { "epoch": 0.7286115007012622, "grad_norm": 1.1651087999343872, "learning_rate": 4.767149890829559e-05, "loss": 0.2432, "step": 1039 }, { "epoch": 0.729312762973352, "grad_norm": 0.272581547498703, "learning_rate": 4.7666815029426816e-05, "loss": 0.0982, "step": 1040 }, { "epoch": 0.7300140252454418, "grad_norm": 0.7141725420951843, "learning_rate": 4.766212667495654e-05, "loss": 0.3953, "step": 1041 }, { "epoch": 0.7307152875175316, "grad_norm": 0.24843184649944305, "learning_rate": 4.765743384581048e-05, "loss": 0.2442, "step": 1042 }, { "epoch": 0.7314165497896213, "grad_norm": 0.9259204268455505, "learning_rate": 4.7652736542915245e-05, "loss": 0.2176, "step": 1043 }, { "epoch": 0.7321178120617111, "grad_norm": 0.2444864958524704, "learning_rate": 4.764803476719834e-05, "loss": 0.2466, "step": 1044 }, { "epoch": 0.7328190743338009, "grad_norm": 0.2256244719028473, "learning_rate": 4.7643328519588104e-05, "loss": 0.2432, "step": 1045 }, { "epoch": 0.7335203366058906, "grad_norm": 0.2507931888103485, "learning_rate": 4.763861780101382e-05, "loss": 0.2525, "step": 1046 }, { "epoch": 0.7342215988779803, "grad_norm": 0.22847916185855865, "learning_rate": 4.7633902612405624e-05, "loss": 0.2449, "step": 1047 }, { "epoch": 0.7349228611500701, "grad_norm": 0.24027034640312195, "learning_rate": 4.762918295469453e-05, "loss": 0.2443, "step": 1048 }, { "epoch": 0.7356241234221599, "grad_norm": 0.23747925460338593, "learning_rate": 4.762445882881246e-05, "loss": 0.2503, "step": 1049 }, { "epoch": 0.7363253856942497, "grad_norm": 0.30435463786125183, "learning_rate": 4.7619730235692186e-05, "loss": 0.1068, "step": 1050 }, { "epoch": 0.7370266479663394, "grad_norm": 0.30631303787231445, "learning_rate": 4.7614997176267376e-05, "loss": 0.1073, "step": 1051 }, { "epoch": 0.7377279102384292, "grad_norm": 0.5116908550262451, "learning_rate": 4.761025965147258e-05, "loss": 0.1288, "step": 1052 }, { "epoch": 0.738429172510519, "grad_norm": 0.23589937388896942, "learning_rate": 4.760551766224324e-05, "loss": 0.2495, "step": 1053 }, { "epoch": 0.7391304347826086, "grad_norm": 0.5503666996955872, "learning_rate": 4.760077120951567e-05, "loss": 0.133, "step": 1054 }, { "epoch": 0.7398316970546984, "grad_norm": 0.30067455768585205, "learning_rate": 4.759602029422705e-05, "loss": 0.1063, "step": 1055 }, { "epoch": 0.7405329593267882, "grad_norm": 0.302632600069046, "learning_rate": 4.759126491731547e-05, "loss": 0.1066, "step": 1056 }, { "epoch": 0.741234221598878, "grad_norm": 0.30034542083740234, "learning_rate": 4.758650507971989e-05, "loss": 0.106, "step": 1057 }, { "epoch": 0.7419354838709677, "grad_norm": 0.3517294228076935, "learning_rate": 4.758174078238014e-05, "loss": 0.1102, "step": 1058 }, { "epoch": 0.7426367461430575, "grad_norm": 0.29058367013931274, "learning_rate": 4.757697202623693e-05, "loss": 0.1034, "step": 1059 }, { "epoch": 0.7433380084151473, "grad_norm": 0.2841102182865143, "learning_rate": 4.757219881223188e-05, "loss": 0.1016, "step": 1060 }, { "epoch": 0.7440392706872371, "grad_norm": 0.2825700342655182, "learning_rate": 4.7567421141307456e-05, "loss": 0.1015, "step": 1061 }, { "epoch": 0.7447405329593267, "grad_norm": 0.3283899128437042, "learning_rate": 4.756263901440702e-05, "loss": 0.1041, "step": 1062 }, { "epoch": 0.7454417952314165, "grad_norm": 0.3068044185638428, "learning_rate": 4.7557852432474805e-05, "loss": 0.1002, "step": 1063 }, { "epoch": 0.7461430575035063, "grad_norm": 0.2728789448738098, "learning_rate": 4.755306139645594e-05, "loss": 0.0979, "step": 1064 }, { "epoch": 0.7468443197755961, "grad_norm": 0.263884961605072, "learning_rate": 4.754826590729643e-05, "loss": 0.2525, "step": 1065 }, { "epoch": 0.7475455820476858, "grad_norm": 0.7371654510498047, "learning_rate": 4.7543465965943126e-05, "loss": 0.4002, "step": 1066 }, { "epoch": 0.7482468443197756, "grad_norm": 0.7516751885414124, "learning_rate": 4.7538661573343815e-05, "loss": 0.4045, "step": 1067 }, { "epoch": 0.7489481065918654, "grad_norm": 0.2814399302005768, "learning_rate": 4.7533852730447106e-05, "loss": 0.0942, "step": 1068 }, { "epoch": 0.7496493688639552, "grad_norm": 0.3058032989501953, "learning_rate": 4.752903943820254e-05, "loss": 0.249, "step": 1069 }, { "epoch": 0.7503506311360448, "grad_norm": 0.7549634575843811, "learning_rate": 4.752422169756048e-05, "loss": 0.4096, "step": 1070 }, { "epoch": 0.7510518934081346, "grad_norm": 0.2618332803249359, "learning_rate": 4.751939950947222e-05, "loss": 0.0946, "step": 1071 }, { "epoch": 0.7517531556802244, "grad_norm": 0.2783300280570984, "learning_rate": 4.75145728748899e-05, "loss": 0.0925, "step": 1072 }, { "epoch": 0.7524544179523142, "grad_norm": 0.2575899064540863, "learning_rate": 4.750974179476655e-05, "loss": 0.0933, "step": 1073 }, { "epoch": 0.7531556802244039, "grad_norm": 0.2608352303504944, "learning_rate": 4.750490627005607e-05, "loss": 0.0942, "step": 1074 }, { "epoch": 0.7538569424964937, "grad_norm": 0.2744944393634796, "learning_rate": 4.7500066301713254e-05, "loss": 0.0918, "step": 1075 }, { "epoch": 0.7545582047685835, "grad_norm": 0.2554890215396881, "learning_rate": 4.749522189069375e-05, "loss": 0.0925, "step": 1076 }, { "epoch": 0.7552594670406733, "grad_norm": 2.6327297687530518, "learning_rate": 4.7490373037954104e-05, "loss": 0.5047, "step": 1077 }, { "epoch": 0.7559607293127629, "grad_norm": 0.25294172763824463, "learning_rate": 4.7485519744451724e-05, "loss": 0.0916, "step": 1078 }, { "epoch": 0.7566619915848527, "grad_norm": 0.26704955101013184, "learning_rate": 4.7480662011144904e-05, "loss": 0.0908, "step": 1079 }, { "epoch": 0.7573632538569425, "grad_norm": 0.253868043422699, "learning_rate": 4.747579983899281e-05, "loss": 0.0919, "step": 1080 }, { "epoch": 0.7580645161290323, "grad_norm": 0.2536796033382416, "learning_rate": 4.747093322895548e-05, "loss": 0.0917, "step": 1081 }, { "epoch": 0.758765778401122, "grad_norm": 0.2655544877052307, "learning_rate": 4.746606218199385e-05, "loss": 0.0878, "step": 1082 }, { "epoch": 0.7594670406732118, "grad_norm": 0.26282596588134766, "learning_rate": 4.7461186699069716e-05, "loss": 0.088, "step": 1083 }, { "epoch": 0.7601683029453016, "grad_norm": 0.782834529876709, "learning_rate": 4.745630678114573e-05, "loss": 0.4134, "step": 1084 }, { "epoch": 0.7608695652173914, "grad_norm": 0.30833783745765686, "learning_rate": 4.745142242918546e-05, "loss": 0.256, "step": 1085 }, { "epoch": 0.761570827489481, "grad_norm": 0.25399160385131836, "learning_rate": 4.744653364415332e-05, "loss": 0.0861, "step": 1086 }, { "epoch": 0.7622720897615708, "grad_norm": 0.25452277064323425, "learning_rate": 4.744164042701462e-05, "loss": 0.0862, "step": 1087 }, { "epoch": 0.7629733520336606, "grad_norm": 0.7889896035194397, "learning_rate": 4.7436742778735516e-05, "loss": 0.4174, "step": 1088 }, { "epoch": 0.7636746143057503, "grad_norm": 0.2571878433227539, "learning_rate": 4.743184070028307e-05, "loss": 0.0868, "step": 1089 }, { "epoch": 0.7643758765778401, "grad_norm": 1.9019166231155396, "learning_rate": 4.7426934192625204e-05, "loss": 0.4781, "step": 1090 }, { "epoch": 0.7650771388499299, "grad_norm": 0.794689416885376, "learning_rate": 4.7422023256730716e-05, "loss": 0.415, "step": 1091 }, { "epoch": 0.7657784011220197, "grad_norm": 0.2620965242385864, "learning_rate": 4.741710789356927e-05, "loss": 0.0879, "step": 1092 }, { "epoch": 0.7664796633941094, "grad_norm": 0.30132466554641724, "learning_rate": 4.7412188104111436e-05, "loss": 0.2509, "step": 1093 }, { "epoch": 0.7671809256661991, "grad_norm": 0.26039016246795654, "learning_rate": 4.7407263889328604e-05, "loss": 0.0882, "step": 1094 }, { "epoch": 0.7678821879382889, "grad_norm": 1.2806977033615112, "learning_rate": 4.7402335250193096e-05, "loss": 0.5809, "step": 1095 }, { "epoch": 0.7685834502103787, "grad_norm": 0.26505598425865173, "learning_rate": 4.7397402187678066e-05, "loss": 0.0898, "step": 1096 }, { "epoch": 0.7692847124824684, "grad_norm": 0.2676396369934082, "learning_rate": 4.7392464702757546e-05, "loss": 0.0912, "step": 1097 }, { "epoch": 0.7699859747545582, "grad_norm": 0.27367928624153137, "learning_rate": 4.738752279640648e-05, "loss": 0.2473, "step": 1098 }, { "epoch": 0.770687237026648, "grad_norm": 0.2696455121040344, "learning_rate": 4.7382576469600626e-05, "loss": 0.2521, "step": 1099 }, { "epoch": 0.7713884992987378, "grad_norm": 0.2703818380832672, "learning_rate": 4.7377625723316664e-05, "loss": 0.0911, "step": 1100 }, { "epoch": 0.7720897615708275, "grad_norm": 0.25609537959098816, "learning_rate": 4.737267055853212e-05, "loss": 0.093, "step": 1101 }, { "epoch": 0.7727910238429172, "grad_norm": 0.26896342635154724, "learning_rate": 4.7367710976225405e-05, "loss": 0.0905, "step": 1102 }, { "epoch": 0.773492286115007, "grad_norm": 0.7723101377487183, "learning_rate": 4.7362746977375786e-05, "loss": 0.4125, "step": 1103 }, { "epoch": 0.7741935483870968, "grad_norm": 0.2691146731376648, "learning_rate": 4.735777856296343e-05, "loss": 0.0914, "step": 1104 }, { "epoch": 0.7748948106591865, "grad_norm": 0.2563805878162384, "learning_rate": 4.7352805733969335e-05, "loss": 0.093, "step": 1105 }, { "epoch": 0.7755960729312763, "grad_norm": 0.272805392742157, "learning_rate": 4.734782849137541e-05, "loss": 0.0918, "step": 1106 }, { "epoch": 0.7762973352033661, "grad_norm": 0.27109086513519287, "learning_rate": 4.734284683616443e-05, "loss": 0.2499, "step": 1107 }, { "epoch": 0.7769985974754559, "grad_norm": 0.27034929394721985, "learning_rate": 4.7337860769320016e-05, "loss": 0.2486, "step": 1108 }, { "epoch": 0.7776998597475456, "grad_norm": 0.26933553814888, "learning_rate": 4.7332870291826676e-05, "loss": 0.2505, "step": 1109 }, { "epoch": 0.7784011220196353, "grad_norm": 0.2728195786476135, "learning_rate": 4.732787540466979e-05, "loss": 0.0922, "step": 1110 }, { "epoch": 0.7791023842917251, "grad_norm": 1.5769168138504028, "learning_rate": 4.732287610883561e-05, "loss": 0.3054, "step": 1111 }, { "epoch": 0.7798036465638148, "grad_norm": 0.2693023383617401, "learning_rate": 4.731787240531126e-05, "loss": 0.2506, "step": 1112 }, { "epoch": 0.7805049088359046, "grad_norm": 0.27896684408187866, "learning_rate": 4.731286429508472e-05, "loss": 0.0939, "step": 1113 }, { "epoch": 0.7812061711079944, "grad_norm": 0.26295697689056396, "learning_rate": 4.730785177914485e-05, "loss": 0.2495, "step": 1114 }, { "epoch": 0.7819074333800842, "grad_norm": 0.2779906094074249, "learning_rate": 4.7302834858481394e-05, "loss": 0.0937, "step": 1115 }, { "epoch": 0.782608695652174, "grad_norm": 0.3081243634223938, "learning_rate": 4.729781353408493e-05, "loss": 0.0918, "step": 1116 }, { "epoch": 0.7833099579242637, "grad_norm": 0.27877384424209595, "learning_rate": 4.729278780694695e-05, "loss": 0.093, "step": 1117 }, { "epoch": 0.7840112201963534, "grad_norm": 0.2616277039051056, "learning_rate": 4.7287757678059775e-05, "loss": 0.095, "step": 1118 }, { "epoch": 0.7847124824684432, "grad_norm": 0.3485963046550751, "learning_rate": 4.728272314841662e-05, "loss": 0.0928, "step": 1119 }, { "epoch": 0.7854137447405329, "grad_norm": 0.2586495578289032, "learning_rate": 4.727768421901156e-05, "loss": 0.0943, "step": 1120 }, { "epoch": 0.7861150070126227, "grad_norm": 0.2676641643047333, "learning_rate": 4.7272640890839534e-05, "loss": 0.0913, "step": 1121 }, { "epoch": 0.7868162692847125, "grad_norm": 0.27698248624801636, "learning_rate": 4.726759316489637e-05, "loss": 0.0931, "step": 1122 }, { "epoch": 0.7875175315568023, "grad_norm": 0.2673553228378296, "learning_rate": 4.726254104217874e-05, "loss": 0.0908, "step": 1123 }, { "epoch": 0.788218793828892, "grad_norm": 0.2680528461933136, "learning_rate": 4.72574845236842e-05, "loss": 0.2467, "step": 1124 }, { "epoch": 0.7889200561009818, "grad_norm": 0.7540523409843445, "learning_rate": 4.7252423610411154e-05, "loss": 0.409, "step": 1125 }, { "epoch": 0.7896213183730715, "grad_norm": 1.4885246753692627, "learning_rate": 4.724735830335891e-05, "loss": 0.3076, "step": 1126 }, { "epoch": 0.7903225806451613, "grad_norm": 0.28761786222457886, "learning_rate": 4.724228860352761e-05, "loss": 0.249, "step": 1127 }, { "epoch": 0.791023842917251, "grad_norm": 0.27183759212493896, "learning_rate": 4.7237214511918274e-05, "loss": 0.2499, "step": 1128 }, { "epoch": 0.7917251051893408, "grad_norm": 0.2889002859592438, "learning_rate": 4.723213602953279e-05, "loss": 0.0891, "step": 1129 }, { "epoch": 0.7924263674614306, "grad_norm": 1.9018967151641846, "learning_rate": 4.7227053157373916e-05, "loss": 0.6042, "step": 1130 }, { "epoch": 0.7931276297335204, "grad_norm": 0.2720188498497009, "learning_rate": 4.722196589644527e-05, "loss": 0.0929, "step": 1131 }, { "epoch": 0.7938288920056101, "grad_norm": 0.2670721709728241, "learning_rate": 4.7216874247751344e-05, "loss": 0.2472, "step": 1132 }, { "epoch": 0.7945301542776999, "grad_norm": 0.2769507169723511, "learning_rate": 4.72117782122975e-05, "loss": 0.0937, "step": 1133 }, { "epoch": 0.7952314165497896, "grad_norm": 0.26173004508018494, "learning_rate": 4.720667779108994e-05, "loss": 0.247, "step": 1134 }, { "epoch": 0.7959326788218793, "grad_norm": 0.2841971814632416, "learning_rate": 4.7201572985135756e-05, "loss": 0.0957, "step": 1135 }, { "epoch": 0.7966339410939691, "grad_norm": 0.3140200674533844, "learning_rate": 4.719646379544292e-05, "loss": 0.0961, "step": 1136 }, { "epoch": 0.7973352033660589, "grad_norm": 0.2795914113521576, "learning_rate": 4.7191350223020224e-05, "loss": 0.0945, "step": 1137 }, { "epoch": 0.7980364656381487, "grad_norm": 0.2799009382724762, "learning_rate": 4.7186232268877375e-05, "loss": 0.0947, "step": 1138 }, { "epoch": 0.7987377279102384, "grad_norm": 0.2927541136741638, "learning_rate": 4.7181109934024904e-05, "loss": 0.2491, "step": 1139 }, { "epoch": 0.7994389901823282, "grad_norm": 0.2584473192691803, "learning_rate": 4.717598321947424e-05, "loss": 0.0941, "step": 1140 }, { "epoch": 0.800140252454418, "grad_norm": 0.7416365742683411, "learning_rate": 4.717085212623764e-05, "loss": 0.4022, "step": 1141 }, { "epoch": 0.8008415147265077, "grad_norm": 0.2792588770389557, "learning_rate": 4.7165716655328275e-05, "loss": 0.0943, "step": 1142 }, { "epoch": 0.8015427769985974, "grad_norm": 0.2648766040802002, "learning_rate": 4.716057680776013e-05, "loss": 0.2503, "step": 1143 }, { "epoch": 0.8022440392706872, "grad_norm": 0.25996991991996765, "learning_rate": 4.715543258454808e-05, "loss": 0.0946, "step": 1144 }, { "epoch": 0.802945301542777, "grad_norm": 0.262742280960083, "learning_rate": 4.715028398670787e-05, "loss": 0.095, "step": 1145 }, { "epoch": 0.8036465638148668, "grad_norm": 0.7508605122566223, "learning_rate": 4.7145131015256095e-05, "loss": 0.4061, "step": 1146 }, { "epoch": 0.8043478260869565, "grad_norm": 0.2632063329219818, "learning_rate": 4.7139973671210224e-05, "loss": 0.2459, "step": 1147 }, { "epoch": 0.8050490883590463, "grad_norm": 0.302593857049942, "learning_rate": 4.713481195558857e-05, "loss": 0.0916, "step": 1148 }, { "epoch": 0.8057503506311361, "grad_norm": 0.27316465973854065, "learning_rate": 4.712964586941033e-05, "loss": 0.0922, "step": 1149 }, { "epoch": 0.8064516129032258, "grad_norm": 0.29497459530830383, "learning_rate": 4.712447541369556e-05, "loss": 0.0905, "step": 1150 }, { "epoch": 0.8071528751753155, "grad_norm": 0.7540062665939331, "learning_rate": 4.7119300589465164e-05, "loss": 0.406, "step": 1151 }, { "epoch": 0.8078541374474053, "grad_norm": 0.2601810693740845, "learning_rate": 4.7114121397740943e-05, "loss": 0.0944, "step": 1152 }, { "epoch": 0.8085553997194951, "grad_norm": 0.2673753499984741, "learning_rate": 4.710893783954551e-05, "loss": 0.2448, "step": 1153 }, { "epoch": 0.8092566619915849, "grad_norm": 0.25727906823158264, "learning_rate": 4.7103749915902386e-05, "loss": 0.0933, "step": 1154 }, { "epoch": 0.8099579242636746, "grad_norm": 0.27337294816970825, "learning_rate": 4.7098557627835924e-05, "loss": 0.0914, "step": 1155 }, { "epoch": 0.8106591865357644, "grad_norm": 0.27481862902641296, "learning_rate": 4.709336097637135e-05, "loss": 0.2523, "step": 1156 }, { "epoch": 0.8113604488078542, "grad_norm": 1.7498785257339478, "learning_rate": 4.708815996253476e-05, "loss": 0.457, "step": 1157 }, { "epoch": 0.8120617110799438, "grad_norm": 0.27253201603889465, "learning_rate": 4.708295458735311e-05, "loss": 0.0915, "step": 1158 }, { "epoch": 0.8127629733520336, "grad_norm": 0.2747477889060974, "learning_rate": 4.707774485185419e-05, "loss": 0.092, "step": 1159 }, { "epoch": 0.8134642356241234, "grad_norm": 0.2604442238807678, "learning_rate": 4.7072530757066694e-05, "loss": 0.0939, "step": 1160 }, { "epoch": 0.8141654978962132, "grad_norm": 0.30064675211906433, "learning_rate": 4.706731230402013e-05, "loss": 0.0898, "step": 1161 }, { "epoch": 0.814866760168303, "grad_norm": 0.2689192295074463, "learning_rate": 4.70620894937449e-05, "loss": 0.0908, "step": 1162 }, { "epoch": 0.8155680224403927, "grad_norm": 0.25993919372558594, "learning_rate": 4.705686232727226e-05, "loss": 0.0938, "step": 1163 }, { "epoch": 0.8162692847124825, "grad_norm": 0.2943669259548187, "learning_rate": 4.705163080563433e-05, "loss": 0.0893, "step": 1164 }, { "epoch": 0.8169705469845723, "grad_norm": 0.759937047958374, "learning_rate": 4.704639492986407e-05, "loss": 0.4097, "step": 1165 }, { "epoch": 0.8176718092566619, "grad_norm": 0.2572208344936371, "learning_rate": 4.704115470099531e-05, "loss": 0.093, "step": 1166 }, { "epoch": 0.8183730715287517, "grad_norm": 0.2701454758644104, "learning_rate": 4.703591012006276e-05, "loss": 0.0901, "step": 1167 }, { "epoch": 0.8190743338008415, "grad_norm": 0.26014047861099243, "learning_rate": 4.703066118810195e-05, "loss": 0.0928, "step": 1168 }, { "epoch": 0.8197755960729313, "grad_norm": 0.2673785984516144, "learning_rate": 4.702540790614931e-05, "loss": 0.0895, "step": 1169 }, { "epoch": 0.820476858345021, "grad_norm": 0.25396981835365295, "learning_rate": 4.70201502752421e-05, "loss": 0.0914, "step": 1170 }, { "epoch": 0.8211781206171108, "grad_norm": 0.2622472047805786, "learning_rate": 4.701488829641845e-05, "loss": 0.0878, "step": 1171 }, { "epoch": 0.8218793828892006, "grad_norm": 0.2971939444541931, "learning_rate": 4.7009621970717344e-05, "loss": 0.2514, "step": 1172 }, { "epoch": 0.8225806451612904, "grad_norm": 0.29688969254493713, "learning_rate": 4.7004351299178636e-05, "loss": 0.2523, "step": 1173 }, { "epoch": 0.82328190743338, "grad_norm": 0.24907928705215454, "learning_rate": 4.6999076282843013e-05, "loss": 0.0895, "step": 1174 }, { "epoch": 0.8239831697054698, "grad_norm": 0.2563653886318207, "learning_rate": 4.6993796922752055e-05, "loss": 0.0853, "step": 1175 }, { "epoch": 0.8246844319775596, "grad_norm": 0.2519226670265198, "learning_rate": 4.698851321994816e-05, "loss": 0.0846, "step": 1176 }, { "epoch": 0.8253856942496494, "grad_norm": 0.25980815291404724, "learning_rate": 4.698322517547462e-05, "loss": 0.0789, "step": 1177 }, { "epoch": 0.8260869565217391, "grad_norm": 0.25769248604774475, "learning_rate": 4.697793279037557e-05, "loss": 0.0776, "step": 1178 }, { "epoch": 0.8267882187938289, "grad_norm": 0.2507522702217102, "learning_rate": 4.697263606569599e-05, "loss": 0.0829, "step": 1179 }, { "epoch": 0.8274894810659187, "grad_norm": 1.3409552574157715, "learning_rate": 4.696733500248172e-05, "loss": 0.5889, "step": 1180 }, { "epoch": 0.8281907433380085, "grad_norm": 0.24525587260723114, "learning_rate": 4.696202960177949e-05, "loss": 0.0869, "step": 1181 }, { "epoch": 0.8288920056100981, "grad_norm": 0.3113880455493927, "learning_rate": 4.6956719864636836e-05, "loss": 0.2552, "step": 1182 }, { "epoch": 0.8295932678821879, "grad_norm": 0.37778759002685547, "learning_rate": 4.6951405792102196e-05, "loss": 0.2672, "step": 1183 }, { "epoch": 0.8302945301542777, "grad_norm": 0.24035696685314178, "learning_rate": 4.694608738522482e-05, "loss": 0.0851, "step": 1184 }, { "epoch": 0.8309957924263675, "grad_norm": 0.24196834862232208, "learning_rate": 4.6940764645054856e-05, "loss": 0.0805, "step": 1185 }, { "epoch": 0.8316970546984572, "grad_norm": 0.23574328422546387, "learning_rate": 4.693543757264328e-05, "loss": 0.0782, "step": 1186 }, { "epoch": 0.832398316970547, "grad_norm": 0.2415281981229782, "learning_rate": 4.693010616904193e-05, "loss": 0.085, "step": 1187 }, { "epoch": 0.8330995792426368, "grad_norm": 0.23983024060726166, "learning_rate": 4.69247704353035e-05, "loss": 0.0799, "step": 1188 }, { "epoch": 0.8338008415147266, "grad_norm": 0.24173972010612488, "learning_rate": 4.6919430372481545e-05, "loss": 0.0848, "step": 1189 }, { "epoch": 0.8345021037868162, "grad_norm": 0.24538077414035797, "learning_rate": 4.6914085981630465e-05, "loss": 0.0725, "step": 1190 }, { "epoch": 0.835203366058906, "grad_norm": 0.3570541441440582, "learning_rate": 4.6908737263805526e-05, "loss": 0.2535, "step": 1191 }, { "epoch": 0.8359046283309958, "grad_norm": 0.33068758249282837, "learning_rate": 4.690338422006283e-05, "loss": 0.2581, "step": 1192 }, { "epoch": 0.8366058906030855, "grad_norm": 0.23293465375900269, "learning_rate": 4.6898026851459353e-05, "loss": 0.0765, "step": 1193 }, { "epoch": 0.8373071528751753, "grad_norm": 0.34894171357154846, "learning_rate": 4.6892665159052916e-05, "loss": 0.2519, "step": 1194 }, { "epoch": 0.8380084151472651, "grad_norm": 0.22459785640239716, "learning_rate": 4.6887299143902195e-05, "loss": 0.0675, "step": 1195 }, { "epoch": 0.8387096774193549, "grad_norm": 0.22263850271701813, "learning_rate": 4.688192880706671e-05, "loss": 0.0736, "step": 1196 }, { "epoch": 0.8394109396914446, "grad_norm": 0.2279958724975586, "learning_rate": 4.687655414960686e-05, "loss": 0.0744, "step": 1197 }, { "epoch": 0.8401122019635343, "grad_norm": 0.2375834435224533, "learning_rate": 4.687117517258386e-05, "loss": 0.0814, "step": 1198 }, { "epoch": 0.8408134642356241, "grad_norm": 0.23694610595703125, "learning_rate": 4.6865791877059814e-05, "loss": 0.0809, "step": 1199 }, { "epoch": 0.8415147265077139, "grad_norm": 0.23668017983436584, "learning_rate": 4.686040426409767e-05, "loss": 0.0806, "step": 1200 }, { "epoch": 0.8422159887798036, "grad_norm": 0.22806808352470398, "learning_rate": 4.6855012334761194e-05, "loss": 0.0726, "step": 1201 }, { "epoch": 0.8429172510518934, "grad_norm": 0.39075711369514465, "learning_rate": 4.684961609011506e-05, "loss": 0.2634, "step": 1202 }, { "epoch": 0.8436185133239832, "grad_norm": 0.39393171668052673, "learning_rate": 4.684421553122473e-05, "loss": 0.2582, "step": 1203 }, { "epoch": 0.844319775596073, "grad_norm": 0.2343747615814209, "learning_rate": 4.68388106591566e-05, "loss": 0.0781, "step": 1204 }, { "epoch": 0.8450210378681627, "grad_norm": 0.21101199090480804, "learning_rate": 4.683340147497783e-05, "loss": 0.0623, "step": 1205 }, { "epoch": 0.8457223001402524, "grad_norm": 0.40408656001091003, "learning_rate": 4.68279879797565e-05, "loss": 0.2582, "step": 1206 }, { "epoch": 0.8464235624123422, "grad_norm": 0.2059122920036316, "learning_rate": 4.682257017456151e-05, "loss": 0.0613, "step": 1207 }, { "epoch": 0.847124824684432, "grad_norm": 0.23322226107120514, "learning_rate": 4.68171480604626e-05, "loss": 0.0761, "step": 1208 }, { "epoch": 0.8478260869565217, "grad_norm": 0.20682433247566223, "learning_rate": 4.681172163853038e-05, "loss": 0.0608, "step": 1209 }, { "epoch": 0.8485273492286115, "grad_norm": 0.23484937846660614, "learning_rate": 4.6806290909836326e-05, "loss": 0.076, "step": 1210 }, { "epoch": 0.8492286115007013, "grad_norm": 0.4288455545902252, "learning_rate": 4.6800855875452723e-05, "loss": 0.2628, "step": 1211 }, { "epoch": 0.8499298737727911, "grad_norm": 0.19835878908634186, "learning_rate": 4.679541653645273e-05, "loss": 0.0587, "step": 1212 }, { "epoch": 0.8506311360448808, "grad_norm": 0.4290521442890167, "learning_rate": 4.678997289391036e-05, "loss": 0.2598, "step": 1213 }, { "epoch": 0.8513323983169705, "grad_norm": 0.23187117278575897, "learning_rate": 4.678452494890047e-05, "loss": 0.0739, "step": 1214 }, { "epoch": 0.8520336605890603, "grad_norm": 0.21620476245880127, "learning_rate": 4.677907270249876e-05, "loss": 0.0653, "step": 1215 }, { "epoch": 0.85273492286115, "grad_norm": 0.4463881552219391, "learning_rate": 4.67736161557818e-05, "loss": 0.278, "step": 1216 }, { "epoch": 0.8534361851332398, "grad_norm": 0.1965550184249878, "learning_rate": 4.676815530982698e-05, "loss": 0.0571, "step": 1217 }, { "epoch": 0.8541374474053296, "grad_norm": 0.45563003420829773, "learning_rate": 4.6762690165712555e-05, "loss": 0.2639, "step": 1218 }, { "epoch": 0.8548387096774194, "grad_norm": 0.4655260741710663, "learning_rate": 4.675722072451762e-05, "loss": 0.2813, "step": 1219 }, { "epoch": 0.8555399719495091, "grad_norm": 0.23369549214839935, "learning_rate": 4.6751746987322157e-05, "loss": 0.0733, "step": 1220 }, { "epoch": 0.8562412342215989, "grad_norm": 0.21398091316223145, "learning_rate": 4.674626895520693e-05, "loss": 0.0663, "step": 1221 }, { "epoch": 0.8569424964936886, "grad_norm": 0.45610395073890686, "learning_rate": 4.6740786629253594e-05, "loss": 0.2783, "step": 1222 }, { "epoch": 0.8576437587657784, "grad_norm": 0.2130671888589859, "learning_rate": 4.673530001054466e-05, "loss": 0.0656, "step": 1223 }, { "epoch": 0.8583450210378681, "grad_norm": 0.4313875436782837, "learning_rate": 4.672980910016345e-05, "loss": 0.2733, "step": 1224 }, { "epoch": 0.8590462833099579, "grad_norm": 0.23226389288902283, "learning_rate": 4.672431389919416e-05, "loss": 0.0744, "step": 1225 }, { "epoch": 0.8597475455820477, "grad_norm": 0.21392512321472168, "learning_rate": 4.671881440872184e-05, "loss": 0.0664, "step": 1226 }, { "epoch": 0.8604488078541375, "grad_norm": 0.44776198267936707, "learning_rate": 4.671331062983236e-05, "loss": 0.2655, "step": 1227 }, { "epoch": 0.8611500701262272, "grad_norm": 0.45394572615623474, "learning_rate": 4.670780256361245e-05, "loss": 0.2658, "step": 1228 }, { "epoch": 0.861851332398317, "grad_norm": 0.4391431510448456, "learning_rate": 4.670229021114969e-05, "loss": 0.2642, "step": 1229 }, { "epoch": 0.8625525946704067, "grad_norm": 0.23390814661979675, "learning_rate": 4.66967735735325e-05, "loss": 0.0756, "step": 1230 }, { "epoch": 0.8632538569424965, "grad_norm": 0.3965989053249359, "learning_rate": 4.669125265185017e-05, "loss": 0.2681, "step": 1231 }, { "epoch": 0.8639551192145862, "grad_norm": 0.21505074203014374, "learning_rate": 4.6685727447192786e-05, "loss": 0.0683, "step": 1232 }, { "epoch": 0.864656381486676, "grad_norm": 0.9890860319137573, "learning_rate": 4.668019796065133e-05, "loss": 0.4605, "step": 1233 }, { "epoch": 0.8653576437587658, "grad_norm": 0.19596923887729645, "learning_rate": 4.6674664193317595e-05, "loss": 0.0608, "step": 1234 }, { "epoch": 0.8660589060308556, "grad_norm": 0.21796807646751404, "learning_rate": 4.666912614628424e-05, "loss": 0.0701, "step": 1235 }, { "epoch": 0.8667601683029453, "grad_norm": 0.38822802901268005, "learning_rate": 4.666358382064476e-05, "loss": 0.2706, "step": 1236 }, { "epoch": 0.8674614305750351, "grad_norm": 0.23857428133487701, "learning_rate": 4.66580372174935e-05, "loss": 0.0803, "step": 1237 }, { "epoch": 0.8681626928471248, "grad_norm": 0.2239842712879181, "learning_rate": 4.665248633792564e-05, "loss": 0.0724, "step": 1238 }, { "epoch": 0.8688639551192145, "grad_norm": 0.23949959874153137, "learning_rate": 4.664693118303722e-05, "loss": 0.0809, "step": 1239 }, { "epoch": 0.8695652173913043, "grad_norm": 0.24086378514766693, "learning_rate": 4.66413717539251e-05, "loss": 0.0815, "step": 1240 }, { "epoch": 0.8702664796633941, "grad_norm": 0.2243194729089737, "learning_rate": 4.6635808051687014e-05, "loss": 0.0719, "step": 1241 }, { "epoch": 0.8709677419354839, "grad_norm": 0.2396458089351654, "learning_rate": 4.6630240077421524e-05, "loss": 0.0813, "step": 1242 }, { "epoch": 0.8716690042075736, "grad_norm": 0.3594561815261841, "learning_rate": 4.662466783222802e-05, "loss": 0.2628, "step": 1243 }, { "epoch": 0.8723702664796634, "grad_norm": 0.1917501538991928, "learning_rate": 4.6619091317206775e-05, "loss": 0.0549, "step": 1244 }, { "epoch": 0.8730715287517532, "grad_norm": 0.23824982345104218, "learning_rate": 4.661351053345886e-05, "loss": 0.0807, "step": 1245 }, { "epoch": 0.8737727910238429, "grad_norm": 0.3831689953804016, "learning_rate": 4.660792548208622e-05, "loss": 0.2574, "step": 1246 }, { "epoch": 0.8744740532959326, "grad_norm": 0.21972650289535522, "learning_rate": 4.660233616419163e-05, "loss": 0.0719, "step": 1247 }, { "epoch": 0.8751753155680224, "grad_norm": 0.40142184495925903, "learning_rate": 4.659674258087872e-05, "loss": 0.2445, "step": 1248 }, { "epoch": 0.8758765778401122, "grad_norm": 0.23850712180137634, "learning_rate": 4.6591144733251935e-05, "loss": 0.0799, "step": 1249 }, { "epoch": 0.876577840112202, "grad_norm": 0.22030366957187653, "learning_rate": 4.658554262241659e-05, "loss": 0.0705, "step": 1250 }, { "epoch": 0.8772791023842917, "grad_norm": 0.23983080685138702, "learning_rate": 4.657993624947883e-05, "loss": 0.0798, "step": 1251 }, { "epoch": 0.8779803646563815, "grad_norm": 0.22431717813014984, "learning_rate": 4.6574325615545654e-05, "loss": 0.0718, "step": 1252 }, { "epoch": 0.8786816269284713, "grad_norm": 0.23657220602035522, "learning_rate": 4.656871072172487e-05, "loss": 0.0779, "step": 1253 }, { "epoch": 0.879382889200561, "grad_norm": 0.2367831915616989, "learning_rate": 4.6563091569125175e-05, "loss": 0.0776, "step": 1254 }, { "epoch": 0.8800841514726507, "grad_norm": 0.2381071299314499, "learning_rate": 4.655746815885605e-05, "loss": 0.0783, "step": 1255 }, { "epoch": 0.8807854137447405, "grad_norm": 0.41840845346450806, "learning_rate": 4.6551840492027866e-05, "loss": 0.2743, "step": 1256 }, { "epoch": 0.8814866760168303, "grad_norm": 0.4525039494037628, "learning_rate": 4.6546208569751806e-05, "loss": 0.2762, "step": 1257 }, { "epoch": 0.8821879382889201, "grad_norm": 0.23455829918384552, "learning_rate": 4.654057239313992e-05, "loss": 0.0767, "step": 1258 }, { "epoch": 0.8828892005610098, "grad_norm": 1.151419758796692, "learning_rate": 4.6534931963305064e-05, "loss": 0.4733, "step": 1259 }, { "epoch": 0.8835904628330996, "grad_norm": 0.38574957847595215, "learning_rate": 4.6529287281360946e-05, "loss": 0.2649, "step": 1260 }, { "epoch": 0.8842917251051894, "grad_norm": 0.4007299244403839, "learning_rate": 4.652363834842214e-05, "loss": 0.2695, "step": 1261 }, { "epoch": 0.884992987377279, "grad_norm": 1.0146665573120117, "learning_rate": 4.651798516560402e-05, "loss": 0.4583, "step": 1262 }, { "epoch": 0.8856942496493688, "grad_norm": 0.9837619066238403, "learning_rate": 4.651232773402282e-05, "loss": 0.4318, "step": 1263 }, { "epoch": 0.8863955119214586, "grad_norm": 0.18298576772212982, "learning_rate": 4.650666605479561e-05, "loss": 0.0535, "step": 1264 }, { "epoch": 0.8870967741935484, "grad_norm": 0.22086800634860992, "learning_rate": 4.650100012904031e-05, "loss": 0.0734, "step": 1265 }, { "epoch": 0.8877980364656382, "grad_norm": 0.22226445376873016, "learning_rate": 4.6495329957875646e-05, "loss": 0.0739, "step": 1266 }, { "epoch": 0.8884992987377279, "grad_norm": 0.36015430092811584, "learning_rate": 4.648965554242121e-05, "loss": 0.2512, "step": 1267 }, { "epoch": 0.8892005610098177, "grad_norm": 0.20949146151542664, "learning_rate": 4.6483976883797435e-05, "loss": 0.0658, "step": 1268 }, { "epoch": 0.8899018232819075, "grad_norm": 0.20826151967048645, "learning_rate": 4.6478293983125574e-05, "loss": 0.0662, "step": 1269 }, { "epoch": 0.8906030855539971, "grad_norm": 0.22625242173671722, "learning_rate": 4.647260684152773e-05, "loss": 0.0758, "step": 1270 }, { "epoch": 0.8913043478260869, "grad_norm": 0.3450414538383484, "learning_rate": 4.646691546012683e-05, "loss": 0.2486, "step": 1271 }, { "epoch": 0.8920056100981767, "grad_norm": 0.8793926239013672, "learning_rate": 4.6461219840046654e-05, "loss": 0.4257, "step": 1272 }, { "epoch": 0.8927068723702665, "grad_norm": 0.3251304626464844, "learning_rate": 4.645551998241181e-05, "loss": 0.2607, "step": 1273 }, { "epoch": 0.8934081346423562, "grad_norm": 0.21184508502483368, "learning_rate": 4.644981588834775e-05, "loss": 0.0681, "step": 1274 }, { "epoch": 0.894109396914446, "grad_norm": 0.30734875798225403, "learning_rate": 4.644410755898075e-05, "loss": 0.2555, "step": 1275 }, { "epoch": 0.8948106591865358, "grad_norm": 0.25254765152931213, "learning_rate": 4.643839499543793e-05, "loss": 0.0888, "step": 1276 }, { "epoch": 0.8955119214586256, "grad_norm": 5.887379169464111, "learning_rate": 4.6432678198847246e-05, "loss": 0.7178, "step": 1277 }, { "epoch": 0.8962131837307152, "grad_norm": 0.25540220737457275, "learning_rate": 4.6426957170337496e-05, "loss": 0.0898, "step": 1278 }, { "epoch": 0.896914446002805, "grad_norm": 0.25358515977859497, "learning_rate": 4.6421231911038294e-05, "loss": 0.0902, "step": 1279 }, { "epoch": 0.8976157082748948, "grad_norm": 0.2466079741716385, "learning_rate": 4.641550242208012e-05, "loss": 0.0828, "step": 1280 }, { "epoch": 0.8983169705469846, "grad_norm": 0.2561773359775543, "learning_rate": 4.640976870459425e-05, "loss": 0.0913, "step": 1281 }, { "epoch": 0.8990182328190743, "grad_norm": 0.2565630376338959, "learning_rate": 4.640403075971282e-05, "loss": 0.0915, "step": 1282 }, { "epoch": 0.8997194950911641, "grad_norm": 0.24910064041614532, "learning_rate": 4.6398288588568826e-05, "loss": 0.0831, "step": 1283 }, { "epoch": 0.9004207573632539, "grad_norm": 0.24633432924747467, "learning_rate": 4.639254219229604e-05, "loss": 0.083, "step": 1284 }, { "epoch": 0.9011220196353437, "grad_norm": 0.28724923729896545, "learning_rate": 4.63867915720291e-05, "loss": 0.253, "step": 1285 }, { "epoch": 0.9018232819074333, "grad_norm": 0.25480008125305176, "learning_rate": 4.638103672890348e-05, "loss": 0.0911, "step": 1286 }, { "epoch": 0.9025245441795231, "grad_norm": 0.24358530342578888, "learning_rate": 4.6375277664055494e-05, "loss": 0.0823, "step": 1287 }, { "epoch": 0.9032258064516129, "grad_norm": 0.2814571261405945, "learning_rate": 4.636951437862227e-05, "loss": 0.2515, "step": 1288 }, { "epoch": 0.9039270687237027, "grad_norm": 0.30580753087997437, "learning_rate": 4.636374687374178e-05, "loss": 0.249, "step": 1289 }, { "epoch": 0.9046283309957924, "grad_norm": 0.3007729649543762, "learning_rate": 4.635797515055282e-05, "loss": 0.2575, "step": 1290 }, { "epoch": 0.9053295932678822, "grad_norm": 0.2998989224433899, "learning_rate": 4.635219921019504e-05, "loss": 0.2537, "step": 1291 }, { "epoch": 0.906030855539972, "grad_norm": 0.3101722002029419, "learning_rate": 4.63464190538089e-05, "loss": 0.2465, "step": 1292 }, { "epoch": 0.9067321178120618, "grad_norm": 0.2877163290977478, "learning_rate": 4.63406346825357e-05, "loss": 0.2515, "step": 1293 }, { "epoch": 0.9074333800841514, "grad_norm": 0.25233492255210876, "learning_rate": 4.633484609751758e-05, "loss": 0.0904, "step": 1294 }, { "epoch": 0.9081346423562412, "grad_norm": 0.2835732400417328, "learning_rate": 4.6329053299897505e-05, "loss": 0.2519, "step": 1295 }, { "epoch": 0.908835904628331, "grad_norm": 0.25394460558891296, "learning_rate": 4.632325629081928e-05, "loss": 0.0908, "step": 1296 }, { "epoch": 0.9095371669004207, "grad_norm": 0.25106382369995117, "learning_rate": 4.631745507142752e-05, "loss": 0.0845, "step": 1297 }, { "epoch": 0.9102384291725105, "grad_norm": 0.2553684413433075, "learning_rate": 4.6311649642867696e-05, "loss": 0.0857, "step": 1298 }, { "epoch": 0.9109396914446003, "grad_norm": 0.255780965089798, "learning_rate": 4.630584000628609e-05, "loss": 0.0912, "step": 1299 }, { "epoch": 0.9116409537166901, "grad_norm": 0.2532196044921875, "learning_rate": 4.630002616282984e-05, "loss": 0.0912, "step": 1300 }, { "epoch": 0.9123422159887798, "grad_norm": 0.24730892479419708, "learning_rate": 4.629420811364689e-05, "loss": 0.0757, "step": 1301 }, { "epoch": 0.9130434782608695, "grad_norm": 0.25310656428337097, "learning_rate": 4.6288385859886016e-05, "loss": 0.0901, "step": 1302 }, { "epoch": 0.9137447405329593, "grad_norm": 0.25735652446746826, "learning_rate": 4.628255940269685e-05, "loss": 0.0762, "step": 1303 }, { "epoch": 0.9144460028050491, "grad_norm": 0.2517518401145935, "learning_rate": 4.627672874322982e-05, "loss": 0.0901, "step": 1304 }, { "epoch": 0.9151472650771388, "grad_norm": 0.25043922662734985, "learning_rate": 4.627089388263621e-05, "loss": 0.0893, "step": 1305 }, { "epoch": 0.9158485273492286, "grad_norm": 0.2477719783782959, "learning_rate": 4.626505482206811e-05, "loss": 0.0885, "step": 1306 }, { "epoch": 0.9165497896213184, "grad_norm": 0.24747590720653534, "learning_rate": 4.6259211562678474e-05, "loss": 0.0882, "step": 1307 }, { "epoch": 0.9172510518934082, "grad_norm": 0.3237937092781067, "learning_rate": 4.6253364105621046e-05, "loss": 0.249, "step": 1308 }, { "epoch": 0.9179523141654979, "grad_norm": 0.2329646199941635, "learning_rate": 4.624751245205042e-05, "loss": 0.0787, "step": 1309 }, { "epoch": 0.9186535764375876, "grad_norm": 0.2440682202577591, "learning_rate": 4.624165660312202e-05, "loss": 0.0865, "step": 1310 }, { "epoch": 0.9193548387096774, "grad_norm": 0.2436639368534088, "learning_rate": 4.623579655999209e-05, "loss": 0.0858, "step": 1311 }, { "epoch": 0.9200561009817672, "grad_norm": 0.23994649946689606, "learning_rate": 4.6229932323817704e-05, "loss": 0.0849, "step": 1312 }, { "epoch": 0.9207573632538569, "grad_norm": 0.8427737355232239, "learning_rate": 4.6224063895756765e-05, "loss": 0.4319, "step": 1313 }, { "epoch": 0.9214586255259467, "grad_norm": 0.3326450288295746, "learning_rate": 4.6218191276968e-05, "loss": 0.2628, "step": 1314 }, { "epoch": 0.9221598877980365, "grad_norm": 0.32842421531677246, "learning_rate": 4.621231446861099e-05, "loss": 0.2587, "step": 1315 }, { "epoch": 0.9228611500701263, "grad_norm": 0.22603265941143036, "learning_rate": 4.62064334718461e-05, "loss": 0.0762, "step": 1316 }, { "epoch": 0.923562412342216, "grad_norm": 0.2081862986087799, "learning_rate": 4.6200548287834546e-05, "loss": 0.0669, "step": 1317 }, { "epoch": 0.9242636746143057, "grad_norm": 0.22569425404071808, "learning_rate": 4.619465891773837e-05, "loss": 0.0762, "step": 1318 }, { "epoch": 0.9249649368863955, "grad_norm": 0.37584248185157776, "learning_rate": 4.618876536272044e-05, "loss": 0.2454, "step": 1319 }, { "epoch": 0.9256661991584852, "grad_norm": 0.24006323516368866, "learning_rate": 4.6182867623944436e-05, "loss": 0.0842, "step": 1320 }, { "epoch": 0.926367461430575, "grad_norm": 1.4730037450790405, "learning_rate": 4.61769657025749e-05, "loss": 0.5899, "step": 1321 }, { "epoch": 0.9270687237026648, "grad_norm": 0.241683229804039, "learning_rate": 4.6171059599777156e-05, "loss": 0.0846, "step": 1322 }, { "epoch": 0.9277699859747546, "grad_norm": 0.24017193913459778, "learning_rate": 4.616514931671738e-05, "loss": 0.084, "step": 1323 }, { "epoch": 0.9284712482468443, "grad_norm": 0.3358677923679352, "learning_rate": 4.615923485456258e-05, "loss": 0.2637, "step": 1324 }, { "epoch": 0.9291725105189341, "grad_norm": 0.34681248664855957, "learning_rate": 4.6153316214480554e-05, "loss": 0.2527, "step": 1325 }, { "epoch": 0.9298737727910238, "grad_norm": 1.3803893327713013, "learning_rate": 4.614739339763997e-05, "loss": 0.6082, "step": 1326 }, { "epoch": 0.9305750350631136, "grad_norm": 0.24180029332637787, "learning_rate": 4.6141466405210275e-05, "loss": 0.085, "step": 1327 }, { "epoch": 0.9312762973352033, "grad_norm": 0.35854920744895935, "learning_rate": 4.613553523836179e-05, "loss": 0.2664, "step": 1328 }, { "epoch": 0.9319775596072931, "grad_norm": 0.24290727078914642, "learning_rate": 4.612959989826561e-05, "loss": 0.0859, "step": 1329 }, { "epoch": 0.9326788218793829, "grad_norm": 0.3121919631958008, "learning_rate": 4.6123660386093695e-05, "loss": 0.2581, "step": 1330 }, { "epoch": 0.9333800841514727, "grad_norm": 0.24457161128520966, "learning_rate": 4.6117716703018804e-05, "loss": 0.0867, "step": 1331 }, { "epoch": 0.9340813464235624, "grad_norm": 0.23154322803020477, "learning_rate": 4.6111768850214525e-05, "loss": 0.0785, "step": 1332 }, { "epoch": 0.9347826086956522, "grad_norm": 0.3377188742160797, "learning_rate": 4.610581682885528e-05, "loss": 0.2472, "step": 1333 }, { "epoch": 0.9354838709677419, "grad_norm": 0.24633723497390747, "learning_rate": 4.6099860640116296e-05, "loss": 0.0878, "step": 1334 }, { "epoch": 0.9361851332398317, "grad_norm": 0.8178338408470154, "learning_rate": 4.6093900285173655e-05, "loss": 0.425, "step": 1335 }, { "epoch": 0.9368863955119214, "grad_norm": 0.3122860789299011, "learning_rate": 4.60879357652042e-05, "loss": 0.2613, "step": 1336 }, { "epoch": 0.9375876577840112, "grad_norm": 0.2176394909620285, "learning_rate": 4.608196708138568e-05, "loss": 0.0621, "step": 1337 }, { "epoch": 0.938288920056101, "grad_norm": 0.22240909934043884, "learning_rate": 4.607599423489658e-05, "loss": 0.0711, "step": 1338 }, { "epoch": 0.9389901823281908, "grad_norm": 0.25041788816452026, "learning_rate": 4.607001722691628e-05, "loss": 0.0893, "step": 1339 }, { "epoch": 0.9396914446002805, "grad_norm": 0.3087342381477356, "learning_rate": 4.6064036058624936e-05, "loss": 0.2458, "step": 1340 }, { "epoch": 0.9403927068723703, "grad_norm": 0.23618170619010925, "learning_rate": 4.605805073120355e-05, "loss": 0.0798, "step": 1341 }, { "epoch": 0.94109396914446, "grad_norm": 0.21987338364124298, "learning_rate": 4.6052061245833924e-05, "loss": 0.0703, "step": 1342 }, { "epoch": 0.9417952314165497, "grad_norm": 0.2942277193069458, "learning_rate": 4.6046067603698694e-05, "loss": 0.2524, "step": 1343 }, { "epoch": 0.9424964936886395, "grad_norm": 0.34312671422958374, "learning_rate": 4.604006980598132e-05, "loss": 0.2329, "step": 1344 }, { "epoch": 0.9431977559607293, "grad_norm": 0.21100713312625885, "learning_rate": 4.603406785386608e-05, "loss": 0.0617, "step": 1345 }, { "epoch": 0.9438990182328191, "grad_norm": 5.2866411209106445, "learning_rate": 4.602806174853805e-05, "loss": 0.9361, "step": 1346 }, { "epoch": 0.9446002805049089, "grad_norm": 0.23182807862758636, "learning_rate": 4.602205149118318e-05, "loss": 0.08, "step": 1347 }, { "epoch": 0.9453015427769986, "grad_norm": 0.31045612692832947, "learning_rate": 4.601603708298817e-05, "loss": 0.2448, "step": 1348 }, { "epoch": 0.9460028050490884, "grad_norm": 0.25440487265586853, "learning_rate": 4.6010018525140604e-05, "loss": 0.0908, "step": 1349 }, { "epoch": 0.9467040673211781, "grad_norm": 0.2545359432697296, "learning_rate": 4.600399581882884e-05, "loss": 0.0909, "step": 1350 }, { "epoch": 0.9474053295932678, "grad_norm": 0.24201400578022003, "learning_rate": 4.5997968965242075e-05, "loss": 0.0825, "step": 1351 }, { "epoch": 0.9481065918653576, "grad_norm": 0.2394978106021881, "learning_rate": 4.599193796557032e-05, "loss": 0.0816, "step": 1352 }, { "epoch": 0.9488078541374474, "grad_norm": 0.23794499039649963, "learning_rate": 4.5985902821004406e-05, "loss": 0.0747, "step": 1353 }, { "epoch": 0.9495091164095372, "grad_norm": 0.3221774995326996, "learning_rate": 4.597986353273599e-05, "loss": 0.2322, "step": 1354 }, { "epoch": 0.9502103786816269, "grad_norm": 0.8162661790847778, "learning_rate": 4.597382010195753e-05, "loss": 0.4071, "step": 1355 }, { "epoch": 0.9509116409537167, "grad_norm": 0.305083692073822, "learning_rate": 4.596777252986232e-05, "loss": 0.2462, "step": 1356 }, { "epoch": 0.9516129032258065, "grad_norm": 0.2559936046600342, "learning_rate": 4.596172081764446e-05, "loss": 0.0912, "step": 1357 }, { "epoch": 0.9523141654978962, "grad_norm": 0.28394508361816406, "learning_rate": 4.595566496649888e-05, "loss": 0.2514, "step": 1358 }, { "epoch": 0.9530154277699859, "grad_norm": 0.2382439225912094, "learning_rate": 4.59496049776213e-05, "loss": 0.0745, "step": 1359 }, { "epoch": 0.9537166900420757, "grad_norm": 0.2561317980289459, "learning_rate": 4.5943540852208286e-05, "loss": 0.0914, "step": 1360 }, { "epoch": 0.9544179523141655, "grad_norm": 0.3037665784358978, "learning_rate": 4.5937472591457203e-05, "loss": 0.2434, "step": 1361 }, { "epoch": 0.9551192145862553, "grad_norm": 0.2562285363674164, "learning_rate": 4.593140019656625e-05, "loss": 0.0913, "step": 1362 }, { "epoch": 0.955820476858345, "grad_norm": 0.25460687279701233, "learning_rate": 4.5925323668734425e-05, "loss": 0.0913, "step": 1363 }, { "epoch": 0.9565217391304348, "grad_norm": 0.25562524795532227, "learning_rate": 4.591924300916155e-05, "loss": 0.0907, "step": 1364 }, { "epoch": 0.9572230014025246, "grad_norm": 0.24858476221561432, "learning_rate": 4.591315821904827e-05, "loss": 0.0832, "step": 1365 }, { "epoch": 0.9579242636746143, "grad_norm": 0.23811568319797516, "learning_rate": 4.590706929959603e-05, "loss": 0.0818, "step": 1366 }, { "epoch": 0.958625525946704, "grad_norm": 0.23621425032615662, "learning_rate": 4.590097625200709e-05, "loss": 0.081, "step": 1367 }, { "epoch": 0.9593267882187938, "grad_norm": 0.2518707513809204, "learning_rate": 4.5894879077484544e-05, "loss": 0.0901, "step": 1368 }, { "epoch": 0.9600280504908836, "grad_norm": 0.31451287865638733, "learning_rate": 4.5888777777232286e-05, "loss": 0.2445, "step": 1369 }, { "epoch": 0.9607293127629734, "grad_norm": 0.24903994798660278, "learning_rate": 4.588267235245502e-05, "loss": 0.0885, "step": 1370 }, { "epoch": 0.9614305750350631, "grad_norm": 0.3066101670265198, "learning_rate": 4.587656280435828e-05, "loss": 0.258, "step": 1371 }, { "epoch": 0.9621318373071529, "grad_norm": 0.8244637846946716, "learning_rate": 4.587044913414842e-05, "loss": 0.4134, "step": 1372 }, { "epoch": 0.9628330995792427, "grad_norm": 0.22079426050186157, "learning_rate": 4.586433134303257e-05, "loss": 0.0703, "step": 1373 }, { "epoch": 0.9635343618513323, "grad_norm": 7.766087532043457, "learning_rate": 4.585820943221871e-05, "loss": 0.8631, "step": 1374 }, { "epoch": 0.9642356241234221, "grad_norm": 7.180385589599609, "learning_rate": 4.5852083402915624e-05, "loss": 0.8227, "step": 1375 }, { "epoch": 0.9649368863955119, "grad_norm": 0.2973330020904541, "learning_rate": 4.5845953256332904e-05, "loss": 0.2543, "step": 1376 }, { "epoch": 0.9656381486676017, "grad_norm": 0.2521699070930481, "learning_rate": 4.583981899368097e-05, "loss": 0.0899, "step": 1377 }, { "epoch": 0.9663394109396914, "grad_norm": 0.8244422078132629, "learning_rate": 4.583368061617102e-05, "loss": 0.4037, "step": 1378 }, { "epoch": 0.9670406732117812, "grad_norm": 0.23985379934310913, "learning_rate": 4.58275381250151e-05, "loss": 0.0822, "step": 1379 }, { "epoch": 0.967741935483871, "grad_norm": 0.24784153699874878, "learning_rate": 4.582139152142605e-05, "loss": 0.0757, "step": 1380 }, { "epoch": 0.9684431977559608, "grad_norm": 0.24920059740543365, "learning_rate": 4.581524080661754e-05, "loss": 0.0841, "step": 1381 }, { "epoch": 0.9691444600280504, "grad_norm": 0.24819567799568176, "learning_rate": 4.580908598180402e-05, "loss": 0.0843, "step": 1382 }, { "epoch": 0.9698457223001402, "grad_norm": 0.2492281049489975, "learning_rate": 4.580292704820079e-05, "loss": 0.0844, "step": 1383 }, { "epoch": 0.97054698457223, "grad_norm": 0.7856109738349915, "learning_rate": 4.579676400702394e-05, "loss": 0.4047, "step": 1384 }, { "epoch": 0.9712482468443198, "grad_norm": 0.30018311738967896, "learning_rate": 4.579059685949035e-05, "loss": 0.2485, "step": 1385 }, { "epoch": 0.9719495091164095, "grad_norm": 0.2727750837802887, "learning_rate": 4.578442560681776e-05, "loss": 0.2508, "step": 1386 }, { "epoch": 0.9726507713884993, "grad_norm": 0.7776963114738464, "learning_rate": 4.5778250250224685e-05, "loss": 0.4119, "step": 1387 }, { "epoch": 0.9733520336605891, "grad_norm": 0.306197851896286, "learning_rate": 4.577207079093045e-05, "loss": 0.2385, "step": 1388 }, { "epoch": 0.9740532959326789, "grad_norm": 0.26948127150535583, "learning_rate": 4.576588723015522e-05, "loss": 0.0813, "step": 1389 }, { "epoch": 0.9747545582047685, "grad_norm": 0.25820401310920715, "learning_rate": 4.575969956911994e-05, "loss": 0.0932, "step": 1390 }, { "epoch": 0.9754558204768583, "grad_norm": 2.990391492843628, "learning_rate": 4.575350780904637e-05, "loss": 0.4212, "step": 1391 }, { "epoch": 0.9761570827489481, "grad_norm": 0.26485106348991394, "learning_rate": 4.5747311951157086e-05, "loss": 0.2473, "step": 1392 }, { "epoch": 0.9768583450210379, "grad_norm": 0.2634119689464569, "learning_rate": 4.5741111996675464e-05, "loss": 0.0951, "step": 1393 }, { "epoch": 0.9775596072931276, "grad_norm": 0.26336774230003357, "learning_rate": 4.5734907946825714e-05, "loss": 0.0955, "step": 1394 }, { "epoch": 0.9782608695652174, "grad_norm": 0.25498878955841064, "learning_rate": 4.572869980283282e-05, "loss": 0.2487, "step": 1395 }, { "epoch": 0.9789621318373072, "grad_norm": 2.422319173812866, "learning_rate": 4.57224875659226e-05, "loss": 0.3471, "step": 1396 }, { "epoch": 0.979663394109397, "grad_norm": 0.2827964723110199, "learning_rate": 4.5716271237321666e-05, "loss": 0.0936, "step": 1397 }, { "epoch": 0.9803646563814866, "grad_norm": 2.034679651260376, "learning_rate": 4.571005081825745e-05, "loss": 0.4425, "step": 1398 }, { "epoch": 0.9810659186535764, "grad_norm": 0.29625797271728516, "learning_rate": 4.570382630995817e-05, "loss": 0.0966, "step": 1399 }, { "epoch": 0.9817671809256662, "grad_norm": 0.35058730840682983, "learning_rate": 4.569759771365287e-05, "loss": 0.0971, "step": 1400 }, { "epoch": 0.982468443197756, "grad_norm": 0.2744004428386688, "learning_rate": 4.569136503057141e-05, "loss": 0.0996, "step": 1401 }, { "epoch": 0.9831697054698457, "grad_norm": 0.27430325746536255, "learning_rate": 4.5685128261944435e-05, "loss": 0.0994, "step": 1402 }, { "epoch": 0.9838709677419355, "grad_norm": 0.27541935443878174, "learning_rate": 4.5678887409003396e-05, "loss": 0.0998, "step": 1403 }, { "epoch": 0.9845722300140253, "grad_norm": 0.27593210339546204, "learning_rate": 4.567264247298058e-05, "loss": 0.1, "step": 1404 }, { "epoch": 0.985273492286115, "grad_norm": 0.7075673937797546, "learning_rate": 4.566639345510904e-05, "loss": 0.389, "step": 1405 }, { "epoch": 0.9859747545582047, "grad_norm": 0.2564747929573059, "learning_rate": 4.566014035662267e-05, "loss": 0.2523, "step": 1406 }, { "epoch": 0.9866760168302945, "grad_norm": 0.341265469789505, "learning_rate": 4.565388317875615e-05, "loss": 0.104, "step": 1407 }, { "epoch": 0.9873772791023843, "grad_norm": 2.164001226425171, "learning_rate": 4.564762192274498e-05, "loss": 0.4343, "step": 1408 }, { "epoch": 0.988078541374474, "grad_norm": 0.2744847536087036, "learning_rate": 4.5641356589825436e-05, "loss": 0.0998, "step": 1409 }, { "epoch": 0.9887798036465638, "grad_norm": 0.2378116101026535, "learning_rate": 4.5635087181234636e-05, "loss": 0.2467, "step": 1410 }, { "epoch": 0.9894810659186536, "grad_norm": 0.2791043817996979, "learning_rate": 4.562881369821048e-05, "loss": 0.1006, "step": 1411 }, { "epoch": 0.9901823281907434, "grad_norm": 0.2827954888343811, "learning_rate": 4.562253614199167e-05, "loss": 0.1015, "step": 1412 }, { "epoch": 0.9908835904628331, "grad_norm": 0.7585847973823547, "learning_rate": 4.561625451381773e-05, "loss": 0.1706, "step": 1413 }, { "epoch": 0.9915848527349228, "grad_norm": 0.38553163409233093, "learning_rate": 4.560996881492897e-05, "loss": 0.112, "step": 1414 }, { "epoch": 0.9922861150070126, "grad_norm": 0.25067636370658875, "learning_rate": 4.5603679046566525e-05, "loss": 0.24, "step": 1415 }, { "epoch": 0.9929873772791024, "grad_norm": 0.28044822812080383, "learning_rate": 4.5597385209972316e-05, "loss": 0.2569, "step": 1416 }, { "epoch": 0.9936886395511921, "grad_norm": 0.25493404269218445, "learning_rate": 4.5591087306389065e-05, "loss": 0.2365, "step": 1417 }, { "epoch": 0.9943899018232819, "grad_norm": 0.29073822498321533, "learning_rate": 4.5584785337060305e-05, "loss": 0.1036, "step": 1418 }, { "epoch": 0.9950911640953717, "grad_norm": 0.45427462458610535, "learning_rate": 4.557847930323037e-05, "loss": 0.1203, "step": 1419 }, { "epoch": 0.9957924263674615, "grad_norm": 0.295771986246109, "learning_rate": 4.557216920614441e-05, "loss": 0.2581, "step": 1420 }, { "epoch": 0.9964936886395512, "grad_norm": 0.2884272634983063, "learning_rate": 4.556585504704835e-05, "loss": 0.1027, "step": 1421 }, { "epoch": 0.9971949509116409, "grad_norm": 0.2842670679092407, "learning_rate": 4.555953682718894e-05, "loss": 0.1019, "step": 1422 }, { "epoch": 0.9978962131837307, "grad_norm": 0.23879435658454895, "learning_rate": 4.5553214547813715e-05, "loss": 0.2452, "step": 1423 }, { "epoch": 0.9985974754558204, "grad_norm": 0.28411993384361267, "learning_rate": 4.554688821017102e-05, "loss": 0.1017, "step": 1424 }, { "epoch": 0.9992987377279102, "grad_norm": 0.8383079767227173, "learning_rate": 4.554055781551002e-05, "loss": 0.1473, "step": 1425 }, { "epoch": 1.0, "grad_norm": 0.288871169090271, "learning_rate": 4.5534223365080644e-05, "loss": 0.1017, "step": 1426 }, { "epoch": 1.0, "eval_f1 (minor class)": 0.1200889547813195, "eval_loss": 0.17408692836761475, "eval_roc_auc": 0.5250917865352013, "eval_runtime": 233.2678, "eval_samples_per_second": 5.436, "eval_steps_per_second": 1.359, "step": 1426 }, { "epoch": 1.0007012622720897, "grad_norm": 0.37876200675964355, "learning_rate": 4.552788486013364e-05, "loss": 0.1077, "step": 1427 }, { "epoch": 1.0014025245441796, "grad_norm": 0.45390892028808594, "learning_rate": 4.552154230192056e-05, "loss": 0.1101, "step": 1428 }, { "epoch": 1.0021037868162692, "grad_norm": 1.3102214336395264, "learning_rate": 4.551519569169376e-05, "loss": 0.336, "step": 1429 }, { "epoch": 1.002805049088359, "grad_norm": 0.2605941593647003, "learning_rate": 4.550884503070638e-05, "loss": 0.2493, "step": 1430 }, { "epoch": 1.0035063113604488, "grad_norm": 0.26678207516670227, "learning_rate": 4.550249032021237e-05, "loss": 0.0961, "step": 1431 }, { "epoch": 1.0042075736325387, "grad_norm": 0.27871862053871155, "learning_rate": 4.549613156146648e-05, "loss": 0.2574, "step": 1432 }, { "epoch": 1.0049088359046283, "grad_norm": 0.2599840760231018, "learning_rate": 4.548976875572426e-05, "loss": 0.0945, "step": 1433 }, { "epoch": 1.0056100981767182, "grad_norm": 0.31192582845687866, "learning_rate": 4.548340190424206e-05, "loss": 0.0976, "step": 1434 }, { "epoch": 1.0063113604488079, "grad_norm": 0.3593388497829437, "learning_rate": 4.547703100827701e-05, "loss": 0.0989, "step": 1435 }, { "epoch": 1.0070126227208975, "grad_norm": 0.25751352310180664, "learning_rate": 4.547065606908708e-05, "loss": 0.0933, "step": 1436 }, { "epoch": 1.0077138849929874, "grad_norm": 0.2923816740512848, "learning_rate": 4.546427708793099e-05, "loss": 0.0933, "step": 1437 }, { "epoch": 1.008415147265077, "grad_norm": 0.2516046464443207, "learning_rate": 4.545789406606829e-05, "loss": 0.0918, "step": 1438 }, { "epoch": 1.009116409537167, "grad_norm": 0.2803119421005249, "learning_rate": 4.545150700475932e-05, "loss": 0.0914, "step": 1439 }, { "epoch": 1.0098176718092566, "grad_norm": 0.2703472673892975, "learning_rate": 4.544511590526521e-05, "loss": 0.0889, "step": 1440 }, { "epoch": 1.0105189340813465, "grad_norm": 0.24711363017559052, "learning_rate": 4.54387207688479e-05, "loss": 0.0893, "step": 1441 }, { "epoch": 1.0112201963534362, "grad_norm": 0.2443845123052597, "learning_rate": 4.543232159677011e-05, "loss": 0.0886, "step": 1442 }, { "epoch": 1.0119214586255258, "grad_norm": 0.8128860592842102, "learning_rate": 4.5425918390295385e-05, "loss": 0.4242, "step": 1443 }, { "epoch": 1.0126227208976157, "grad_norm": 0.24508877098560333, "learning_rate": 4.541951115068803e-05, "loss": 0.0878, "step": 1444 }, { "epoch": 1.0133239831697054, "grad_norm": 0.2505439221858978, "learning_rate": 4.5413099879213184e-05, "loss": 0.0831, "step": 1445 }, { "epoch": 1.0140252454417953, "grad_norm": 0.27523377537727356, "learning_rate": 4.540668457713675e-05, "loss": 0.081, "step": 1446 }, { "epoch": 1.014726507713885, "grad_norm": 2.0952515602111816, "learning_rate": 4.540026524572545e-05, "loss": 0.4993, "step": 1447 }, { "epoch": 1.0154277699859748, "grad_norm": 0.3205340802669525, "learning_rate": 4.539384188624678e-05, "loss": 0.2514, "step": 1448 }, { "epoch": 1.0161290322580645, "grad_norm": 0.3117935359477997, "learning_rate": 4.5387414499969054e-05, "loss": 0.2548, "step": 1449 }, { "epoch": 1.0168302945301542, "grad_norm": 0.2478097677230835, "learning_rate": 4.538098308816137e-05, "loss": 0.0828, "step": 1450 }, { "epoch": 1.017531556802244, "grad_norm": 0.8301771283149719, "learning_rate": 4.537454765209361e-05, "loss": 0.4243, "step": 1451 }, { "epoch": 1.0182328190743337, "grad_norm": 0.3093932569026947, "learning_rate": 4.5368108193036476e-05, "loss": 0.2557, "step": 1452 }, { "epoch": 1.0189340813464236, "grad_norm": 0.30550289154052734, "learning_rate": 4.536166471226144e-05, "loss": 0.2601, "step": 1453 }, { "epoch": 1.0196353436185133, "grad_norm": 0.30653810501098633, "learning_rate": 4.535521721104078e-05, "loss": 0.2541, "step": 1454 }, { "epoch": 1.0203366058906032, "grad_norm": 0.24016809463500977, "learning_rate": 4.534876569064758e-05, "loss": 0.0871, "step": 1455 }, { "epoch": 1.0210378681626928, "grad_norm": 0.30314552783966064, "learning_rate": 4.534231015235568e-05, "loss": 0.2514, "step": 1456 }, { "epoch": 1.0217391304347827, "grad_norm": 0.8025575876235962, "learning_rate": 4.533585059743976e-05, "loss": 0.418, "step": 1457 }, { "epoch": 1.0224403927068724, "grad_norm": 0.296852171421051, "learning_rate": 4.532938702717525e-05, "loss": 0.2573, "step": 1458 }, { "epoch": 1.023141654978962, "grad_norm": 0.24408957362174988, "learning_rate": 4.5322919442838404e-05, "loss": 0.0886, "step": 1459 }, { "epoch": 1.023842917251052, "grad_norm": 0.2449481189250946, "learning_rate": 4.531644784570626e-05, "loss": 0.0886, "step": 1460 }, { "epoch": 1.0245441795231416, "grad_norm": 0.29466503858566284, "learning_rate": 4.530997223705664e-05, "loss": 0.2507, "step": 1461 }, { "epoch": 1.0252454417952315, "grad_norm": 0.2924681007862091, "learning_rate": 4.5303492618168164e-05, "loss": 0.2559, "step": 1462 }, { "epoch": 1.0259467040673211, "grad_norm": 0.2867869436740875, "learning_rate": 4.529700899032026e-05, "loss": 0.2545, "step": 1463 }, { "epoch": 1.026647966339411, "grad_norm": 0.2465362846851349, "learning_rate": 4.5290521354793106e-05, "loss": 0.0897, "step": 1464 }, { "epoch": 1.0273492286115007, "grad_norm": 0.24829934537410736, "learning_rate": 4.528402971286771e-05, "loss": 0.0899, "step": 1465 }, { "epoch": 1.0280504908835906, "grad_norm": 0.2604140639305115, "learning_rate": 4.527753406582585e-05, "loss": 0.0876, "step": 1466 }, { "epoch": 1.0287517531556802, "grad_norm": 0.28749263286590576, "learning_rate": 4.527103441495011e-05, "loss": 0.0867, "step": 1467 }, { "epoch": 1.02945301542777, "grad_norm": 0.31079867482185364, "learning_rate": 4.526453076152387e-05, "loss": 0.0835, "step": 1468 }, { "epoch": 1.0301542776998598, "grad_norm": 0.2466721087694168, "learning_rate": 4.525802310683126e-05, "loss": 0.0896, "step": 1469 }, { "epoch": 1.0308555399719495, "grad_norm": 0.24847888946533203, "learning_rate": 4.525151145215725e-05, "loss": 0.0901, "step": 1470 }, { "epoch": 1.0315568022440393, "grad_norm": 0.29355722665786743, "learning_rate": 4.524499579878756e-05, "loss": 0.2507, "step": 1471 }, { "epoch": 1.032258064516129, "grad_norm": 0.2928573191165924, "learning_rate": 4.5238476148008726e-05, "loss": 0.25, "step": 1472 }, { "epoch": 1.032959326788219, "grad_norm": 0.7910961508750916, "learning_rate": 4.523195250110807e-05, "loss": 0.4227, "step": 1473 }, { "epoch": 1.0336605890603086, "grad_norm": 2.560176372528076, "learning_rate": 4.522542485937369e-05, "loss": 0.6754, "step": 1474 }, { "epoch": 1.0343618513323982, "grad_norm": 0.253307580947876, "learning_rate": 4.5218893224094475e-05, "loss": 0.0853, "step": 1475 }, { "epoch": 1.035063113604488, "grad_norm": 0.2575397193431854, "learning_rate": 4.521235759656012e-05, "loss": 0.0869, "step": 1476 }, { "epoch": 1.0357643758765778, "grad_norm": 0.24987660348415375, "learning_rate": 4.520581797806109e-05, "loss": 0.0906, "step": 1477 }, { "epoch": 1.0364656381486677, "grad_norm": 0.2816125452518463, "learning_rate": 4.519927436988864e-05, "loss": 0.2481, "step": 1478 }, { "epoch": 1.0371669004207573, "grad_norm": 0.24857982993125916, "learning_rate": 4.5192726773334816e-05, "loss": 0.0903, "step": 1479 }, { "epoch": 1.0378681626928472, "grad_norm": 0.2597651481628418, "learning_rate": 4.518617518969246e-05, "loss": 0.0875, "step": 1480 }, { "epoch": 1.0385694249649369, "grad_norm": 0.28540512919425964, "learning_rate": 4.517961962025519e-05, "loss": 0.2507, "step": 1481 }, { "epoch": 1.0392706872370265, "grad_norm": 0.30471858382225037, "learning_rate": 4.517306006631742e-05, "loss": 0.246, "step": 1482 }, { "epoch": 1.0399719495091164, "grad_norm": 0.2501063644886017, "learning_rate": 4.516649652917433e-05, "loss": 0.091, "step": 1483 }, { "epoch": 1.040673211781206, "grad_norm": 0.306984007358551, "learning_rate": 4.5159929010121915e-05, "loss": 0.0806, "step": 1484 }, { "epoch": 1.041374474053296, "grad_norm": 0.2582702934741974, "learning_rate": 4.515335751045694e-05, "loss": 0.087, "step": 1485 }, { "epoch": 1.0420757363253856, "grad_norm": 0.26104986667633057, "learning_rate": 4.5146782031476954e-05, "loss": 0.0882, "step": 1486 }, { "epoch": 1.0427769985974755, "grad_norm": 0.2888221740722656, "learning_rate": 4.514020257448031e-05, "loss": 0.0833, "step": 1487 }, { "epoch": 1.0434782608695652, "grad_norm": 0.2537221908569336, "learning_rate": 4.513361914076611e-05, "loss": 0.0854, "step": 1488 }, { "epoch": 1.044179523141655, "grad_norm": 0.29407888650894165, "learning_rate": 4.5127031731634285e-05, "loss": 0.2472, "step": 1489 }, { "epoch": 1.0448807854137447, "grad_norm": 0.3155667781829834, "learning_rate": 4.5120440348385524e-05, "loss": 0.2479, "step": 1490 }, { "epoch": 1.0455820476858344, "grad_norm": 0.25144967436790466, "learning_rate": 4.51138449923213e-05, "loss": 0.084, "step": 1491 }, { "epoch": 1.0462833099579243, "grad_norm": 0.2481483519077301, "learning_rate": 4.510724566474389e-05, "loss": 0.0836, "step": 1492 }, { "epoch": 1.046984572230014, "grad_norm": 0.30683591961860657, "learning_rate": 4.510064236695633e-05, "loss": 0.2597, "step": 1493 }, { "epoch": 1.0476858345021038, "grad_norm": 0.2563723921775818, "learning_rate": 4.509403510026246e-05, "loss": 0.0768, "step": 1494 }, { "epoch": 1.0483870967741935, "grad_norm": 0.24517430365085602, "learning_rate": 4.508742386596689e-05, "loss": 0.0873, "step": 1495 }, { "epoch": 1.0490883590462834, "grad_norm": 0.2390676885843277, "learning_rate": 4.508080866537503e-05, "loss": 0.0802, "step": 1496 }, { "epoch": 1.049789621318373, "grad_norm": 0.23691008985042572, "learning_rate": 4.5074189499793064e-05, "loss": 0.0798, "step": 1497 }, { "epoch": 1.050490883590463, "grad_norm": 2.9752190113067627, "learning_rate": 4.506756637052794e-05, "loss": 0.4427, "step": 1498 }, { "epoch": 1.0511921458625526, "grad_norm": 0.24278704822063446, "learning_rate": 4.5060939278887415e-05, "loss": 0.0718, "step": 1499 }, { "epoch": 1.0518934081346423, "grad_norm": 0.24183033406734467, "learning_rate": 4.505430822618002e-05, "loss": 0.0855, "step": 1500 }, { "epoch": 1.0525946704067322, "grad_norm": 0.33012595772743225, "learning_rate": 4.504767321371507e-05, "loss": 0.2545, "step": 1501 }, { "epoch": 1.0532959326788218, "grad_norm": 0.8503175377845764, "learning_rate": 4.504103424280266e-05, "loss": 0.4381, "step": 1502 }, { "epoch": 1.0539971949509117, "grad_norm": 0.24102482199668884, "learning_rate": 4.503439131475367e-05, "loss": 0.085, "step": 1503 }, { "epoch": 1.0546984572230014, "grad_norm": 0.8454064130783081, "learning_rate": 4.502774443087975e-05, "loss": 0.4343, "step": 1504 }, { "epoch": 1.0553997194950913, "grad_norm": 0.2412094622850418, "learning_rate": 4.5021093592493335e-05, "loss": 0.0854, "step": 1505 }, { "epoch": 1.056100981767181, "grad_norm": 0.8627410531044006, "learning_rate": 4.501443880090766e-05, "loss": 0.4209, "step": 1506 }, { "epoch": 1.0568022440392706, "grad_norm": 0.2433081418275833, "learning_rate": 4.50077800574367e-05, "loss": 0.0866, "step": 1507 }, { "epoch": 1.0575035063113605, "grad_norm": 0.240680530667305, "learning_rate": 4.500111736339526e-05, "loss": 0.0808, "step": 1508 }, { "epoch": 1.0582047685834501, "grad_norm": 0.24129627645015717, "learning_rate": 4.499445072009889e-05, "loss": 0.0808, "step": 1509 }, { "epoch": 1.05890603085554, "grad_norm": 0.24726960062980652, "learning_rate": 4.4987780128863935e-05, "loss": 0.0822, "step": 1510 }, { "epoch": 1.0596072931276297, "grad_norm": 0.2406468391418457, "learning_rate": 4.49811055910075e-05, "loss": 0.0862, "step": 1511 }, { "epoch": 1.0603085553997196, "grad_norm": 0.2463967502117157, "learning_rate": 4.49744271078475e-05, "loss": 0.0818, "step": 1512 }, { "epoch": 1.0610098176718092, "grad_norm": 2.547175168991089, "learning_rate": 4.496774468070262e-05, "loss": 0.4142, "step": 1513 }, { "epoch": 1.061711079943899, "grad_norm": 0.24112842977046967, "learning_rate": 4.496105831089229e-05, "loss": 0.081, "step": 1514 }, { "epoch": 1.0624123422159888, "grad_norm": 2.044945240020752, "learning_rate": 4.495436799973676e-05, "loss": 0.358, "step": 1515 }, { "epoch": 1.0631136044880785, "grad_norm": 0.24274036288261414, "learning_rate": 4.494767374855705e-05, "loss": 0.0874, "step": 1516 }, { "epoch": 1.0638148667601683, "grad_norm": 0.2975045442581177, "learning_rate": 4.494097555867493e-05, "loss": 0.257, "step": 1517 }, { "epoch": 1.064516129032258, "grad_norm": 1.3022165298461914, "learning_rate": 4.4934273431412996e-05, "loss": 0.5874, "step": 1518 }, { "epoch": 1.065217391304348, "grad_norm": 0.24567250907421112, "learning_rate": 4.492756736809458e-05, "loss": 0.084, "step": 1519 }, { "epoch": 1.0659186535764376, "grad_norm": 2.4776437282562256, "learning_rate": 4.492085737004381e-05, "loss": 0.3416, "step": 1520 }, { "epoch": 1.0666199158485274, "grad_norm": 0.2465512901544571, "learning_rate": 4.491414343858558e-05, "loss": 0.0902, "step": 1521 }, { "epoch": 1.067321178120617, "grad_norm": 0.2479626089334488, "learning_rate": 4.490742557504557e-05, "loss": 0.0905, "step": 1522 }, { "epoch": 1.0680224403927068, "grad_norm": 0.265165239572525, "learning_rate": 4.490070378075023e-05, "loss": 0.0904, "step": 1523 }, { "epoch": 1.0687237026647967, "grad_norm": 0.3078922927379608, "learning_rate": 4.4893978057026805e-05, "loss": 0.2508, "step": 1524 }, { "epoch": 1.0694249649368863, "grad_norm": 0.7576010227203369, "learning_rate": 4.488724840520329e-05, "loss": 0.4119, "step": 1525 }, { "epoch": 1.0701262272089762, "grad_norm": 0.26810187101364136, "learning_rate": 4.4880514826608465e-05, "loss": 0.0918, "step": 1526 }, { "epoch": 1.0708274894810659, "grad_norm": 0.27178090810775757, "learning_rate": 4.4873777322571886e-05, "loss": 0.0935, "step": 1527 }, { "epoch": 1.0715287517531558, "grad_norm": 0.2713686525821686, "learning_rate": 4.486703589442389e-05, "loss": 0.2538, "step": 1528 }, { "epoch": 1.0722300140252454, "grad_norm": 0.2955628037452698, "learning_rate": 4.486029054349558e-05, "loss": 0.2557, "step": 1529 }, { "epoch": 1.0729312762973353, "grad_norm": 0.27126792073249817, "learning_rate": 4.485354127111884e-05, "loss": 0.2489, "step": 1530 }, { "epoch": 1.073632538569425, "grad_norm": 0.25978219509124756, "learning_rate": 4.484678807862632e-05, "loss": 0.2483, "step": 1531 }, { "epoch": 1.0743338008415146, "grad_norm": 0.2695441246032715, "learning_rate": 4.4840030967351456e-05, "loss": 0.2509, "step": 1532 }, { "epoch": 1.0750350631136045, "grad_norm": 1.2784907817840576, "learning_rate": 4.483326993862844e-05, "loss": 0.3972, "step": 1533 }, { "epoch": 1.0757363253856942, "grad_norm": 0.31042924523353577, "learning_rate": 4.4826504993792274e-05, "loss": 0.0953, "step": 1534 }, { "epoch": 1.076437587657784, "grad_norm": 0.264726847410202, "learning_rate": 4.481973613417868e-05, "loss": 0.2524, "step": 1535 }, { "epoch": 1.0771388499298737, "grad_norm": 0.258071631193161, "learning_rate": 4.481296336112419e-05, "loss": 0.0943, "step": 1536 }, { "epoch": 1.0778401122019636, "grad_norm": 0.2883880138397217, "learning_rate": 4.48061866759661e-05, "loss": 0.0977, "step": 1537 }, { "epoch": 1.0785413744740533, "grad_norm": 0.2596016526222229, "learning_rate": 4.4799406080042486e-05, "loss": 0.0951, "step": 1538 }, { "epoch": 1.079242636746143, "grad_norm": 0.28890836238861084, "learning_rate": 4.4792621574692176e-05, "loss": 0.097, "step": 1539 }, { "epoch": 1.0799438990182328, "grad_norm": 0.27450546622276306, "learning_rate": 4.478583316125479e-05, "loss": 0.252, "step": 1540 }, { "epoch": 1.0806451612903225, "grad_norm": 1.0820015668869019, "learning_rate": 4.4779040841070705e-05, "loss": 0.3668, "step": 1541 }, { "epoch": 1.0813464235624124, "grad_norm": 0.2543790340423584, "learning_rate": 4.4772244615481094e-05, "loss": 0.2443, "step": 1542 }, { "epoch": 1.082047685834502, "grad_norm": 0.2949135899543762, "learning_rate": 4.476544448582786e-05, "loss": 0.0984, "step": 1543 }, { "epoch": 1.082748948106592, "grad_norm": 0.2528541386127472, "learning_rate": 4.475864045345371e-05, "loss": 0.2484, "step": 1544 }, { "epoch": 1.0834502103786816, "grad_norm": 0.26322582364082336, "learning_rate": 4.475183251970212e-05, "loss": 0.2489, "step": 1545 }, { "epoch": 1.0841514726507713, "grad_norm": 0.2620756924152374, "learning_rate": 4.4745020685917314e-05, "loss": 0.2507, "step": 1546 }, { "epoch": 1.0848527349228612, "grad_norm": 0.2548838257789612, "learning_rate": 4.473820495344431e-05, "loss": 0.248, "step": 1547 }, { "epoch": 1.0855539971949508, "grad_norm": 0.24572794139385223, "learning_rate": 4.473138532362888e-05, "loss": 0.2458, "step": 1548 }, { "epoch": 1.0862552594670407, "grad_norm": 0.26769140362739563, "learning_rate": 4.4724561797817584e-05, "loss": 0.2509, "step": 1549 }, { "epoch": 1.0869565217391304, "grad_norm": 0.34975120425224304, "learning_rate": 4.4717734377357725e-05, "loss": 0.1063, "step": 1550 }, { "epoch": 1.0876577840112203, "grad_norm": 0.2456403523683548, "learning_rate": 4.471090306359739e-05, "loss": 0.2455, "step": 1551 }, { "epoch": 1.08835904628331, "grad_norm": 0.24154141545295715, "learning_rate": 4.470406785788543e-05, "loss": 0.2486, "step": 1552 }, { "epoch": 1.0890603085553998, "grad_norm": 0.24251843988895416, "learning_rate": 4.469722876157149e-05, "loss": 0.2512, "step": 1553 }, { "epoch": 1.0897615708274895, "grad_norm": 0.23832961916923523, "learning_rate": 4.469038577600594e-05, "loss": 0.2435, "step": 1554 }, { "epoch": 1.0904628330995791, "grad_norm": 0.281493216753006, "learning_rate": 4.468353890253995e-05, "loss": 0.2571, "step": 1555 }, { "epoch": 1.091164095371669, "grad_norm": 0.28381064534187317, "learning_rate": 4.4676688142525436e-05, "loss": 0.1016, "step": 1556 }, { "epoch": 1.0918653576437587, "grad_norm": 0.2831249237060547, "learning_rate": 4.466983349731511e-05, "loss": 0.101, "step": 1557 }, { "epoch": 1.0925666199158486, "grad_norm": 0.39789021015167236, "learning_rate": 4.466297496826241e-05, "loss": 0.1087, "step": 1558 }, { "epoch": 1.0932678821879382, "grad_norm": 0.32453107833862305, "learning_rate": 4.465611255672157e-05, "loss": 0.1043, "step": 1559 }, { "epoch": 1.0939691444600281, "grad_norm": 0.337541401386261, "learning_rate": 4.464924626404761e-05, "loss": 0.1061, "step": 1560 }, { "epoch": 1.0946704067321178, "grad_norm": 0.28520235419273376, "learning_rate": 4.4642376091596265e-05, "loss": 0.2509, "step": 1561 }, { "epoch": 1.0953716690042077, "grad_norm": 0.23992851376533508, "learning_rate": 4.463550204072407e-05, "loss": 0.2424, "step": 1562 }, { "epoch": 1.0960729312762973, "grad_norm": 0.24844178557395935, "learning_rate": 4.462862411278832e-05, "loss": 0.2437, "step": 1563 }, { "epoch": 1.096774193548387, "grad_norm": 0.2762105464935303, "learning_rate": 4.462174230914707e-05, "loss": 0.099, "step": 1564 }, { "epoch": 1.097475455820477, "grad_norm": 0.2556638717651367, "learning_rate": 4.461485663115915e-05, "loss": 0.2463, "step": 1565 }, { "epoch": 1.0981767180925666, "grad_norm": 0.2755028009414673, "learning_rate": 4.4607967080184144e-05, "loss": 0.0984, "step": 1566 }, { "epoch": 1.0988779803646564, "grad_norm": 0.2761232852935791, "learning_rate": 4.46010736575824e-05, "loss": 0.0986, "step": 1567 }, { "epoch": 1.0995792426367461, "grad_norm": 0.3140960931777954, "learning_rate": 4.4594176364715055e-05, "loss": 0.0975, "step": 1568 }, { "epoch": 1.100280504908836, "grad_norm": 0.27235937118530273, "learning_rate": 4.458727520294397e-05, "loss": 0.0973, "step": 1569 }, { "epoch": 1.1009817671809257, "grad_norm": 0.263944536447525, "learning_rate": 4.45803701736318e-05, "loss": 0.2474, "step": 1570 }, { "epoch": 1.1016830294530153, "grad_norm": 0.29119524359703064, "learning_rate": 4.457346127814196e-05, "loss": 0.0959, "step": 1571 }, { "epoch": 1.1023842917251052, "grad_norm": 0.270054429769516, "learning_rate": 4.456654851783861e-05, "loss": 0.096, "step": 1572 }, { "epoch": 1.1030855539971949, "grad_norm": 1.3696931600570679, "learning_rate": 4.455963189408671e-05, "loss": 0.2718, "step": 1573 }, { "epoch": 1.1037868162692848, "grad_norm": 0.2660577893257141, "learning_rate": 4.4552711408251925e-05, "loss": 0.0955, "step": 1574 }, { "epoch": 1.1044880785413744, "grad_norm": 0.2817106544971466, "learning_rate": 4.454578706170075e-05, "loss": 0.248, "step": 1575 }, { "epoch": 1.1051893408134643, "grad_norm": 0.26571038365364075, "learning_rate": 4.453885885580039e-05, "loss": 0.0953, "step": 1576 }, { "epoch": 1.105890603085554, "grad_norm": 0.3060363531112671, "learning_rate": 4.4531926791918835e-05, "loss": 0.2455, "step": 1577 }, { "epoch": 1.1065918653576436, "grad_norm": 0.28391918540000916, "learning_rate": 4.452499087142483e-05, "loss": 0.2444, "step": 1578 }, { "epoch": 1.1072931276297335, "grad_norm": 0.2673475444316864, "learning_rate": 4.45180510956879e-05, "loss": 0.2469, "step": 1579 }, { "epoch": 1.1079943899018232, "grad_norm": 0.3228946924209595, "learning_rate": 4.4511107466078294e-05, "loss": 0.0949, "step": 1580 }, { "epoch": 1.108695652173913, "grad_norm": 0.31056925654411316, "learning_rate": 4.450415998396705e-05, "loss": 0.0912, "step": 1581 }, { "epoch": 1.1093969144460027, "grad_norm": 0.2765128016471863, "learning_rate": 4.4497208650725974e-05, "loss": 0.2465, "step": 1582 }, { "epoch": 1.1100981767180926, "grad_norm": 0.32272353768348694, "learning_rate": 4.4490253467727604e-05, "loss": 0.2488, "step": 1583 }, { "epoch": 1.1107994389901823, "grad_norm": 0.3075654208660126, "learning_rate": 4.448329443634526e-05, "loss": 0.0915, "step": 1584 }, { "epoch": 1.1115007012622722, "grad_norm": 0.27821481227874756, "learning_rate": 4.447633155795301e-05, "loss": 0.2524, "step": 1585 }, { "epoch": 1.1122019635343618, "grad_norm": 0.278465211391449, "learning_rate": 4.4469364833925695e-05, "loss": 0.251, "step": 1586 }, { "epoch": 1.1129032258064515, "grad_norm": 0.2622699737548828, "learning_rate": 4.4462394265638883e-05, "loss": 0.0932, "step": 1587 }, { "epoch": 1.1136044880785414, "grad_norm": 0.2595105469226837, "learning_rate": 4.445541985446896e-05, "loss": 0.0925, "step": 1588 }, { "epoch": 1.114305750350631, "grad_norm": 0.26019108295440674, "learning_rate": 4.4448441601793014e-05, "loss": 0.093, "step": 1589 }, { "epoch": 1.115007012622721, "grad_norm": 0.2614012658596039, "learning_rate": 4.44414595089889e-05, "loss": 0.0926, "step": 1590 }, { "epoch": 1.1157082748948106, "grad_norm": 0.2588747441768646, "learning_rate": 4.443447357743528e-05, "loss": 0.0917, "step": 1591 }, { "epoch": 1.1164095371669005, "grad_norm": 0.2755652070045471, "learning_rate": 4.44274838085115e-05, "loss": 0.089, "step": 1592 }, { "epoch": 1.1171107994389902, "grad_norm": 0.2675817310810089, "learning_rate": 4.442049020359773e-05, "loss": 0.0874, "step": 1593 }, { "epoch": 1.11781206171108, "grad_norm": 0.25709888339042664, "learning_rate": 4.441349276407486e-05, "loss": 0.0908, "step": 1594 }, { "epoch": 1.1185133239831697, "grad_norm": 0.2565917372703552, "learning_rate": 4.440649149132454e-05, "loss": 0.0849, "step": 1595 }, { "epoch": 1.1192145862552594, "grad_norm": 0.2637706398963928, "learning_rate": 4.43994863867292e-05, "loss": 0.0858, "step": 1596 }, { "epoch": 1.1199158485273493, "grad_norm": 0.8290214538574219, "learning_rate": 4.4392477451671994e-05, "loss": 0.4257, "step": 1597 }, { "epoch": 1.120617110799439, "grad_norm": 2.7374427318573, "learning_rate": 4.4385464687536845e-05, "loss": 0.5693, "step": 1598 }, { "epoch": 1.1213183730715288, "grad_norm": 0.2539411783218384, "learning_rate": 4.4378448095708455e-05, "loss": 0.0889, "step": 1599 }, { "epoch": 1.1220196353436185, "grad_norm": 2.690411329269409, "learning_rate": 4.437142767757225e-05, "loss": 0.7046, "step": 1600 }, { "epoch": 1.1227208976157084, "grad_norm": 0.29606756567955017, "learning_rate": 4.436440343451442e-05, "loss": 0.2531, "step": 1601 }, { "epoch": 1.123422159887798, "grad_norm": 0.794806718826294, "learning_rate": 4.4357375367921915e-05, "loss": 0.4157, "step": 1602 }, { "epoch": 1.1241234221598877, "grad_norm": 0.8014317750930786, "learning_rate": 4.435034347918245e-05, "loss": 0.4177, "step": 1603 }, { "epoch": 1.1248246844319776, "grad_norm": 0.28170913457870483, "learning_rate": 4.434330776968447e-05, "loss": 0.0848, "step": 1604 }, { "epoch": 1.1255259467040672, "grad_norm": 0.7935320734977722, "learning_rate": 4.433626824081719e-05, "loss": 0.4104, "step": 1605 }, { "epoch": 1.1262272089761571, "grad_norm": 0.2828822135925293, "learning_rate": 4.432922489397059e-05, "loss": 0.2474, "step": 1606 }, { "epoch": 1.1269284712482468, "grad_norm": 0.2927398681640625, "learning_rate": 4.4322177730535374e-05, "loss": 0.0888, "step": 1607 }, { "epoch": 1.1276297335203367, "grad_norm": 0.2605818510055542, "learning_rate": 4.431512675190303e-05, "loss": 0.0936, "step": 1608 }, { "epoch": 1.1283309957924264, "grad_norm": 0.26187512278556824, "learning_rate": 4.4308071959465774e-05, "loss": 0.0945, "step": 1609 }, { "epoch": 1.129032258064516, "grad_norm": 0.2790880501270294, "learning_rate": 4.4301013354616595e-05, "loss": 0.093, "step": 1610 }, { "epoch": 1.129733520336606, "grad_norm": 0.26847758889198303, "learning_rate": 4.429395093874923e-05, "loss": 0.2484, "step": 1611 }, { "epoch": 1.1304347826086956, "grad_norm": 0.26900821924209595, "learning_rate": 4.428688471325815e-05, "loss": 0.2482, "step": 1612 }, { "epoch": 1.1311360448807855, "grad_norm": 0.2662496268749237, "learning_rate": 4.427981467953861e-05, "loss": 0.2472, "step": 1613 }, { "epoch": 1.1318373071528751, "grad_norm": 0.26535531878471375, "learning_rate": 4.4272740838986585e-05, "loss": 0.0949, "step": 1614 }, { "epoch": 1.132538569424965, "grad_norm": 1.5433094501495361, "learning_rate": 4.426566319299883e-05, "loss": 0.2931, "step": 1615 }, { "epoch": 1.1332398316970547, "grad_norm": 0.27371445298194885, "learning_rate": 4.425858174297283e-05, "loss": 0.2464, "step": 1616 }, { "epoch": 1.1339410939691446, "grad_norm": 0.28649306297302246, "learning_rate": 4.4251496490306835e-05, "loss": 0.0965, "step": 1617 }, { "epoch": 1.1346423562412342, "grad_norm": 0.7433398365974426, "learning_rate": 4.4244407436399837e-05, "loss": 0.3995, "step": 1618 }, { "epoch": 1.1353436185133239, "grad_norm": 0.2875600755214691, "learning_rate": 4.4237314582651585e-05, "loss": 0.0953, "step": 1619 }, { "epoch": 1.1360448807854138, "grad_norm": 0.24635711312294006, "learning_rate": 4.423021793046257e-05, "loss": 0.2455, "step": 1620 }, { "epoch": 1.1367461430575034, "grad_norm": 0.7345531582832336, "learning_rate": 4.4223117481234044e-05, "loss": 0.3988, "step": 1621 }, { "epoch": 1.1374474053295933, "grad_norm": 0.25590527057647705, "learning_rate": 4.4216013236368e-05, "loss": 0.2473, "step": 1622 }, { "epoch": 1.138148667601683, "grad_norm": 0.2743765115737915, "learning_rate": 4.420890519726718e-05, "loss": 0.0983, "step": 1623 }, { "epoch": 1.1388499298737729, "grad_norm": 0.25098279118537903, "learning_rate": 4.420179336533509e-05, "loss": 0.2427, "step": 1624 }, { "epoch": 1.1395511921458625, "grad_norm": 0.24860350787639618, "learning_rate": 4.4194677741975955e-05, "loss": 0.2433, "step": 1625 }, { "epoch": 1.1402524544179524, "grad_norm": 0.24288491904735565, "learning_rate": 4.418755832859478e-05, "loss": 0.2478, "step": 1626 }, { "epoch": 1.140953716690042, "grad_norm": 0.3012188673019409, "learning_rate": 4.418043512659731e-05, "loss": 0.0998, "step": 1627 }, { "epoch": 1.1416549789621318, "grad_norm": 0.34581485390663147, "learning_rate": 4.417330813739001e-05, "loss": 0.1031, "step": 1628 }, { "epoch": 1.1423562412342216, "grad_norm": 0.2802393436431885, "learning_rate": 4.4166177362380144e-05, "loss": 0.1004, "step": 1629 }, { "epoch": 1.1430575035063113, "grad_norm": 0.2804116904735565, "learning_rate": 4.4159042802975684e-05, "loss": 0.1004, "step": 1630 }, { "epoch": 1.1437587657784012, "grad_norm": 0.28247618675231934, "learning_rate": 4.415190446058536e-05, "loss": 0.101, "step": 1631 }, { "epoch": 1.1444600280504909, "grad_norm": 0.30268722772598267, "learning_rate": 4.414476233661865e-05, "loss": 0.1007, "step": 1632 }, { "epoch": 1.1451612903225807, "grad_norm": 0.35813426971435547, "learning_rate": 4.4137616432485786e-05, "loss": 0.1032, "step": 1633 }, { "epoch": 1.1458625525946704, "grad_norm": 0.3326414227485657, "learning_rate": 4.413046674959772e-05, "loss": 0.1001, "step": 1634 }, { "epoch": 1.1465638148667603, "grad_norm": 0.28055688738822937, "learning_rate": 4.4123313289366194e-05, "loss": 0.1001, "step": 1635 }, { "epoch": 1.14726507713885, "grad_norm": 0.270048588514328, "learning_rate": 4.411615605320365e-05, "loss": 0.2446, "step": 1636 }, { "epoch": 1.1479663394109396, "grad_norm": 0.27725139260292053, "learning_rate": 4.410899504252332e-05, "loss": 0.0987, "step": 1637 }, { "epoch": 1.1486676016830295, "grad_norm": 1.1510576009750366, "learning_rate": 4.410183025873913e-05, "loss": 0.2658, "step": 1638 }, { "epoch": 1.1493688639551192, "grad_norm": 0.28646260499954224, "learning_rate": 4.409466170326579e-05, "loss": 0.251, "step": 1639 }, { "epoch": 1.150070126227209, "grad_norm": 0.26908111572265625, "learning_rate": 4.4087489377518754e-05, "loss": 0.0971, "step": 1640 }, { "epoch": 1.1507713884992987, "grad_norm": 0.2879350781440735, "learning_rate": 4.4080313282914196e-05, "loss": 0.0966, "step": 1641 }, { "epoch": 1.1514726507713884, "grad_norm": 0.26007214188575745, "learning_rate": 4.4073133420869055e-05, "loss": 0.2513, "step": 1642 }, { "epoch": 1.1521739130434783, "grad_norm": 1.040879726409912, "learning_rate": 4.406594979280101e-05, "loss": 0.2544, "step": 1643 }, { "epoch": 1.152875175315568, "grad_norm": 0.29438871145248413, "learning_rate": 4.405876240012847e-05, "loss": 0.0975, "step": 1644 }, { "epoch": 1.1535764375876578, "grad_norm": 0.29054293036460876, "learning_rate": 4.405157124427061e-05, "loss": 0.0981, "step": 1645 }, { "epoch": 1.1542776998597475, "grad_norm": 0.2859719693660736, "learning_rate": 4.404437632664733e-05, "loss": 0.0959, "step": 1646 }, { "epoch": 1.1549789621318374, "grad_norm": 0.2831045985221863, "learning_rate": 4.4037177648679274e-05, "loss": 0.0965, "step": 1647 }, { "epoch": 1.155680224403927, "grad_norm": 0.28792262077331543, "learning_rate": 4.4029975211787844e-05, "loss": 0.0968, "step": 1648 }, { "epoch": 1.156381486676017, "grad_norm": 0.2644612789154053, "learning_rate": 4.402276901739517e-05, "loss": 0.0959, "step": 1649 }, { "epoch": 1.1570827489481066, "grad_norm": 0.26387739181518555, "learning_rate": 4.401555906692413e-05, "loss": 0.0962, "step": 1650 }, { "epoch": 1.1577840112201963, "grad_norm": 0.3108357787132263, "learning_rate": 4.400834536179832e-05, "loss": 0.0966, "step": 1651 }, { "epoch": 1.1584852734922861, "grad_norm": 0.25980550050735474, "learning_rate": 4.400112790344213e-05, "loss": 0.2491, "step": 1652 }, { "epoch": 1.1591865357643758, "grad_norm": 0.25991588830947876, "learning_rate": 4.399390669328064e-05, "loss": 0.245, "step": 1653 }, { "epoch": 1.1598877980364657, "grad_norm": 0.25834113359451294, "learning_rate": 4.39866817327397e-05, "loss": 0.0945, "step": 1654 }, { "epoch": 1.1605890603085554, "grad_norm": 0.2647199034690857, "learning_rate": 4.3979453023245884e-05, "loss": 0.2504, "step": 1655 }, { "epoch": 1.1612903225806452, "grad_norm": 0.32795625925064087, "learning_rate": 4.397222056622651e-05, "loss": 0.0945, "step": 1656 }, { "epoch": 1.161991584852735, "grad_norm": 0.27843576669692993, "learning_rate": 4.396498436310965e-05, "loss": 0.095, "step": 1657 }, { "epoch": 1.1626928471248248, "grad_norm": 0.25918513536453247, "learning_rate": 4.3957744415324094e-05, "loss": 0.0939, "step": 1658 }, { "epoch": 1.1633941093969145, "grad_norm": 0.2650809586048126, "learning_rate": 4.395050072429939e-05, "loss": 0.2487, "step": 1659 }, { "epoch": 1.1640953716690041, "grad_norm": 0.2530025541782379, "learning_rate": 4.3943253291465814e-05, "loss": 0.0918, "step": 1660 }, { "epoch": 1.164796633941094, "grad_norm": 0.26504117250442505, "learning_rate": 4.3936002118254385e-05, "loss": 0.0913, "step": 1661 }, { "epoch": 1.1654978962131837, "grad_norm": 0.26788651943206787, "learning_rate": 4.392874720609685e-05, "loss": 0.0921, "step": 1662 }, { "epoch": 1.1661991584852736, "grad_norm": 0.25101807713508606, "learning_rate": 4.3921488556425725e-05, "loss": 0.0913, "step": 1663 }, { "epoch": 1.1669004207573632, "grad_norm": 1.2680083513259888, "learning_rate": 4.391422617067423e-05, "loss": 0.5706, "step": 1664 }, { "epoch": 1.167601683029453, "grad_norm": 0.783208966255188, "learning_rate": 4.390696005027633e-05, "loss": 0.4182, "step": 1665 }, { "epoch": 1.1683029453015428, "grad_norm": 0.2744220495223999, "learning_rate": 4.389969019666674e-05, "loss": 0.0868, "step": 1666 }, { "epoch": 1.1690042075736327, "grad_norm": 0.2594437003135681, "learning_rate": 4.389241661128091e-05, "loss": 0.0889, "step": 1667 }, { "epoch": 1.1697054698457223, "grad_norm": 0.28359851241111755, "learning_rate": 4.3885139295555014e-05, "loss": 0.2562, "step": 1668 }, { "epoch": 1.170406732117812, "grad_norm": 0.25767427682876587, "learning_rate": 4.387785825092596e-05, "loss": 0.0892, "step": 1669 }, { "epoch": 1.1711079943899019, "grad_norm": 0.2846045196056366, "learning_rate": 4.387057347883143e-05, "loss": 0.2537, "step": 1670 }, { "epoch": 1.1718092566619915, "grad_norm": 0.24978309869766235, "learning_rate": 4.38632849807098e-05, "loss": 0.0906, "step": 1671 }, { "epoch": 1.1725105189340814, "grad_norm": 1.1881375312805176, "learning_rate": 4.385599275800019e-05, "loss": 0.2862, "step": 1672 }, { "epoch": 1.173211781206171, "grad_norm": 0.2804025113582611, "learning_rate": 4.384869681214246e-05, "loss": 0.2519, "step": 1673 }, { "epoch": 1.1739130434782608, "grad_norm": 0.25132954120635986, "learning_rate": 4.384139714457723e-05, "loss": 0.0913, "step": 1674 }, { "epoch": 1.1746143057503506, "grad_norm": 0.250708669424057, "learning_rate": 4.383409375674581e-05, "loss": 0.0913, "step": 1675 }, { "epoch": 1.1753155680224403, "grad_norm": 0.26255956292152405, "learning_rate": 4.382678665009028e-05, "loss": 0.09, "step": 1676 }, { "epoch": 1.1760168302945302, "grad_norm": 0.2754112780094147, "learning_rate": 4.3819475826053426e-05, "loss": 0.2499, "step": 1677 }, { "epoch": 1.1767180925666199, "grad_norm": 0.26163342595100403, "learning_rate": 4.38121612860788e-05, "loss": 0.0896, "step": 1678 }, { "epoch": 1.1774193548387097, "grad_norm": 0.30236268043518066, "learning_rate": 4.3804843031610655e-05, "loss": 0.2488, "step": 1679 }, { "epoch": 1.1781206171107994, "grad_norm": 0.2639826834201813, "learning_rate": 4.3797521064094e-05, "loss": 0.0906, "step": 1680 }, { "epoch": 1.1788218793828893, "grad_norm": 0.2752377688884735, "learning_rate": 4.379019538497457e-05, "loss": 0.2516, "step": 1681 }, { "epoch": 1.179523141654979, "grad_norm": 0.2625703513622284, "learning_rate": 4.378286599569883e-05, "loss": 0.0906, "step": 1682 }, { "epoch": 1.1802244039270686, "grad_norm": 0.26494288444519043, "learning_rate": 4.377553289771399e-05, "loss": 0.0909, "step": 1683 }, { "epoch": 1.1809256661991585, "grad_norm": 0.2620839774608612, "learning_rate": 4.376819609246797e-05, "loss": 0.09, "step": 1684 }, { "epoch": 1.1816269284712482, "grad_norm": 0.25137925148010254, "learning_rate": 4.3760855581409446e-05, "loss": 0.0916, "step": 1685 }, { "epoch": 1.182328190743338, "grad_norm": 0.2920821011066437, "learning_rate": 4.37535113659878e-05, "loss": 0.2563, "step": 1686 }, { "epoch": 1.1830294530154277, "grad_norm": 1.2823435068130493, "learning_rate": 4.3746163447653176e-05, "loss": 0.5802, "step": 1687 }, { "epoch": 1.1837307152875176, "grad_norm": 0.27940279245376587, "learning_rate": 4.373881182785643e-05, "loss": 0.252, "step": 1688 }, { "epoch": 1.1844319775596073, "grad_norm": 0.2789333462715149, "learning_rate": 4.373145650804914e-05, "loss": 0.2494, "step": 1689 }, { "epoch": 1.1851332398316972, "grad_norm": 1.1428829431533813, "learning_rate": 4.3724097489683634e-05, "loss": 0.2689, "step": 1690 }, { "epoch": 1.1858345021037868, "grad_norm": 0.25178763270378113, "learning_rate": 4.371673477421296e-05, "loss": 0.0918, "step": 1691 }, { "epoch": 1.1865357643758765, "grad_norm": 0.2600536048412323, "learning_rate": 4.3709368363090916e-05, "loss": 0.0897, "step": 1692 }, { "epoch": 1.1872370266479664, "grad_norm": 0.7622694969177246, "learning_rate": 4.370199825777199e-05, "loss": 0.4091, "step": 1693 }, { "epoch": 1.187938288920056, "grad_norm": 0.2534977197647095, "learning_rate": 4.369462445971143e-05, "loss": 0.0924, "step": 1694 }, { "epoch": 1.188639551192146, "grad_norm": 0.2712070643901825, "learning_rate": 4.36872469703652e-05, "loss": 0.2511, "step": 1695 }, { "epoch": 1.1893408134642356, "grad_norm": 0.25794029235839844, "learning_rate": 4.3679865791190016e-05, "loss": 0.0936, "step": 1696 }, { "epoch": 1.1900420757363255, "grad_norm": 0.27509772777557373, "learning_rate": 4.367248092364329e-05, "loss": 0.0942, "step": 1697 }, { "epoch": 1.1907433380084151, "grad_norm": 0.2628704905509949, "learning_rate": 4.366509236918317e-05, "loss": 0.2494, "step": 1698 }, { "epoch": 1.191444600280505, "grad_norm": 0.2757580280303955, "learning_rate": 4.365770012926855e-05, "loss": 0.2506, "step": 1699 }, { "epoch": 1.1921458625525947, "grad_norm": 0.2582368552684784, "learning_rate": 4.365030420535904e-05, "loss": 0.0943, "step": 1700 }, { "epoch": 1.1928471248246844, "grad_norm": 0.2571169137954712, "learning_rate": 4.364290459891498e-05, "loss": 0.0938, "step": 1701 }, { "epoch": 1.1935483870967742, "grad_norm": 0.2583800256252289, "learning_rate": 4.363550131139743e-05, "loss": 0.0941, "step": 1702 }, { "epoch": 1.194249649368864, "grad_norm": 0.25680840015411377, "learning_rate": 4.362809434426818e-05, "loss": 0.2503, "step": 1703 }, { "epoch": 1.1949509116409538, "grad_norm": 0.2587563693523407, "learning_rate": 4.362068369898975e-05, "loss": 0.0941, "step": 1704 }, { "epoch": 1.1956521739130435, "grad_norm": 0.2695988416671753, "learning_rate": 4.3613269377025376e-05, "loss": 0.2492, "step": 1705 }, { "epoch": 1.1963534361851331, "grad_norm": 0.2795807719230652, "learning_rate": 4.360585137983906e-05, "loss": 0.2565, "step": 1706 }, { "epoch": 1.197054698457223, "grad_norm": 0.257971853017807, "learning_rate": 4.359842970889546e-05, "loss": 0.0943, "step": 1707 }, { "epoch": 1.1977559607293127, "grad_norm": 0.26234811544418335, "learning_rate": 4.359100436566002e-05, "loss": 0.2506, "step": 1708 }, { "epoch": 1.1984572230014026, "grad_norm": 0.2625715732574463, "learning_rate": 4.358357535159888e-05, "loss": 0.2493, "step": 1709 }, { "epoch": 1.1991584852734922, "grad_norm": 0.2597384452819824, "learning_rate": 4.357614266817891e-05, "loss": 0.0945, "step": 1710 }, { "epoch": 1.1998597475455821, "grad_norm": 0.2603517770767212, "learning_rate": 4.356870631686771e-05, "loss": 0.2501, "step": 1711 }, { "epoch": 1.2005610098176718, "grad_norm": 0.27304375171661377, "learning_rate": 4.3561266299133595e-05, "loss": 0.2517, "step": 1712 }, { "epoch": 1.2012622720897617, "grad_norm": 0.3100862503051758, "learning_rate": 4.3553822616445625e-05, "loss": 0.0964, "step": 1713 }, { "epoch": 1.2019635343618513, "grad_norm": 0.2599480152130127, "learning_rate": 4.354637527027355e-05, "loss": 0.0948, "step": 1714 }, { "epoch": 1.202664796633941, "grad_norm": 0.2830665111541748, "learning_rate": 4.353892426208787e-05, "loss": 0.0956, "step": 1715 }, { "epoch": 1.2033660589060309, "grad_norm": 0.25782883167266846, "learning_rate": 4.353146959335978e-05, "loss": 0.094, "step": 1716 }, { "epoch": 1.2040673211781205, "grad_norm": 0.2575712502002716, "learning_rate": 4.352401126556125e-05, "loss": 0.0941, "step": 1717 }, { "epoch": 1.2047685834502104, "grad_norm": 0.33132219314575195, "learning_rate": 4.351654928016492e-05, "loss": 0.0934, "step": 1718 }, { "epoch": 1.2054698457223, "grad_norm": 0.2743256092071533, "learning_rate": 4.3509083638644174e-05, "loss": 0.0938, "step": 1719 }, { "epoch": 1.20617110799439, "grad_norm": 0.2776623070240021, "learning_rate": 4.3501614342473115e-05, "loss": 0.0918, "step": 1720 }, { "epoch": 1.2068723702664796, "grad_norm": 0.2760424315929413, "learning_rate": 4.349414139312658e-05, "loss": 0.251, "step": 1721 }, { "epoch": 1.2075736325385695, "grad_norm": 0.2831612825393677, "learning_rate": 4.348666479208009e-05, "loss": 0.2523, "step": 1722 }, { "epoch": 1.2082748948106592, "grad_norm": 0.2644678056240082, "learning_rate": 4.347918454080994e-05, "loss": 0.0908, "step": 1723 }, { "epoch": 1.2089761570827489, "grad_norm": 0.25193578004837036, "learning_rate": 4.347170064079311e-05, "loss": 0.0922, "step": 1724 }, { "epoch": 1.2096774193548387, "grad_norm": 0.28283700346946716, "learning_rate": 4.34642130935073e-05, "loss": 0.2507, "step": 1725 }, { "epoch": 1.2103786816269284, "grad_norm": 1.7996563911437988, "learning_rate": 4.345672190043094e-05, "loss": 0.4462, "step": 1726 }, { "epoch": 1.2110799438990183, "grad_norm": 0.2521771788597107, "learning_rate": 4.3449227063043196e-05, "loss": 0.092, "step": 1727 }, { "epoch": 1.211781206171108, "grad_norm": 0.2811664342880249, "learning_rate": 4.3441728582823914e-05, "loss": 0.2541, "step": 1728 }, { "epoch": 1.2124824684431978, "grad_norm": 0.25794678926467896, "learning_rate": 4.34342264612537e-05, "loss": 0.0891, "step": 1729 }, { "epoch": 1.2131837307152875, "grad_norm": 0.30555540323257446, "learning_rate": 4.342672069981385e-05, "loss": 0.0881, "step": 1730 }, { "epoch": 1.2138849929873774, "grad_norm": 0.27281343936920166, "learning_rate": 4.3419211299986384e-05, "loss": 0.2514, "step": 1731 }, { "epoch": 1.214586255259467, "grad_norm": 0.2496904730796814, "learning_rate": 4.341169826325406e-05, "loss": 0.0913, "step": 1732 }, { "epoch": 1.2152875175315567, "grad_norm": 0.28752103447914124, "learning_rate": 4.3404181591100345e-05, "loss": 0.2545, "step": 1733 }, { "epoch": 1.2159887798036466, "grad_norm": 0.28477439284324646, "learning_rate": 4.33966612850094e-05, "loss": 0.2527, "step": 1734 }, { "epoch": 1.2166900420757363, "grad_norm": 0.24891877174377441, "learning_rate": 4.3389137346466124e-05, "loss": 0.0912, "step": 1735 }, { "epoch": 1.2173913043478262, "grad_norm": 0.2618456184864044, "learning_rate": 4.3381609776956146e-05, "loss": 0.0902, "step": 1736 }, { "epoch": 1.2180925666199158, "grad_norm": 0.2602887451648712, "learning_rate": 4.3374078577965784e-05, "loss": 0.0894, "step": 1737 }, { "epoch": 1.2187938288920055, "grad_norm": 0.27974236011505127, "learning_rate": 4.3366543750982104e-05, "loss": 0.2537, "step": 1738 }, { "epoch": 1.2194950911640954, "grad_norm": 1.359866976737976, "learning_rate": 4.335900529749285e-05, "loss": 0.4345, "step": 1739 }, { "epoch": 1.220196353436185, "grad_norm": 0.2601240873336792, "learning_rate": 4.335146321898651e-05, "loss": 0.0898, "step": 1740 }, { "epoch": 1.220897615708275, "grad_norm": 0.2781059145927429, "learning_rate": 4.3343917516952275e-05, "loss": 0.2538, "step": 1741 }, { "epoch": 1.2215988779803646, "grad_norm": 0.27687254548072815, "learning_rate": 4.3336368192880074e-05, "loss": 0.2546, "step": 1742 }, { "epoch": 1.2223001402524545, "grad_norm": 0.7590541839599609, "learning_rate": 4.332881524826051e-05, "loss": 0.4105, "step": 1743 }, { "epoch": 1.2230014025245441, "grad_norm": 0.28391122817993164, "learning_rate": 4.332125868458495e-05, "loss": 0.0906, "step": 1744 }, { "epoch": 1.223702664796634, "grad_norm": 0.28031736612319946, "learning_rate": 4.331369850334543e-05, "loss": 0.2547, "step": 1745 }, { "epoch": 1.2244039270687237, "grad_norm": 0.2735079526901245, "learning_rate": 4.330613470603474e-05, "loss": 0.0914, "step": 1746 }, { "epoch": 1.2251051893408134, "grad_norm": 0.7535891532897949, "learning_rate": 4.329856729414634e-05, "loss": 0.4083, "step": 1747 }, { "epoch": 1.2258064516129032, "grad_norm": 0.2803468704223633, "learning_rate": 4.329099626917446e-05, "loss": 0.2525, "step": 1748 }, { "epoch": 1.226507713884993, "grad_norm": 0.2656641900539398, "learning_rate": 4.3283421632613974e-05, "loss": 0.2485, "step": 1749 }, { "epoch": 1.2272089761570828, "grad_norm": 0.27109116315841675, "learning_rate": 4.327584338596054e-05, "loss": 0.0933, "step": 1750 }, { "epoch": 1.2279102384291725, "grad_norm": 0.2569841742515564, "learning_rate": 4.326826153071048e-05, "loss": 0.0937, "step": 1751 }, { "epoch": 1.2286115007012623, "grad_norm": 0.26185253262519836, "learning_rate": 4.326067606836085e-05, "loss": 0.2478, "step": 1752 }, { "epoch": 1.229312762973352, "grad_norm": 0.2579227387905121, "learning_rate": 4.32530870004094e-05, "loss": 0.0943, "step": 1753 }, { "epoch": 1.230014025245442, "grad_norm": 0.26070600748062134, "learning_rate": 4.324549432835463e-05, "loss": 0.0949, "step": 1754 }, { "epoch": 1.2307152875175316, "grad_norm": 0.27734458446502686, "learning_rate": 4.323789805369569e-05, "loss": 0.2504, "step": 1755 }, { "epoch": 1.2314165497896212, "grad_norm": 0.2635039985179901, "learning_rate": 4.323029817793252e-05, "loss": 0.2491, "step": 1756 }, { "epoch": 1.2321178120617111, "grad_norm": 0.25918978452682495, "learning_rate": 4.322269470256568e-05, "loss": 0.2493, "step": 1757 }, { "epoch": 1.2328190743338008, "grad_norm": 0.25580987334251404, "learning_rate": 4.321508762909654e-05, "loss": 0.2477, "step": 1758 }, { "epoch": 1.2335203366058907, "grad_norm": 0.2761329710483551, "learning_rate": 4.320747695902709e-05, "loss": 0.0939, "step": 1759 }, { "epoch": 1.2342215988779803, "grad_norm": 0.7372150421142578, "learning_rate": 4.319986269386009e-05, "loss": 0.4023, "step": 1760 }, { "epoch": 1.2349228611500702, "grad_norm": 0.277351051568985, "learning_rate": 4.3192244835098995e-05, "loss": 0.0939, "step": 1761 }, { "epoch": 1.2356241234221599, "grad_norm": 0.2615059018135071, "learning_rate": 4.318462338424794e-05, "loss": 0.0951, "step": 1762 }, { "epoch": 1.2363253856942498, "grad_norm": 0.26461148262023926, "learning_rate": 4.3176998342811816e-05, "loss": 0.0957, "step": 1763 }, { "epoch": 1.2370266479663394, "grad_norm": 0.26211634278297424, "learning_rate": 4.3169369712296184e-05, "loss": 0.0953, "step": 1764 }, { "epoch": 1.237727910238429, "grad_norm": 0.2869391441345215, "learning_rate": 4.316173749420734e-05, "loss": 0.2516, "step": 1765 }, { "epoch": 1.238429172510519, "grad_norm": 0.2812752425670624, "learning_rate": 4.315410169005229e-05, "loss": 0.0947, "step": 1766 }, { "epoch": 1.2391304347826086, "grad_norm": 0.2680431008338928, "learning_rate": 4.314646230133871e-05, "loss": 0.2514, "step": 1767 }, { "epoch": 1.2398316970546985, "grad_norm": 0.32760077714920044, "learning_rate": 4.3138819329575024e-05, "loss": 0.091, "step": 1768 }, { "epoch": 1.2405329593267882, "grad_norm": 0.26964035630226135, "learning_rate": 4.313117277627035e-05, "loss": 0.246, "step": 1769 }, { "epoch": 1.2412342215988779, "grad_norm": 0.3023952543735504, "learning_rate": 4.312352264293452e-05, "loss": 0.2541, "step": 1770 }, { "epoch": 1.2419354838709677, "grad_norm": 0.2598937749862671, "learning_rate": 4.3115868931078054e-05, "loss": 0.0945, "step": 1771 }, { "epoch": 1.2426367461430574, "grad_norm": 0.2992234528064728, "learning_rate": 4.310821164221219e-05, "loss": 0.0913, "step": 1772 }, { "epoch": 1.2433380084151473, "grad_norm": 0.27370408177375793, "learning_rate": 4.310055077784889e-05, "loss": 0.253, "step": 1773 }, { "epoch": 1.244039270687237, "grad_norm": 1.6938620805740356, "learning_rate": 4.309288633950079e-05, "loss": 0.301, "step": 1774 }, { "epoch": 1.2447405329593269, "grad_norm": 0.25900354981422424, "learning_rate": 4.308521832868124e-05, "loss": 0.0937, "step": 1775 }, { "epoch": 1.2454417952314165, "grad_norm": 0.2803562581539154, "learning_rate": 4.307754674690432e-05, "loss": 0.0939, "step": 1776 }, { "epoch": 1.2461430575035064, "grad_norm": 0.260047048330307, "learning_rate": 4.306987159568479e-05, "loss": 0.0942, "step": 1777 }, { "epoch": 1.246844319775596, "grad_norm": 0.2738437056541443, "learning_rate": 4.3062192876538114e-05, "loss": 0.093, "step": 1778 }, { "epoch": 1.2475455820476857, "grad_norm": 0.7509914040565491, "learning_rate": 4.305451059098048e-05, "loss": 0.4038, "step": 1779 }, { "epoch": 1.2482468443197756, "grad_norm": 0.25914332270622253, "learning_rate": 4.304682474052876e-05, "loss": 0.0937, "step": 1780 }, { "epoch": 1.2489481065918653, "grad_norm": 0.2751765251159668, "learning_rate": 4.303913532670054e-05, "loss": 0.0926, "step": 1781 }, { "epoch": 1.2496493688639552, "grad_norm": 0.2661525309085846, "learning_rate": 4.303144235101412e-05, "loss": 0.2498, "step": 1782 }, { "epoch": 1.2503506311360448, "grad_norm": 1.4240188598632812, "learning_rate": 4.302374581498847e-05, "loss": 0.4287, "step": 1783 }, { "epoch": 1.2510518934081347, "grad_norm": 0.3100104033946991, "learning_rate": 4.301604572014329e-05, "loss": 0.0917, "step": 1784 }, { "epoch": 1.2517531556802244, "grad_norm": 0.25836291909217834, "learning_rate": 4.3008342067998986e-05, "loss": 0.0938, "step": 1785 }, { "epoch": 1.2524544179523143, "grad_norm": 0.27620819211006165, "learning_rate": 4.3000634860076654e-05, "loss": 0.0932, "step": 1786 }, { "epoch": 1.253155680224404, "grad_norm": 0.2615886926651001, "learning_rate": 4.2992924097898095e-05, "loss": 0.2481, "step": 1787 }, { "epoch": 1.2538569424964936, "grad_norm": 0.2964332103729248, "learning_rate": 4.298520978298581e-05, "loss": 0.0918, "step": 1788 }, { "epoch": 1.2545582047685835, "grad_norm": 0.2644781768321991, "learning_rate": 4.297749191686301e-05, "loss": 0.2493, "step": 1789 }, { "epoch": 1.2552594670406731, "grad_norm": 0.272525817155838, "learning_rate": 4.2969770501053586e-05, "loss": 0.2483, "step": 1790 }, { "epoch": 1.255960729312763, "grad_norm": 0.2608782947063446, "learning_rate": 4.296204553708216e-05, "loss": 0.2475, "step": 1791 }, { "epoch": 1.2566619915848527, "grad_norm": 0.259375661611557, "learning_rate": 4.295431702647404e-05, "loss": 0.0938, "step": 1792 }, { "epoch": 1.2573632538569424, "grad_norm": 0.2757596969604492, "learning_rate": 4.2946584970755224e-05, "loss": 0.2532, "step": 1793 }, { "epoch": 1.2580645161290323, "grad_norm": 0.2597247064113617, "learning_rate": 4.2938849371452425e-05, "loss": 0.0942, "step": 1794 }, { "epoch": 1.2587657784011221, "grad_norm": 1.298866868019104, "learning_rate": 4.2931110230093045e-05, "loss": 0.3927, "step": 1795 }, { "epoch": 1.2594670406732118, "grad_norm": 0.2841314673423767, "learning_rate": 4.29233675482052e-05, "loss": 0.0958, "step": 1796 }, { "epoch": 1.2601683029453015, "grad_norm": 0.28046104311943054, "learning_rate": 4.2915621327317685e-05, "loss": 0.095, "step": 1797 }, { "epoch": 1.2608695652173914, "grad_norm": 1.0790376663208008, "learning_rate": 4.290787156896001e-05, "loss": 0.241, "step": 1798 }, { "epoch": 1.261570827489481, "grad_norm": 0.2564866542816162, "learning_rate": 4.290011827466238e-05, "loss": 0.2466, "step": 1799 }, { "epoch": 1.262272089761571, "grad_norm": 0.2967292368412018, "learning_rate": 4.28923614459557e-05, "loss": 0.0983, "step": 1800 }, { "epoch": 1.2629733520336606, "grad_norm": 0.3512256443500519, "learning_rate": 4.2884601084371565e-05, "loss": 0.103, "step": 1801 }, { "epoch": 1.2636746143057502, "grad_norm": 0.2674315869808197, "learning_rate": 4.287683719144226e-05, "loss": 0.0967, "step": 1802 }, { "epoch": 1.2643758765778401, "grad_norm": 0.358871728181839, "learning_rate": 4.28690697687008e-05, "loss": 0.1024, "step": 1803 }, { "epoch": 1.26507713884993, "grad_norm": 0.2700803875923157, "learning_rate": 4.2861298817680854e-05, "loss": 0.0978, "step": 1804 }, { "epoch": 1.2657784011220197, "grad_norm": 0.2959842085838318, "learning_rate": 4.285352433991683e-05, "loss": 0.0996, "step": 1805 }, { "epoch": 1.2664796633941093, "grad_norm": 0.26162847876548767, "learning_rate": 4.284574633694379e-05, "loss": 0.2531, "step": 1806 }, { "epoch": 1.2671809256661992, "grad_norm": 0.300720751285553, "learning_rate": 4.2837964810297535e-05, "loss": 0.0995, "step": 1807 }, { "epoch": 1.2678821879382889, "grad_norm": 0.2622394859790802, "learning_rate": 4.283017976151455e-05, "loss": 0.0954, "step": 1808 }, { "epoch": 1.2685834502103788, "grad_norm": 0.7669475674629211, "learning_rate": 4.282239119213196e-05, "loss": 0.409, "step": 1809 }, { "epoch": 1.2692847124824684, "grad_norm": 0.2614777982234955, "learning_rate": 4.281459910368768e-05, "loss": 0.0948, "step": 1810 }, { "epoch": 1.269985974754558, "grad_norm": 0.2678206264972687, "learning_rate": 4.280680349772024e-05, "loss": 0.2485, "step": 1811 }, { "epoch": 1.270687237026648, "grad_norm": 0.2946365177631378, "learning_rate": 4.2799004375768914e-05, "loss": 0.0977, "step": 1812 }, { "epoch": 1.2713884992987377, "grad_norm": 0.25984570384025574, "learning_rate": 4.2791201739373644e-05, "loss": 0.25, "step": 1813 }, { "epoch": 1.2720897615708275, "grad_norm": 0.2915489971637726, "learning_rate": 4.278339559007507e-05, "loss": 0.0977, "step": 1814 }, { "epoch": 1.2727910238429172, "grad_norm": 0.28471750020980835, "learning_rate": 4.277558592941454e-05, "loss": 0.096, "step": 1815 }, { "epoch": 1.273492286115007, "grad_norm": 0.25805047154426575, "learning_rate": 4.276777275893408e-05, "loss": 0.0938, "step": 1816 }, { "epoch": 1.2741935483870968, "grad_norm": 0.2689046561717987, "learning_rate": 4.275995608017641e-05, "loss": 0.2495, "step": 1817 }, { "epoch": 1.2748948106591866, "grad_norm": 0.31768864393234253, "learning_rate": 4.275213589468495e-05, "loss": 0.0973, "step": 1818 }, { "epoch": 1.2755960729312763, "grad_norm": 0.31554579734802246, "learning_rate": 4.2744312204003816e-05, "loss": 0.0955, "step": 1819 }, { "epoch": 1.276297335203366, "grad_norm": 0.2763252854347229, "learning_rate": 4.27364850096778e-05, "loss": 0.0941, "step": 1820 }, { "epoch": 1.2769985974754559, "grad_norm": 0.32158714532852173, "learning_rate": 4.272865431325239e-05, "loss": 0.0955, "step": 1821 }, { "epoch": 1.2776998597475455, "grad_norm": 0.278431236743927, "learning_rate": 4.2720820116273785e-05, "loss": 0.2522, "step": 1822 }, { "epoch": 1.2784011220196354, "grad_norm": 0.33777284622192383, "learning_rate": 4.271298242028886e-05, "loss": 0.0932, "step": 1823 }, { "epoch": 1.279102384291725, "grad_norm": 0.2829602360725403, "learning_rate": 4.270514122684516e-05, "loss": 0.2545, "step": 1824 }, { "epoch": 1.2798036465638147, "grad_norm": 1.1718313694000244, "learning_rate": 4.269729653749097e-05, "loss": 0.265, "step": 1825 }, { "epoch": 1.2805049088359046, "grad_norm": 0.2468155324459076, "learning_rate": 4.268944835377521e-05, "loss": 0.0903, "step": 1826 }, { "epoch": 1.2812061711079945, "grad_norm": 0.24894979596138, "learning_rate": 4.268159667724755e-05, "loss": 0.0903, "step": 1827 }, { "epoch": 1.2819074333800842, "grad_norm": 0.2886178493499756, "learning_rate": 4.267374150945829e-05, "loss": 0.2548, "step": 1828 }, { "epoch": 1.2826086956521738, "grad_norm": 0.26613062620162964, "learning_rate": 4.266588285195845e-05, "loss": 0.0903, "step": 1829 }, { "epoch": 1.2833099579242637, "grad_norm": 0.2812913954257965, "learning_rate": 4.2658020706299736e-05, "loss": 0.2508, "step": 1830 }, { "epoch": 1.2840112201963534, "grad_norm": 0.2635002136230469, "learning_rate": 4.265015507403456e-05, "loss": 0.0895, "step": 1831 }, { "epoch": 1.2847124824684433, "grad_norm": 0.7782965898513794, "learning_rate": 4.264228595671599e-05, "loss": 0.415, "step": 1832 }, { "epoch": 1.285413744740533, "grad_norm": 0.24777694046497345, "learning_rate": 4.2634413355897795e-05, "loss": 0.09, "step": 1833 }, { "epoch": 1.2861150070126226, "grad_norm": 0.3097330629825592, "learning_rate": 4.262653727313444e-05, "loss": 0.0898, "step": 1834 }, { "epoch": 1.2868162692847125, "grad_norm": 0.26603129506111145, "learning_rate": 4.261865770998106e-05, "loss": 0.0899, "step": 1835 }, { "epoch": 1.2875175315568024, "grad_norm": 0.28444257378578186, "learning_rate": 4.261077466799349e-05, "loss": 0.2521, "step": 1836 }, { "epoch": 1.288218793828892, "grad_norm": 0.24634964764118195, "learning_rate": 4.260288814872827e-05, "loss": 0.0894, "step": 1837 }, { "epoch": 1.2889200561009817, "grad_norm": 0.2612331211566925, "learning_rate": 4.259499815374259e-05, "loss": 0.0882, "step": 1838 }, { "epoch": 1.2896213183730716, "grad_norm": 0.2797384262084961, "learning_rate": 4.2587104684594344e-05, "loss": 0.0868, "step": 1839 }, { "epoch": 1.2903225806451613, "grad_norm": 1.3522377014160156, "learning_rate": 4.257920774284211e-05, "loss": 0.281, "step": 1840 }, { "epoch": 1.2910238429172511, "grad_norm": 0.2452174723148346, "learning_rate": 4.257130733004516e-05, "loss": 0.0887, "step": 1841 }, { "epoch": 1.2917251051893408, "grad_norm": 0.2969903349876404, "learning_rate": 4.256340344776344e-05, "loss": 0.0875, "step": 1842 }, { "epoch": 1.2924263674614305, "grad_norm": 0.24351541697978973, "learning_rate": 4.255549609755758e-05, "loss": 0.0887, "step": 1843 }, { "epoch": 1.2931276297335204, "grad_norm": 0.2951391935348511, "learning_rate": 4.25475852809889e-05, "loss": 0.2536, "step": 1844 }, { "epoch": 1.29382889200561, "grad_norm": 0.2905614972114563, "learning_rate": 4.253967099961942e-05, "loss": 0.2518, "step": 1845 }, { "epoch": 1.2945301542777, "grad_norm": 0.24288716912269592, "learning_rate": 4.253175325501181e-05, "loss": 0.0883, "step": 1846 }, { "epoch": 1.2952314165497896, "grad_norm": 0.2979552149772644, "learning_rate": 4.252383204872945e-05, "loss": 0.2578, "step": 1847 }, { "epoch": 1.2959326788218795, "grad_norm": 0.2755988538265228, "learning_rate": 4.25159073823364e-05, "loss": 0.0856, "step": 1848 }, { "epoch": 1.2966339410939691, "grad_norm": 0.2455526441335678, "learning_rate": 4.250797925739739e-05, "loss": 0.0888, "step": 1849 }, { "epoch": 1.297335203366059, "grad_norm": 0.25995153188705444, "learning_rate": 4.250004767547785e-05, "loss": 0.0872, "step": 1850 }, { "epoch": 1.2980364656381487, "grad_norm": 0.24050627648830414, "learning_rate": 4.2492112638143874e-05, "loss": 0.0872, "step": 1851 }, { "epoch": 1.2987377279102383, "grad_norm": 0.3021355867385864, "learning_rate": 4.248417414696226e-05, "loss": 0.0832, "step": 1852 }, { "epoch": 1.2994389901823282, "grad_norm": 1.6476370096206665, "learning_rate": 4.247623220350047e-05, "loss": 0.4671, "step": 1853 }, { "epoch": 1.3001402524544179, "grad_norm": 0.24071550369262695, "learning_rate": 4.246828680932666e-05, "loss": 0.0871, "step": 1854 }, { "epoch": 1.3008415147265078, "grad_norm": 0.260211706161499, "learning_rate": 4.246033796600965e-05, "loss": 0.0852, "step": 1855 }, { "epoch": 1.3015427769985974, "grad_norm": 0.8241651654243469, "learning_rate": 4.245238567511897e-05, "loss": 0.4348, "step": 1856 }, { "epoch": 1.302244039270687, "grad_norm": 0.25222766399383545, "learning_rate": 4.24444299382248e-05, "loss": 0.0851, "step": 1857 }, { "epoch": 1.302945301542777, "grad_norm": 0.2543630003929138, "learning_rate": 4.243647075689802e-05, "loss": 0.085, "step": 1858 }, { "epoch": 1.3036465638148669, "grad_norm": 0.2756558060646057, "learning_rate": 4.242850813271018e-05, "loss": 0.0854, "step": 1859 }, { "epoch": 1.3043478260869565, "grad_norm": 0.30475950241088867, "learning_rate": 4.242054206723352e-05, "loss": 0.2579, "step": 1860 }, { "epoch": 1.3050490883590462, "grad_norm": 0.253603458404541, "learning_rate": 4.2412572562040944e-05, "loss": 0.0853, "step": 1861 }, { "epoch": 1.305750350631136, "grad_norm": 0.24011780321598053, "learning_rate": 4.2404599618706056e-05, "loss": 0.0867, "step": 1862 }, { "epoch": 1.3064516129032258, "grad_norm": 0.8198506832122803, "learning_rate": 4.2396623238803114e-05, "loss": 0.43, "step": 1863 }, { "epoch": 1.3071528751753156, "grad_norm": 0.25068801641464233, "learning_rate": 4.2388643423907085e-05, "loss": 0.0854, "step": 1864 }, { "epoch": 1.3078541374474053, "grad_norm": 0.2542220652103424, "learning_rate": 4.238066017559357e-05, "loss": 0.0854, "step": 1865 }, { "epoch": 1.308555399719495, "grad_norm": 0.3125442564487457, "learning_rate": 4.237267349543891e-05, "loss": 0.2584, "step": 1866 }, { "epoch": 1.3092566619915849, "grad_norm": 0.3033631443977356, "learning_rate": 4.236468338502006e-05, "loss": 0.2579, "step": 1867 }, { "epoch": 1.3099579242636747, "grad_norm": 0.2535426616668701, "learning_rate": 4.235668984591469e-05, "loss": 0.0845, "step": 1868 }, { "epoch": 1.3106591865357644, "grad_norm": 0.23865766823291779, "learning_rate": 4.2348692879701146e-05, "loss": 0.0862, "step": 1869 }, { "epoch": 1.311360448807854, "grad_norm": 0.27090051770210266, "learning_rate": 4.234069248795843e-05, "loss": 0.083, "step": 1870 }, { "epoch": 1.312061711079944, "grad_norm": 0.24629966914653778, "learning_rate": 4.2332688672266255e-05, "loss": 0.083, "step": 1871 }, { "epoch": 1.3127629733520336, "grad_norm": 0.31891489028930664, "learning_rate": 4.232468143420496e-05, "loss": 0.26, "step": 1872 }, { "epoch": 1.3134642356241235, "grad_norm": 0.34045419096946716, "learning_rate": 4.23166707753556e-05, "loss": 0.2599, "step": 1873 }, { "epoch": 1.3141654978962132, "grad_norm": 0.31612932682037354, "learning_rate": 4.23086566972999e-05, "loss": 0.2584, "step": 1874 }, { "epoch": 1.3148667601683028, "grad_norm": 0.3218870759010315, "learning_rate": 4.230063920162025e-05, "loss": 0.2611, "step": 1875 }, { "epoch": 1.3155680224403927, "grad_norm": 0.2366591840982437, "learning_rate": 4.229261828989971e-05, "loss": 0.0852, "step": 1876 }, { "epoch": 1.3162692847124824, "grad_norm": 0.840890645980835, "learning_rate": 4.228459396372202e-05, "loss": 0.4361, "step": 1877 }, { "epoch": 1.3169705469845723, "grad_norm": 0.2438347041606903, "learning_rate": 4.227656622467162e-05, "loss": 0.0821, "step": 1878 }, { "epoch": 1.317671809256662, "grad_norm": 0.24560777842998505, "learning_rate": 4.2268535074333585e-05, "loss": 0.0827, "step": 1879 }, { "epoch": 1.3183730715287518, "grad_norm": 0.3194405138492584, "learning_rate": 4.226050051429367e-05, "loss": 0.2614, "step": 1880 }, { "epoch": 1.3190743338008415, "grad_norm": 0.27099093794822693, "learning_rate": 4.225246254613833e-05, "loss": 0.0763, "step": 1881 }, { "epoch": 1.3197755960729314, "grad_norm": 0.23831240832805634, "learning_rate": 4.224442117145467e-05, "loss": 0.0858, "step": 1882 }, { "epoch": 1.320476858345021, "grad_norm": 0.23728512227535248, "learning_rate": 4.223637639183047e-05, "loss": 0.0852, "step": 1883 }, { "epoch": 1.3211781206171107, "grad_norm": 0.24519884586334229, "learning_rate": 4.222832820885419e-05, "loss": 0.0819, "step": 1884 }, { "epoch": 1.3218793828892006, "grad_norm": 0.8506645560264587, "learning_rate": 4.222027662411495e-05, "loss": 0.4362, "step": 1885 }, { "epoch": 1.3225806451612903, "grad_norm": 0.32022953033447266, "learning_rate": 4.2212221639202564e-05, "loss": 0.2595, "step": 1886 }, { "epoch": 1.3232819074333801, "grad_norm": 0.23800760507583618, "learning_rate": 4.220416325570749e-05, "loss": 0.0853, "step": 1887 }, { "epoch": 1.3239831697054698, "grad_norm": 0.23816540837287903, "learning_rate": 4.2196101475220866e-05, "loss": 0.0851, "step": 1888 }, { "epoch": 1.3246844319775595, "grad_norm": 0.23727218806743622, "learning_rate": 4.218803629933452e-05, "loss": 0.0849, "step": 1889 }, { "epoch": 1.3253856942496494, "grad_norm": 0.23644833266735077, "learning_rate": 4.217996772964092e-05, "loss": 0.0848, "step": 1890 }, { "epoch": 1.3260869565217392, "grad_norm": 0.2547038197517395, "learning_rate": 4.217189576773323e-05, "loss": 0.0785, "step": 1891 }, { "epoch": 1.326788218793829, "grad_norm": 0.23748643696308136, "learning_rate": 4.216382041520526e-05, "loss": 0.0848, "step": 1892 }, { "epoch": 1.3274894810659186, "grad_norm": 0.24398639798164368, "learning_rate": 4.215574167365151e-05, "loss": 0.081, "step": 1893 }, { "epoch": 1.3281907433380085, "grad_norm": 0.3423919081687927, "learning_rate": 4.2147659544667146e-05, "loss": 0.2553, "step": 1894 }, { "epoch": 1.3288920056100981, "grad_norm": 0.2395777702331543, "learning_rate": 4.213957402984799e-05, "loss": 0.0798, "step": 1895 }, { "epoch": 1.329593267882188, "grad_norm": 0.2380422055721283, "learning_rate": 4.213148513079055e-05, "loss": 0.0791, "step": 1896 }, { "epoch": 1.3302945301542777, "grad_norm": 0.23494809865951538, "learning_rate": 4.212339284909198e-05, "loss": 0.0833, "step": 1897 }, { "epoch": 1.3309957924263673, "grad_norm": 0.8737236857414246, "learning_rate": 4.2115297186350127e-05, "loss": 0.4341, "step": 1898 }, { "epoch": 1.3316970546984572, "grad_norm": 0.3492420017719269, "learning_rate": 4.210719814416349e-05, "loss": 0.2649, "step": 1899 }, { "epoch": 1.3323983169705471, "grad_norm": 0.2332715392112732, "learning_rate": 4.209909572413123e-05, "loss": 0.082, "step": 1900 }, { "epoch": 1.3330995792426368, "grad_norm": 0.2336559295654297, "learning_rate": 4.2090989927853196e-05, "loss": 0.0823, "step": 1901 }, { "epoch": 1.3338008415147264, "grad_norm": 0.23297575116157532, "learning_rate": 4.208288075692989e-05, "loss": 0.082, "step": 1902 }, { "epoch": 1.3345021037868163, "grad_norm": 0.23898297548294067, "learning_rate": 4.207476821296247e-05, "loss": 0.0716, "step": 1903 }, { "epoch": 1.335203366058906, "grad_norm": 0.24929705262184143, "learning_rate": 4.206665229755279e-05, "loss": 0.0663, "step": 1904 }, { "epoch": 1.3359046283309959, "grad_norm": 0.23329392075538635, "learning_rate": 4.205853301230333e-05, "loss": 0.0812, "step": 1905 }, { "epoch": 1.3366058906030855, "grad_norm": 0.23321650922298431, "learning_rate": 4.205041035881728e-05, "loss": 0.081, "step": 1906 }, { "epoch": 1.3373071528751752, "grad_norm": 0.3810141384601593, "learning_rate": 4.204228433869845e-05, "loss": 0.2561, "step": 1907 }, { "epoch": 1.338008415147265, "grad_norm": 0.22919665277004242, "learning_rate": 4.2034154953551344e-05, "loss": 0.0743, "step": 1908 }, { "epoch": 1.3387096774193548, "grad_norm": 0.22760212421417236, "learning_rate": 4.2026022204981134e-05, "loss": 0.0732, "step": 1909 }, { "epoch": 1.3394109396914446, "grad_norm": 0.3605912923812866, "learning_rate": 4.2017886094593636e-05, "loss": 0.2637, "step": 1910 }, { "epoch": 1.3401122019635343, "grad_norm": 0.22967416048049927, "learning_rate": 4.200974662399534e-05, "loss": 0.0785, "step": 1911 }, { "epoch": 1.3408134642356242, "grad_norm": 0.37904098629951477, "learning_rate": 4.2001603794793386e-05, "loss": 0.2609, "step": 1912 }, { "epoch": 1.3415147265077139, "grad_norm": 0.3953646123409271, "learning_rate": 4.199345760859562e-05, "loss": 0.2679, "step": 1913 }, { "epoch": 1.3422159887798037, "grad_norm": 0.3908027708530426, "learning_rate": 4.1985308067010485e-05, "loss": 0.259, "step": 1914 }, { "epoch": 1.3429172510518934, "grad_norm": 0.22916193306446075, "learning_rate": 4.197715517164715e-05, "loss": 0.0777, "step": 1915 }, { "epoch": 1.343618513323983, "grad_norm": 0.22968442738056183, "learning_rate": 4.196899892411541e-05, "loss": 0.0782, "step": 1916 }, { "epoch": 1.344319775596073, "grad_norm": 0.22959044575691223, "learning_rate": 4.196083932602572e-05, "loss": 0.0777, "step": 1917 }, { "epoch": 1.3450210378681626, "grad_norm": 0.22974923253059387, "learning_rate": 4.1952676378989215e-05, "loss": 0.0777, "step": 1918 }, { "epoch": 1.3457223001402525, "grad_norm": 0.40173396468162537, "learning_rate": 4.194451008461768e-05, "loss": 0.2647, "step": 1919 }, { "epoch": 1.3464235624123422, "grad_norm": 0.4065837860107422, "learning_rate": 4.193634044452357e-05, "loss": 0.2583, "step": 1920 }, { "epoch": 1.3471248246844318, "grad_norm": 0.4070141613483429, "learning_rate": 4.192816746031999e-05, "loss": 0.2566, "step": 1921 }, { "epoch": 1.3478260869565217, "grad_norm": 0.38982337713241577, "learning_rate": 4.191999113362071e-05, "loss": 0.2691, "step": 1922 }, { "epoch": 1.3485273492286116, "grad_norm": 0.2234804630279541, "learning_rate": 4.191181146604016e-05, "loss": 0.0711, "step": 1923 }, { "epoch": 1.3492286115007013, "grad_norm": 0.22072987258434296, "learning_rate": 4.1903628459193427e-05, "loss": 0.0708, "step": 1924 }, { "epoch": 1.349929873772791, "grad_norm": 0.9521637558937073, "learning_rate": 4.189544211469626e-05, "loss": 0.46, "step": 1925 }, { "epoch": 1.3506311360448808, "grad_norm": 0.23054993152618408, "learning_rate": 4.188725243416507e-05, "loss": 0.0782, "step": 1926 }, { "epoch": 1.3513323983169705, "grad_norm": 0.21977007389068604, "learning_rate": 4.1879059419216926e-05, "loss": 0.0649, "step": 1927 }, { "epoch": 1.3520336605890604, "grad_norm": 0.23298673331737518, "learning_rate": 4.187086307146955e-05, "loss": 0.079, "step": 1928 }, { "epoch": 1.35273492286115, "grad_norm": 0.23098044097423553, "learning_rate": 4.186266339254131e-05, "loss": 0.0785, "step": 1929 }, { "epoch": 1.3534361851332397, "grad_norm": 0.937506377696991, "learning_rate": 4.185446038405127e-05, "loss": 0.4584, "step": 1930 }, { "epoch": 1.3541374474053296, "grad_norm": 0.22158822417259216, "learning_rate": 4.184625404761913e-05, "loss": 0.0658, "step": 1931 }, { "epoch": 1.3548387096774195, "grad_norm": 0.23268508911132812, "learning_rate": 4.183804438486522e-05, "loss": 0.0791, "step": 1932 }, { "epoch": 1.3555399719495091, "grad_norm": 0.37929609417915344, "learning_rate": 4.182983139741056e-05, "loss": 0.2593, "step": 1933 }, { "epoch": 1.3562412342215988, "grad_norm": 0.9068270325660706, "learning_rate": 4.1821615086876836e-05, "loss": 0.4497, "step": 1934 }, { "epoch": 1.3569424964936887, "grad_norm": 0.2257976084947586, "learning_rate": 4.1813395454886354e-05, "loss": 0.0732, "step": 1935 }, { "epoch": 1.3576437587657784, "grad_norm": 0.22682733833789825, "learning_rate": 4.1805172503062104e-05, "loss": 0.0739, "step": 1936 }, { "epoch": 1.3583450210378682, "grad_norm": 0.22386112809181213, "learning_rate": 4.179694623302772e-05, "loss": 0.0736, "step": 1937 }, { "epoch": 1.359046283309958, "grad_norm": 0.22168788313865662, "learning_rate": 4.1788716646407494e-05, "loss": 0.0736, "step": 1938 }, { "epoch": 1.3597475455820476, "grad_norm": 0.351818323135376, "learning_rate": 4.1780483744826365e-05, "loss": 0.2635, "step": 1939 }, { "epoch": 1.3604488078541375, "grad_norm": 0.23426426947116852, "learning_rate": 4.1772247529909946e-05, "loss": 0.0806, "step": 1940 }, { "epoch": 1.3611500701262271, "grad_norm": 0.23043283820152283, "learning_rate": 4.1764008003284486e-05, "loss": 0.0746, "step": 1941 }, { "epoch": 1.361851332398317, "grad_norm": 0.23502907156944275, "learning_rate": 4.175576516657688e-05, "loss": 0.0811, "step": 1942 }, { "epoch": 1.3625525946704067, "grad_norm": 0.23288844525814056, "learning_rate": 4.1747519021414706e-05, "loss": 0.0801, "step": 1943 }, { "epoch": 1.3632538569424966, "grad_norm": 0.2251008301973343, "learning_rate": 4.173926956942618e-05, "loss": 0.0734, "step": 1944 }, { "epoch": 1.3639551192145862, "grad_norm": 0.2332216054201126, "learning_rate": 4.173101681224016e-05, "loss": 0.0795, "step": 1945 }, { "epoch": 1.3646563814866761, "grad_norm": 0.23487618565559387, "learning_rate": 4.1722760751486184e-05, "loss": 0.0796, "step": 1946 }, { "epoch": 1.3653576437587658, "grad_norm": 0.23272180557250977, "learning_rate": 4.1714501388794416e-05, "loss": 0.0786, "step": 1947 }, { "epoch": 1.3660589060308554, "grad_norm": 0.2158563882112503, "learning_rate": 4.170623872579567e-05, "loss": 0.0645, "step": 1948 }, { "epoch": 1.3667601683029453, "grad_norm": 0.23231235146522522, "learning_rate": 4.169797276412144e-05, "loss": 0.0781, "step": 1949 }, { "epoch": 1.367461430575035, "grad_norm": 0.4053735136985779, "learning_rate": 4.168970350540384e-05, "loss": 0.2625, "step": 1950 }, { "epoch": 1.3681626928471249, "grad_norm": 0.1951194405555725, "learning_rate": 4.168143095127567e-05, "loss": 0.0535, "step": 1951 }, { "epoch": 1.3688639551192145, "grad_norm": 0.4326797127723694, "learning_rate": 4.167315510337033e-05, "loss": 0.2565, "step": 1952 }, { "epoch": 1.3695652173913042, "grad_norm": 0.21293926239013672, "learning_rate": 4.1664875963321924e-05, "loss": 0.0678, "step": 1953 }, { "epoch": 1.370266479663394, "grad_norm": 6.322765350341797, "learning_rate": 4.165659353276518e-05, "loss": 0.7911, "step": 1954 }, { "epoch": 1.370967741935484, "grad_norm": 0.19282902777194977, "learning_rate": 4.1648307813335474e-05, "loss": 0.0522, "step": 1955 }, { "epoch": 1.3716690042075736, "grad_norm": 0.21778354048728943, "learning_rate": 4.1640018806668836e-05, "loss": 0.0685, "step": 1956 }, { "epoch": 1.3723702664796633, "grad_norm": 0.4007341265678406, "learning_rate": 4.163172651440194e-05, "loss": 0.2706, "step": 1957 }, { "epoch": 1.3730715287517532, "grad_norm": 0.21561473608016968, "learning_rate": 4.1623430938172114e-05, "loss": 0.0682, "step": 1958 }, { "epoch": 1.3737727910238429, "grad_norm": 0.4137192368507385, "learning_rate": 4.161513207961734e-05, "loss": 0.2767, "step": 1959 }, { "epoch": 1.3744740532959328, "grad_norm": 0.21730127930641174, "learning_rate": 4.160682994037624e-05, "loss": 0.0684, "step": 1960 }, { "epoch": 1.3751753155680224, "grad_norm": 0.40883225202560425, "learning_rate": 4.159852452208808e-05, "loss": 0.2751, "step": 1961 }, { "epoch": 1.375876577840112, "grad_norm": 0.22975365817546844, "learning_rate": 4.159021582639279e-05, "loss": 0.0769, "step": 1962 }, { "epoch": 1.376577840112202, "grad_norm": 0.3953935503959656, "learning_rate": 4.1581903854930924e-05, "loss": 0.2726, "step": 1963 }, { "epoch": 1.3772791023842919, "grad_norm": 0.3882234990596771, "learning_rate": 4.15735886093437e-05, "loss": 0.2671, "step": 1964 }, { "epoch": 1.3779803646563815, "grad_norm": 0.2176571637392044, "learning_rate": 4.156527009127298e-05, "loss": 0.0698, "step": 1965 }, { "epoch": 1.3786816269284712, "grad_norm": 0.3844260573387146, "learning_rate": 4.155694830236127e-05, "loss": 0.2666, "step": 1966 }, { "epoch": 1.379382889200561, "grad_norm": 0.23092791438102722, "learning_rate": 4.1548623244251715e-05, "loss": 0.0783, "step": 1967 }, { "epoch": 1.3800841514726507, "grad_norm": 0.38516172766685486, "learning_rate": 4.154029491858812e-05, "loss": 0.2599, "step": 1968 }, { "epoch": 1.3807854137447406, "grad_norm": 0.23297591507434845, "learning_rate": 4.1531963327014925e-05, "loss": 0.0799, "step": 1969 }, { "epoch": 1.3814866760168303, "grad_norm": 0.2237105369567871, "learning_rate": 4.1523628471177215e-05, "loss": 0.0721, "step": 1970 }, { "epoch": 1.38218793828892, "grad_norm": 0.22099696099758148, "learning_rate": 4.151529035272072e-05, "loss": 0.0713, "step": 1971 }, { "epoch": 1.3828892005610098, "grad_norm": 0.23146215081214905, "learning_rate": 4.1506948973291826e-05, "loss": 0.0793, "step": 1972 }, { "epoch": 1.3835904628330995, "grad_norm": 0.21675410866737366, "learning_rate": 4.149860433453753e-05, "loss": 0.065, "step": 1973 }, { "epoch": 1.3842917251051894, "grad_norm": 0.23225334286689758, "learning_rate": 4.149025643810552e-05, "loss": 0.0795, "step": 1974 }, { "epoch": 1.384992987377279, "grad_norm": 0.22489944100379944, "learning_rate": 4.148190528564409e-05, "loss": 0.0718, "step": 1975 }, { "epoch": 1.385694249649369, "grad_norm": 0.22231286764144897, "learning_rate": 4.147355087880219e-05, "loss": 0.0705, "step": 1976 }, { "epoch": 1.3863955119214586, "grad_norm": 0.38252994418144226, "learning_rate": 4.146519321922942e-05, "loss": 0.2685, "step": 1977 }, { "epoch": 1.3870967741935485, "grad_norm": 0.22975420951843262, "learning_rate": 4.1456832308576e-05, "loss": 0.0774, "step": 1978 }, { "epoch": 1.3877980364656382, "grad_norm": 0.22921159863471985, "learning_rate": 4.144846814849282e-05, "loss": 0.0776, "step": 1979 }, { "epoch": 1.3884992987377278, "grad_norm": 0.22957833111286163, "learning_rate": 4.144010074063139e-05, "loss": 0.0763, "step": 1980 }, { "epoch": 1.3892005610098177, "grad_norm": 1.0252691507339478, "learning_rate": 4.143173008664387e-05, "loss": 0.4571, "step": 1981 }, { "epoch": 1.3899018232819074, "grad_norm": 0.2314363270998001, "learning_rate": 4.142335618818306e-05, "loss": 0.0772, "step": 1982 }, { "epoch": 1.3906030855539973, "grad_norm": 0.21522434055805206, "learning_rate": 4.14149790469024e-05, "loss": 0.0683, "step": 1983 }, { "epoch": 1.391304347826087, "grad_norm": 0.21906399726867676, "learning_rate": 4.140659866445598e-05, "loss": 0.0695, "step": 1984 }, { "epoch": 1.3920056100981766, "grad_norm": 0.21537551283836365, "learning_rate": 4.13982150424985e-05, "loss": 0.0689, "step": 1985 }, { "epoch": 1.3927068723702665, "grad_norm": 0.4161413311958313, "learning_rate": 4.138982818268534e-05, "loss": 0.2637, "step": 1986 }, { "epoch": 1.3934081346423564, "grad_norm": 0.22818845510482788, "learning_rate": 4.13814380866725e-05, "loss": 0.0751, "step": 1987 }, { "epoch": 1.394109396914446, "grad_norm": 0.22358815371990204, "learning_rate": 4.137304475611661e-05, "loss": 0.068, "step": 1988 }, { "epoch": 1.3948106591865357, "grad_norm": 0.43541014194488525, "learning_rate": 4.136464819267495e-05, "loss": 0.2665, "step": 1989 }, { "epoch": 1.3955119214586256, "grad_norm": 0.4279073476791382, "learning_rate": 4.135624839800543e-05, "loss": 0.2769, "step": 1990 }, { "epoch": 1.3962131837307152, "grad_norm": 0.22699986398220062, "learning_rate": 4.134784537376661e-05, "loss": 0.0742, "step": 1991 }, { "epoch": 1.3969144460028051, "grad_norm": 0.1873912662267685, "learning_rate": 4.133943912161769e-05, "loss": 0.0511, "step": 1992 }, { "epoch": 1.3976157082748948, "grad_norm": 0.45007383823394775, "learning_rate": 4.133102964321848e-05, "loss": 0.2604, "step": 1993 }, { "epoch": 1.3983169705469845, "grad_norm": 0.4264034926891327, "learning_rate": 4.1322616940229455e-05, "loss": 0.2669, "step": 1994 }, { "epoch": 1.3990182328190743, "grad_norm": 0.21150954067707062, "learning_rate": 4.131420101431173e-05, "loss": 0.0666, "step": 1995 }, { "epoch": 1.3997194950911642, "grad_norm": 0.21257172524929047, "learning_rate": 4.130578186712702e-05, "loss": 0.067, "step": 1996 }, { "epoch": 1.4004207573632539, "grad_norm": 0.22799430787563324, "learning_rate": 4.129735950033773e-05, "loss": 0.0742, "step": 1997 }, { "epoch": 1.4011220196353436, "grad_norm": 0.42362648248672485, "learning_rate": 4.1288933915606845e-05, "loss": 0.2757, "step": 1998 }, { "epoch": 1.4018232819074334, "grad_norm": 0.22688065469264984, "learning_rate": 4.1280505114598014e-05, "loss": 0.0742, "step": 1999 }, { "epoch": 1.402524544179523, "grad_norm": 0.22159546613693237, "learning_rate": 4.127207309897553e-05, "loss": 0.0666, "step": 2000 }, { "epoch": 1.403225806451613, "grad_norm": 0.4412073493003845, "learning_rate": 4.12636378704043e-05, "loss": 0.2685, "step": 2001 }, { "epoch": 1.4039270687237027, "grad_norm": 0.44365039467811584, "learning_rate": 4.125519943054987e-05, "loss": 0.2562, "step": 2002 }, { "epoch": 1.4046283309957923, "grad_norm": 0.22754186391830444, "learning_rate": 4.124675778107845e-05, "loss": 0.0742, "step": 2003 }, { "epoch": 1.4053295932678822, "grad_norm": 1.0340498685836792, "learning_rate": 4.123831292365683e-05, "loss": 0.4805, "step": 2004 }, { "epoch": 1.4060308555399719, "grad_norm": 0.4210747480392456, "learning_rate": 4.122986485995247e-05, "loss": 0.276, "step": 2005 }, { "epoch": 1.4067321178120618, "grad_norm": 0.3984631597995758, "learning_rate": 4.122141359163345e-05, "loss": 0.2709, "step": 2006 }, { "epoch": 1.4074333800841514, "grad_norm": 0.2023405134677887, "learning_rate": 4.1212959120368506e-05, "loss": 0.0609, "step": 2007 }, { "epoch": 1.4081346423562413, "grad_norm": 0.21299812197685242, "learning_rate": 4.1204501447826974e-05, "loss": 0.0694, "step": 2008 }, { "epoch": 1.408835904628331, "grad_norm": 0.9528634548187256, "learning_rate": 4.119604057567883e-05, "loss": 0.4609, "step": 2009 }, { "epoch": 1.4095371669004209, "grad_norm": 0.38690805435180664, "learning_rate": 4.118757650559469e-05, "loss": 0.2701, "step": 2010 }, { "epoch": 1.4102384291725105, "grad_norm": 0.23245863616466522, "learning_rate": 4.1179109239245816e-05, "loss": 0.079, "step": 2011 }, { "epoch": 1.4109396914446002, "grad_norm": 0.3704327344894409, "learning_rate": 4.117063877830407e-05, "loss": 0.2682, "step": 2012 }, { "epoch": 1.41164095371669, "grad_norm": 0.23426707088947296, "learning_rate": 4.116216512444196e-05, "loss": 0.0803, "step": 2013 }, { "epoch": 1.4123422159887797, "grad_norm": 0.22280727326869965, "learning_rate": 4.115368827933263e-05, "loss": 0.0733, "step": 2014 }, { "epoch": 1.4130434782608696, "grad_norm": 0.3515925109386444, "learning_rate": 4.1145208244649836e-05, "loss": 0.2646, "step": 2015 }, { "epoch": 1.4137447405329593, "grad_norm": 0.23781463503837585, "learning_rate": 4.113672502206798e-05, "loss": 0.0746, "step": 2016 }, { "epoch": 1.414446002805049, "grad_norm": 0.22562900185585022, "learning_rate": 4.11282386132621e-05, "loss": 0.0743, "step": 2017 }, { "epoch": 1.4151472650771388, "grad_norm": 0.23807522654533386, "learning_rate": 4.1119749019907835e-05, "loss": 0.0825, "step": 2018 }, { "epoch": 1.4158485273492287, "grad_norm": 0.2307080328464508, "learning_rate": 4.1111256243681486e-05, "loss": 0.0677, "step": 2019 }, { "epoch": 1.4165497896213184, "grad_norm": 0.344584196805954, "learning_rate": 4.110276028625995e-05, "loss": 0.2654, "step": 2020 }, { "epoch": 1.417251051893408, "grad_norm": 0.23104862868785858, "learning_rate": 4.109426114932078e-05, "loss": 0.076, "step": 2021 }, { "epoch": 1.417952314165498, "grad_norm": 0.21186183393001556, "learning_rate": 4.108575883454213e-05, "loss": 0.0586, "step": 2022 }, { "epoch": 1.4186535764375876, "grad_norm": 0.23232975602149963, "learning_rate": 4.107725334360282e-05, "loss": 0.0749, "step": 2023 }, { "epoch": 1.4193548387096775, "grad_norm": 0.23977768421173096, "learning_rate": 4.1068744678182265e-05, "loss": 0.083, "step": 2024 }, { "epoch": 1.4200561009817672, "grad_norm": 0.2380952537059784, "learning_rate": 4.10602328399605e-05, "loss": 0.0823, "step": 2025 }, { "epoch": 1.4207573632538568, "grad_norm": 0.349254310131073, "learning_rate": 4.105171783061822e-05, "loss": 0.2646, "step": 2026 }, { "epoch": 1.4214586255259467, "grad_norm": 0.3637901842594147, "learning_rate": 4.104319965183673e-05, "loss": 0.2692, "step": 2027 }, { "epoch": 1.4221598877980366, "grad_norm": 0.23766379058361053, "learning_rate": 4.1034678305297944e-05, "loss": 0.0822, "step": 2028 }, { "epoch": 1.4228611500701263, "grad_norm": 5.99507999420166, "learning_rate": 4.102615379268442e-05, "loss": 1.0878, "step": 2029 }, { "epoch": 1.423562412342216, "grad_norm": 0.2246590405702591, "learning_rate": 4.101762611567935e-05, "loss": 0.0737, "step": 2030 }, { "epoch": 1.4242636746143058, "grad_norm": 0.2362731397151947, "learning_rate": 4.1009095275966526e-05, "loss": 0.0815, "step": 2031 }, { "epoch": 1.4249649368863955, "grad_norm": 0.3471863865852356, "learning_rate": 4.100056127523038e-05, "loss": 0.2641, "step": 2032 }, { "epoch": 1.4256661991584854, "grad_norm": 0.22608239948749542, "learning_rate": 4.099202411515597e-05, "loss": 0.0745, "step": 2033 }, { "epoch": 1.426367461430575, "grad_norm": 0.2265549600124359, "learning_rate": 4.098348379742897e-05, "loss": 0.0747, "step": 2034 }, { "epoch": 1.4270687237026647, "grad_norm": 0.34286677837371826, "learning_rate": 4.097494032373567e-05, "loss": 0.262, "step": 2035 }, { "epoch": 1.4277699859747546, "grad_norm": 0.22295841574668884, "learning_rate": 4.096639369576301e-05, "loss": 0.0686, "step": 2036 }, { "epoch": 1.4284712482468442, "grad_norm": 0.22810086607933044, "learning_rate": 4.095784391519853e-05, "loss": 0.0756, "step": 2037 }, { "epoch": 1.4291725105189341, "grad_norm": 0.35635852813720703, "learning_rate": 4.09492909837304e-05, "loss": 0.2578, "step": 2038 }, { "epoch": 1.4298737727910238, "grad_norm": 0.22740039229393005, "learning_rate": 4.0940734903047404e-05, "loss": 0.0757, "step": 2039 }, { "epoch": 1.4305750350631137, "grad_norm": 0.3318709433078766, "learning_rate": 4.093217567483896e-05, "loss": 0.2615, "step": 2040 }, { "epoch": 1.4312762973352033, "grad_norm": 0.23820233345031738, "learning_rate": 4.0923613300795106e-05, "loss": 0.0833, "step": 2041 }, { "epoch": 1.4319775596072932, "grad_norm": 0.2373020350933075, "learning_rate": 4.0915047782606495e-05, "loss": 0.0824, "step": 2042 }, { "epoch": 1.432678821879383, "grad_norm": 0.22532641887664795, "learning_rate": 4.09064791219644e-05, "loss": 0.0748, "step": 2043 }, { "epoch": 1.4333800841514726, "grad_norm": 3.5960569381713867, "learning_rate": 4.0897907320560716e-05, "loss": 0.5376, "step": 2044 }, { "epoch": 1.4340813464235624, "grad_norm": 0.23558112978935242, "learning_rate": 4.088933238008797e-05, "loss": 0.0827, "step": 2045 }, { "epoch": 1.434782608695652, "grad_norm": 0.23570747673511505, "learning_rate": 4.088075430223929e-05, "loss": 0.0826, "step": 2046 }, { "epoch": 1.435483870967742, "grad_norm": 0.22855247557163239, "learning_rate": 4.087217308870843e-05, "loss": 0.0757, "step": 2047 }, { "epoch": 1.4361851332398317, "grad_norm": 0.23638735711574554, "learning_rate": 4.086358874118977e-05, "loss": 0.0831, "step": 2048 }, { "epoch": 1.4368863955119213, "grad_norm": 0.23582282662391663, "learning_rate": 4.085500126137831e-05, "loss": 0.0826, "step": 2049 }, { "epoch": 1.4375876577840112, "grad_norm": 4.614734649658203, "learning_rate": 4.084641065096966e-05, "loss": 0.7348, "step": 2050 }, { "epoch": 1.438288920056101, "grad_norm": 0.22912804782390594, "learning_rate": 4.083781691166003e-05, "loss": 0.0762, "step": 2051 }, { "epoch": 1.4389901823281908, "grad_norm": 0.22945010662078857, "learning_rate": 4.082922004514631e-05, "loss": 0.07, "step": 2052 }, { "epoch": 1.4396914446002804, "grad_norm": 0.33206793665885925, "learning_rate": 4.082062005312592e-05, "loss": 0.2628, "step": 2053 }, { "epoch": 1.4403927068723703, "grad_norm": 0.23541158437728882, "learning_rate": 4.081201693729697e-05, "loss": 0.083, "step": 2054 }, { "epoch": 1.44109396914446, "grad_norm": 0.23480893671512604, "learning_rate": 4.0803410699358164e-05, "loss": 0.0833, "step": 2055 }, { "epoch": 1.4417952314165499, "grad_norm": 0.23258638381958008, "learning_rate": 4.0794801341008804e-05, "loss": 0.0777, "step": 2056 }, { "epoch": 1.4424964936886395, "grad_norm": 0.235279843211174, "learning_rate": 4.0786188863948825e-05, "loss": 0.0719, "step": 2057 }, { "epoch": 1.4431977559607292, "grad_norm": 0.23783181607723236, "learning_rate": 4.0777573269878785e-05, "loss": 0.0718, "step": 2058 }, { "epoch": 1.443899018232819, "grad_norm": 0.2329820692539215, "learning_rate": 4.076895456049984e-05, "loss": 0.0769, "step": 2059 }, { "epoch": 1.444600280504909, "grad_norm": 0.2408151775598526, "learning_rate": 4.076033273751377e-05, "loss": 0.0711, "step": 2060 }, { "epoch": 1.4453015427769986, "grad_norm": 0.3525781035423279, "learning_rate": 4.075170780262296e-05, "loss": 0.2507, "step": 2061 }, { "epoch": 1.4460028050490883, "grad_norm": 0.23362283408641815, "learning_rate": 4.074307975753045e-05, "loss": 0.0824, "step": 2062 }, { "epoch": 1.4467040673211782, "grad_norm": 0.24538685381412506, "learning_rate": 4.073444860393981e-05, "loss": 0.0651, "step": 2063 }, { "epoch": 1.4474053295932678, "grad_norm": 0.23283827304840088, "learning_rate": 4.0725814343555324e-05, "loss": 0.0819, "step": 2064 }, { "epoch": 1.4481065918653577, "grad_norm": 0.23289959132671356, "learning_rate": 4.071717697808182e-05, "loss": 0.0686, "step": 2065 }, { "epoch": 1.4488078541374474, "grad_norm": 0.3462880849838257, "learning_rate": 4.070853650922476e-05, "loss": 0.2632, "step": 2066 }, { "epoch": 1.449509116409537, "grad_norm": 0.23089002072811127, "learning_rate": 4.0699892938690224e-05, "loss": 0.0801, "step": 2067 }, { "epoch": 1.450210378681627, "grad_norm": 0.22386489808559418, "learning_rate": 4.06912462681849e-05, "loss": 0.0727, "step": 2068 }, { "epoch": 1.4509116409537166, "grad_norm": 0.37263861298561096, "learning_rate": 4.068259649941609e-05, "loss": 0.2629, "step": 2069 }, { "epoch": 1.4516129032258065, "grad_norm": 0.22901591658592224, "learning_rate": 4.0673943634091705e-05, "loss": 0.0791, "step": 2070 }, { "epoch": 1.4523141654978962, "grad_norm": 0.9608551859855652, "learning_rate": 4.066528767392026e-05, "loss": 0.4573, "step": 2071 }, { "epoch": 1.453015427769986, "grad_norm": 0.9188109636306763, "learning_rate": 4.065662862061089e-05, "loss": 0.4479, "step": 2072 }, { "epoch": 1.4537166900420757, "grad_norm": 0.21890804171562195, "learning_rate": 4.064796647587336e-05, "loss": 0.0716, "step": 2073 }, { "epoch": 1.4544179523141656, "grad_norm": 0.22970229387283325, "learning_rate": 4.0639301241417995e-05, "loss": 0.0793, "step": 2074 }, { "epoch": 1.4551192145862553, "grad_norm": 0.22462625801563263, "learning_rate": 4.063063291895578e-05, "loss": 0.0589, "step": 2075 }, { "epoch": 1.455820476858345, "grad_norm": 0.3663899600505829, "learning_rate": 4.0621961510198287e-05, "loss": 0.2687, "step": 2076 }, { "epoch": 1.4565217391304348, "grad_norm": 0.3745279014110565, "learning_rate": 4.06132870168577e-05, "loss": 0.2616, "step": 2077 }, { "epoch": 1.4572230014025245, "grad_norm": 0.23009489476680756, "learning_rate": 4.0604609440646806e-05, "loss": 0.0793, "step": 2078 }, { "epoch": 1.4579242636746144, "grad_norm": 0.22795535624027252, "learning_rate": 4.059592878327901e-05, "loss": 0.079, "step": 2079 }, { "epoch": 1.458625525946704, "grad_norm": 0.22903209924697876, "learning_rate": 4.058724504646834e-05, "loss": 0.0789, "step": 2080 }, { "epoch": 1.4593267882187937, "grad_norm": 0.2318805605173111, "learning_rate": 4.0578558231929395e-05, "loss": 0.0799, "step": 2081 }, { "epoch": 1.4600280504908836, "grad_norm": 0.22059044241905212, "learning_rate": 4.0569868341377404e-05, "loss": 0.0719, "step": 2082 }, { "epoch": 1.4607293127629735, "grad_norm": 0.2299024611711502, "learning_rate": 4.056117537652821e-05, "loss": 0.0789, "step": 2083 }, { "epoch": 1.4614305750350631, "grad_norm": 0.370530903339386, "learning_rate": 4.055247933909824e-05, "loss": 0.2694, "step": 2084 }, { "epoch": 1.4621318373071528, "grad_norm": 0.37057939171791077, "learning_rate": 4.0543780230804554e-05, "loss": 0.257, "step": 2085 }, { "epoch": 1.4628330995792427, "grad_norm": 0.23030626773834229, "learning_rate": 4.0535078053364794e-05, "loss": 0.0783, "step": 2086 }, { "epoch": 1.4635343618513323, "grad_norm": 0.9446799755096436, "learning_rate": 4.052637280849723e-05, "loss": 0.4497, "step": 2087 }, { "epoch": 1.4642356241234222, "grad_norm": 0.2304876297712326, "learning_rate": 4.0517664497920723e-05, "loss": 0.0787, "step": 2088 }, { "epoch": 1.464936886395512, "grad_norm": 0.21885275840759277, "learning_rate": 4.0508953123354756e-05, "loss": 0.0708, "step": 2089 }, { "epoch": 1.4656381486676016, "grad_norm": 0.21851450204849243, "learning_rate": 4.050023868651938e-05, "loss": 0.0713, "step": 2090 }, { "epoch": 1.4663394109396914, "grad_norm": 0.21610958874225616, "learning_rate": 4.0491521189135296e-05, "loss": 0.0643, "step": 2091 }, { "epoch": 1.4670406732117813, "grad_norm": 3.8018481731414795, "learning_rate": 4.048280063292378e-05, "loss": 0.5641, "step": 2092 }, { "epoch": 1.467741935483871, "grad_norm": 0.9677783250808716, "learning_rate": 4.047407701960673e-05, "loss": 0.4457, "step": 2093 }, { "epoch": 1.4684431977559607, "grad_norm": 0.21827274560928345, "learning_rate": 4.0465350350906624e-05, "loss": 0.0582, "step": 2094 }, { "epoch": 1.4691444600280505, "grad_norm": 0.36149054765701294, "learning_rate": 4.045662062854657e-05, "loss": 0.264, "step": 2095 }, { "epoch": 1.4698457223001402, "grad_norm": 3.4727556705474854, "learning_rate": 4.044788785425026e-05, "loss": 0.5145, "step": 2096 }, { "epoch": 1.47054698457223, "grad_norm": 0.3528432846069336, "learning_rate": 4.043915202974199e-05, "loss": 0.2652, "step": 2097 }, { "epoch": 1.4712482468443198, "grad_norm": 5.336772441864014, "learning_rate": 4.043041315674668e-05, "loss": 0.8557, "step": 2098 }, { "epoch": 1.4719495091164094, "grad_norm": 0.327062726020813, "learning_rate": 4.042167123698982e-05, "loss": 0.261, "step": 2099 }, { "epoch": 1.4726507713884993, "grad_norm": 0.8511505722999573, "learning_rate": 4.041292627219752e-05, "loss": 0.432, "step": 2100 }, { "epoch": 1.473352033660589, "grad_norm": 0.31497982144355774, "learning_rate": 4.0404178264096506e-05, "loss": 0.2585, "step": 2101 }, { "epoch": 1.4740532959326789, "grad_norm": 0.3171490430831909, "learning_rate": 4.039542721441406e-05, "loss": 0.2566, "step": 2102 }, { "epoch": 1.4747545582047685, "grad_norm": 0.31606337428092957, "learning_rate": 4.0386673124878105e-05, "loss": 0.2509, "step": 2103 }, { "epoch": 1.4754558204768584, "grad_norm": 0.31491971015930176, "learning_rate": 4.0377915997217153e-05, "loss": 0.25, "step": 2104 }, { "epoch": 1.476157082748948, "grad_norm": 0.2513245940208435, "learning_rate": 4.036915583316031e-05, "loss": 0.0857, "step": 2105 }, { "epoch": 1.476858345021038, "grad_norm": 0.25839763879776, "learning_rate": 4.036039263443729e-05, "loss": 0.0875, "step": 2106 }, { "epoch": 1.4775596072931276, "grad_norm": 0.2993863523006439, "learning_rate": 4.0351626402778396e-05, "loss": 0.2531, "step": 2107 }, { "epoch": 1.4782608695652173, "grad_norm": 0.24510283768177032, "learning_rate": 4.034285713991454e-05, "loss": 0.0896, "step": 2108 }, { "epoch": 1.4789621318373072, "grad_norm": 0.2991485595703125, "learning_rate": 4.0334084847577224e-05, "loss": 0.0877, "step": 2109 }, { "epoch": 1.4796633941093968, "grad_norm": 0.24728618562221527, "learning_rate": 4.032530952749855e-05, "loss": 0.0904, "step": 2110 }, { "epoch": 1.4803646563814867, "grad_norm": 0.2875675857067108, "learning_rate": 4.031653118141123e-05, "loss": 0.2556, "step": 2111 }, { "epoch": 1.4810659186535764, "grad_norm": 0.24655497074127197, "learning_rate": 4.0307749811048556e-05, "loss": 0.0904, "step": 2112 }, { "epoch": 1.481767180925666, "grad_norm": 0.26288124918937683, "learning_rate": 4.029896541814443e-05, "loss": 0.09, "step": 2113 }, { "epoch": 1.482468443197756, "grad_norm": 0.2745910882949829, "learning_rate": 4.0290178004433346e-05, "loss": 0.2514, "step": 2114 }, { "epoch": 1.4831697054698458, "grad_norm": 0.2592353820800781, "learning_rate": 4.0281387571650374e-05, "loss": 0.0896, "step": 2115 }, { "epoch": 1.4838709677419355, "grad_norm": 0.25578710436820984, "learning_rate": 4.027259412153122e-05, "loss": 0.0883, "step": 2116 }, { "epoch": 1.4845722300140252, "grad_norm": 0.2776719033718109, "learning_rate": 4.026379765581216e-05, "loss": 0.087, "step": 2117 }, { "epoch": 1.485273492286115, "grad_norm": 0.24473948776721954, "learning_rate": 4.0254998176230085e-05, "loss": 0.0901, "step": 2118 }, { "epoch": 1.4859747545582047, "grad_norm": 0.2550821900367737, "learning_rate": 4.024619568452244e-05, "loss": 0.0882, "step": 2119 }, { "epoch": 1.4866760168302946, "grad_norm": 0.27797260880470276, "learning_rate": 4.023739018242732e-05, "loss": 0.2539, "step": 2120 }, { "epoch": 1.4873772791023843, "grad_norm": 0.27749398350715637, "learning_rate": 4.022858167168336e-05, "loss": 0.2517, "step": 2121 }, { "epoch": 1.488078541374474, "grad_norm": 0.25040188431739807, "learning_rate": 4.021977015402983e-05, "loss": 0.0865, "step": 2122 }, { "epoch": 1.4887798036465638, "grad_norm": 0.243833988904953, "learning_rate": 4.021095563120659e-05, "loss": 0.0897, "step": 2123 }, { "epoch": 1.4894810659186537, "grad_norm": 0.2655108869075775, "learning_rate": 4.020213810495406e-05, "loss": 0.0839, "step": 2124 }, { "epoch": 1.4901823281907434, "grad_norm": 0.2716559171676636, "learning_rate": 4.01933175770133e-05, "loss": 0.0836, "step": 2125 }, { "epoch": 1.490883590462833, "grad_norm": 0.2858564257621765, "learning_rate": 4.018449404912591e-05, "loss": 0.2543, "step": 2126 }, { "epoch": 1.491584852734923, "grad_norm": 0.24227552115917206, "learning_rate": 4.0175667523034136e-05, "loss": 0.0891, "step": 2127 }, { "epoch": 1.4922861150070126, "grad_norm": 0.2847979664802551, "learning_rate": 4.016683800048077e-05, "loss": 0.2535, "step": 2128 }, { "epoch": 1.4929873772791025, "grad_norm": 0.24033360183238983, "learning_rate": 4.0158005483209234e-05, "loss": 0.0881, "step": 2129 }, { "epoch": 1.4936886395511921, "grad_norm": 0.7888100743293762, "learning_rate": 4.014916997296352e-05, "loss": 0.4226, "step": 2130 }, { "epoch": 1.4943899018232818, "grad_norm": 0.24011127650737762, "learning_rate": 4.01403314714882e-05, "loss": 0.0879, "step": 2131 }, { "epoch": 1.4950911640953717, "grad_norm": 0.30793288350105286, "learning_rate": 4.013148998052847e-05, "loss": 0.2497, "step": 2132 }, { "epoch": 1.4957924263674613, "grad_norm": 0.24275735020637512, "learning_rate": 4.0122645501830095e-05, "loss": 0.0839, "step": 2133 }, { "epoch": 1.4964936886395512, "grad_norm": 1.9051650762557983, "learning_rate": 4.0113798037139416e-05, "loss": 0.5174, "step": 2134 }, { "epoch": 1.497194950911641, "grad_norm": 0.3010873794555664, "learning_rate": 4.01049475882034e-05, "loss": 0.2526, "step": 2135 }, { "epoch": 1.4978962131837308, "grad_norm": 0.24105344712734222, "learning_rate": 4.009609415676957e-05, "loss": 0.0886, "step": 2136 }, { "epoch": 1.4985974754558204, "grad_norm": 1.8420761823654175, "learning_rate": 4.008723774458605e-05, "loss": 0.4908, "step": 2137 }, { "epoch": 1.4992987377279103, "grad_norm": 0.23994417488574982, "learning_rate": 4.007837835340157e-05, "loss": 0.0883, "step": 2138 }, { "epoch": 1.5, "grad_norm": 0.7768651843070984, "learning_rate": 4.006951598496542e-05, "loss": 0.4149, "step": 2139 }, { "epoch": 1.5007012622720897, "grad_norm": 0.27877935767173767, "learning_rate": 4.006065064102749e-05, "loss": 0.2536, "step": 2140 }, { "epoch": 1.5014025245441796, "grad_norm": 0.7793298959732056, "learning_rate": 4.005178232333826e-05, "loss": 0.42, "step": 2141 }, { "epoch": 1.5021037868162694, "grad_norm": 0.29519158601760864, "learning_rate": 4.00429110336488e-05, "loss": 0.2502, "step": 2142 }, { "epoch": 1.502805049088359, "grad_norm": 2.014536142349243, "learning_rate": 4.003403677371075e-05, "loss": 0.4633, "step": 2143 }, { "epoch": 1.5035063113604488, "grad_norm": 1.4261164665222168, "learning_rate": 4.002515954527635e-05, "loss": 0.44, "step": 2144 }, { "epoch": 1.5042075736325384, "grad_norm": 0.278766006231308, "learning_rate": 4.001627935009842e-05, "loss": 0.0899, "step": 2145 }, { "epoch": 1.5049088359046283, "grad_norm": 0.26956769824028015, "learning_rate": 4.000739618993039e-05, "loss": 0.0938, "step": 2146 }, { "epoch": 1.5056100981767182, "grad_norm": 0.3313373625278473, "learning_rate": 3.999851006652623e-05, "loss": 0.0956, "step": 2147 }, { "epoch": 1.5063113604488079, "grad_norm": 0.2593673765659332, "learning_rate": 3.998962098164054e-05, "loss": 0.2504, "step": 2148 }, { "epoch": 1.5070126227208975, "grad_norm": 1.1841387748718262, "learning_rate": 3.9980728937028475e-05, "loss": 0.3936, "step": 2149 }, { "epoch": 1.5077138849929874, "grad_norm": 0.2650699019432068, "learning_rate": 3.997183393444579e-05, "loss": 0.2537, "step": 2150 }, { "epoch": 1.5084151472650773, "grad_norm": 0.25597089529037476, "learning_rate": 3.996293597564881e-05, "loss": 0.0945, "step": 2151 }, { "epoch": 1.509116409537167, "grad_norm": 0.2477423995733261, "learning_rate": 3.995403506239446e-05, "loss": 0.2518, "step": 2152 }, { "epoch": 1.5098176718092566, "grad_norm": 0.25597700476646423, "learning_rate": 3.994513119644024e-05, "loss": 0.2524, "step": 2153 }, { "epoch": 1.5105189340813463, "grad_norm": 0.24504631757736206, "learning_rate": 3.9936224379544226e-05, "loss": 0.2453, "step": 2154 }, { "epoch": 1.5112201963534362, "grad_norm": 0.28929975628852844, "learning_rate": 3.9927314613465094e-05, "loss": 0.0994, "step": 2155 }, { "epoch": 1.511921458625526, "grad_norm": 0.2624632716178894, "learning_rate": 3.991840189996209e-05, "loss": 0.0967, "step": 2156 }, { "epoch": 1.5126227208976157, "grad_norm": 0.29909980297088623, "learning_rate": 3.990948624079504e-05, "loss": 0.1011, "step": 2157 }, { "epoch": 1.5133239831697054, "grad_norm": 0.24799995124340057, "learning_rate": 3.990056763772435e-05, "loss": 0.2515, "step": 2158 }, { "epoch": 1.5140252454417953, "grad_norm": 0.26268264651298523, "learning_rate": 3.9891646092511036e-05, "loss": 0.0965, "step": 2159 }, { "epoch": 1.514726507713885, "grad_norm": 0.26442134380340576, "learning_rate": 3.988272160691665e-05, "loss": 0.0975, "step": 2160 }, { "epoch": 1.5154277699859748, "grad_norm": 0.2557312846183777, "learning_rate": 3.987379418270336e-05, "loss": 0.2505, "step": 2161 }, { "epoch": 1.5161290322580645, "grad_norm": 0.2990044355392456, "learning_rate": 3.98648638216339e-05, "loss": 0.1008, "step": 2162 }, { "epoch": 1.5168302945301542, "grad_norm": 0.2662411630153656, "learning_rate": 3.985593052547159e-05, "loss": 0.0974, "step": 2163 }, { "epoch": 1.517531556802244, "grad_norm": 0.24913959205150604, "learning_rate": 3.98469942959803e-05, "loss": 0.2525, "step": 2164 }, { "epoch": 1.518232819074334, "grad_norm": 0.2650552988052368, "learning_rate": 3.9838055134924526e-05, "loss": 0.097, "step": 2165 }, { "epoch": 1.5189340813464236, "grad_norm": 0.8519802093505859, "learning_rate": 3.982911304406931e-05, "loss": 0.2256, "step": 2166 }, { "epoch": 1.5196353436185133, "grad_norm": 0.7242515683174133, "learning_rate": 3.9820168025180305e-05, "loss": 0.2012, "step": 2167 }, { "epoch": 1.520336605890603, "grad_norm": 0.2446037381887436, "learning_rate": 3.981122008002369e-05, "loss": 0.2536, "step": 2168 }, { "epoch": 1.5210378681626928, "grad_norm": 0.26746657490730286, "learning_rate": 3.980226921036627e-05, "loss": 0.0983, "step": 2169 }, { "epoch": 1.5217391304347827, "grad_norm": 0.2659359574317932, "learning_rate": 3.979331541797541e-05, "loss": 0.0977, "step": 2170 }, { "epoch": 1.5224403927068724, "grad_norm": 0.539145290851593, "learning_rate": 3.9784358704619045e-05, "loss": 0.1235, "step": 2171 }, { "epoch": 1.523141654978962, "grad_norm": 0.2668910026550293, "learning_rate": 3.9775399072065695e-05, "loss": 0.098, "step": 2172 }, { "epoch": 1.523842917251052, "grad_norm": 0.3134414553642273, "learning_rate": 3.976643652208445e-05, "loss": 0.1036, "step": 2173 }, { "epoch": 1.5245441795231418, "grad_norm": 0.3488001823425293, "learning_rate": 3.9757471056444996e-05, "loss": 0.107, "step": 2174 }, { "epoch": 1.5252454417952315, "grad_norm": 0.2646876275539398, "learning_rate": 3.974850267691757e-05, "loss": 0.0976, "step": 2175 }, { "epoch": 1.5259467040673211, "grad_norm": 0.261659175157547, "learning_rate": 3.973953138527299e-05, "loss": 0.096, "step": 2176 }, { "epoch": 1.5266479663394108, "grad_norm": 0.2990657687187195, "learning_rate": 3.9730557183282654e-05, "loss": 0.0999, "step": 2177 }, { "epoch": 1.5273492286115007, "grad_norm": 0.2585776448249817, "learning_rate": 3.972158007271853e-05, "loss": 0.0953, "step": 2178 }, { "epoch": 1.5280504908835906, "grad_norm": 0.2569161355495453, "learning_rate": 3.9712600055353186e-05, "loss": 0.0947, "step": 2179 }, { "epoch": 1.5287517531556802, "grad_norm": 0.34878113865852356, "learning_rate": 3.970361713295973e-05, "loss": 0.1031, "step": 2180 }, { "epoch": 1.52945301542777, "grad_norm": 0.3187731206417084, "learning_rate": 3.969463130731183e-05, "loss": 0.1001, "step": 2181 }, { "epoch": 1.5301542776998598, "grad_norm": 0.25321316719055176, "learning_rate": 3.968564258018378e-05, "loss": 0.093, "step": 2182 }, { "epoch": 1.5308555399719497, "grad_norm": 0.7541787624359131, "learning_rate": 3.9676650953350425e-05, "loss": 0.4134, "step": 2183 }, { "epoch": 1.5315568022440393, "grad_norm": 0.27451619505882263, "learning_rate": 3.9667656428587165e-05, "loss": 0.0942, "step": 2184 }, { "epoch": 1.532258064516129, "grad_norm": 0.28458282351493835, "learning_rate": 3.965865900766997e-05, "loss": 0.2559, "step": 2185 }, { "epoch": 1.5329593267882187, "grad_norm": 0.917009711265564, "learning_rate": 3.964965869237542e-05, "loss": 0.2294, "step": 2186 }, { "epoch": 1.5336605890603086, "grad_norm": 0.283390611410141, "learning_rate": 3.964065548448065e-05, "loss": 0.258, "step": 2187 }, { "epoch": 1.5343618513323984, "grad_norm": 1.9149168729782104, "learning_rate": 3.963164938576332e-05, "loss": 0.3973, "step": 2188 }, { "epoch": 1.535063113604488, "grad_norm": 0.27191561460494995, "learning_rate": 3.962264039800173e-05, "loss": 0.2521, "step": 2189 }, { "epoch": 1.5357643758765778, "grad_norm": 0.26718395948410034, "learning_rate": 3.961362852297472e-05, "loss": 0.2536, "step": 2190 }, { "epoch": 1.5364656381486677, "grad_norm": 0.7980940937995911, "learning_rate": 3.960461376246169e-05, "loss": 0.2166, "step": 2191 }, { "epoch": 1.5371669004207573, "grad_norm": 0.28447139263153076, "learning_rate": 3.9595596118242614e-05, "loss": 0.0974, "step": 2192 }, { "epoch": 1.5378681626928472, "grad_norm": 0.25615188479423523, "learning_rate": 3.958657559209806e-05, "loss": 0.2566, "step": 2193 }, { "epoch": 1.5385694249649369, "grad_norm": 0.2933798134326935, "learning_rate": 3.957755218580913e-05, "loss": 0.0994, "step": 2194 }, { "epoch": 1.5392706872370265, "grad_norm": 0.25405237078666687, "learning_rate": 3.956852590115753e-05, "loss": 0.0939, "step": 2195 }, { "epoch": 1.5399719495091164, "grad_norm": 0.3763245940208435, "learning_rate": 3.955949673992549e-05, "loss": 0.1104, "step": 2196 }, { "epoch": 1.5406732117812063, "grad_norm": 0.25400835275650024, "learning_rate": 3.955046470389585e-05, "loss": 0.094, "step": 2197 }, { "epoch": 1.541374474053296, "grad_norm": 0.2921596169471741, "learning_rate": 3.9541429794852004e-05, "loss": 0.0992, "step": 2198 }, { "epoch": 1.5420757363253856, "grad_norm": 0.2574375569820404, "learning_rate": 3.953239201457789e-05, "loss": 0.2495, "step": 2199 }, { "epoch": 1.5427769985974753, "grad_norm": 0.27753257751464844, "learning_rate": 3.9523351364858054e-05, "loss": 0.2594, "step": 2200 }, { "epoch": 1.5434782608695652, "grad_norm": 0.36490336060523987, "learning_rate": 3.951430784747758e-05, "loss": 0.1077, "step": 2201 }, { "epoch": 1.544179523141655, "grad_norm": 0.25659871101379395, "learning_rate": 3.950526146422213e-05, "loss": 0.2488, "step": 2202 }, { "epoch": 1.5448807854137447, "grad_norm": 0.2609190344810486, "learning_rate": 3.949621221687792e-05, "loss": 0.2505, "step": 2203 }, { "epoch": 1.5455820476858344, "grad_norm": 0.261858731508255, "learning_rate": 3.948716010723175e-05, "loss": 0.2522, "step": 2204 }, { "epoch": 1.5462833099579243, "grad_norm": 0.2634364068508148, "learning_rate": 3.947810513707096e-05, "loss": 0.2577, "step": 2205 }, { "epoch": 1.5469845722300142, "grad_norm": 0.29834258556365967, "learning_rate": 3.9469047308183485e-05, "loss": 0.0987, "step": 2206 }, { "epoch": 1.5476858345021038, "grad_norm": 0.3044244050979614, "learning_rate": 3.94599866223578e-05, "loss": 0.1006, "step": 2207 }, { "epoch": 1.5483870967741935, "grad_norm": 0.7442694306373596, "learning_rate": 3.9450923081382954e-05, "loss": 0.4103, "step": 2208 }, { "epoch": 1.5490883590462832, "grad_norm": 0.2911306321620941, "learning_rate": 3.944185668704856e-05, "loss": 0.0989, "step": 2209 }, { "epoch": 1.549789621318373, "grad_norm": 0.2518026828765869, "learning_rate": 3.943278744114479e-05, "loss": 0.0933, "step": 2210 }, { "epoch": 1.550490883590463, "grad_norm": 0.25254371762275696, "learning_rate": 3.942371534546239e-05, "loss": 0.0933, "step": 2211 }, { "epoch": 1.5511921458625526, "grad_norm": 0.25867241621017456, "learning_rate": 3.941464040179266e-05, "loss": 0.2498, "step": 2212 }, { "epoch": 1.5518934081346423, "grad_norm": 0.29166993498802185, "learning_rate": 3.9405562611927444e-05, "loss": 0.0981, "step": 2213 }, { "epoch": 1.5525946704067322, "grad_norm": 0.25246548652648926, "learning_rate": 3.9396481977659186e-05, "loss": 0.0933, "step": 2214 }, { "epoch": 1.553295932678822, "grad_norm": 1.0245563983917236, "learning_rate": 3.9387398500780884e-05, "loss": 0.3587, "step": 2215 }, { "epoch": 1.5539971949509117, "grad_norm": 0.2501509487628937, "learning_rate": 3.937831218308605e-05, "loss": 0.0928, "step": 2216 }, { "epoch": 1.5546984572230014, "grad_norm": 0.2656465470790863, "learning_rate": 3.9369223026368825e-05, "loss": 0.2509, "step": 2217 }, { "epoch": 1.555399719495091, "grad_norm": 0.25252652168273926, "learning_rate": 3.9360131032423867e-05, "loss": 0.0933, "step": 2218 }, { "epoch": 1.556100981767181, "grad_norm": 0.28683343529701233, "learning_rate": 3.93510362030464e-05, "loss": 0.2583, "step": 2219 }, { "epoch": 1.5568022440392708, "grad_norm": 0.3089422881603241, "learning_rate": 3.934193854003223e-05, "loss": 0.2648, "step": 2220 }, { "epoch": 1.5575035063113605, "grad_norm": 0.25361326336860657, "learning_rate": 3.933283804517769e-05, "loss": 0.0938, "step": 2221 }, { "epoch": 1.5582047685834501, "grad_norm": 0.2997463047504425, "learning_rate": 3.9323734720279695e-05, "loss": 0.1002, "step": 2222 }, { "epoch": 1.55890603085554, "grad_norm": 0.26068463921546936, "learning_rate": 3.9314628567135715e-05, "loss": 0.2493, "step": 2223 }, { "epoch": 1.5596072931276297, "grad_norm": 0.252510666847229, "learning_rate": 3.930551958754376e-05, "loss": 0.0934, "step": 2224 }, { "epoch": 1.5603085553997196, "grad_norm": 0.30144768953323364, "learning_rate": 3.929640778330244e-05, "loss": 0.2632, "step": 2225 }, { "epoch": 1.5610098176718092, "grad_norm": 0.28887906670570374, "learning_rate": 3.928729315621087e-05, "loss": 0.0976, "step": 2226 }, { "epoch": 1.561711079943899, "grad_norm": 0.2592410445213318, "learning_rate": 3.9278175708068774e-05, "loss": 0.2494, "step": 2227 }, { "epoch": 1.5624123422159888, "grad_norm": 0.25830215215682983, "learning_rate": 3.926905544067638e-05, "loss": 0.2495, "step": 2228 }, { "epoch": 1.5631136044880787, "grad_norm": 0.24950061738491058, "learning_rate": 3.925993235583453e-05, "loss": 0.0926, "step": 2229 }, { "epoch": 1.5638148667601683, "grad_norm": 0.25050556659698486, "learning_rate": 3.925080645534457e-05, "loss": 0.0927, "step": 2230 }, { "epoch": 1.564516129032258, "grad_norm": 0.24958568811416626, "learning_rate": 3.924167774100843e-05, "loss": 0.0923, "step": 2231 }, { "epoch": 1.5652173913043477, "grad_norm": 0.32020339369773865, "learning_rate": 3.9232546214628594e-05, "loss": 0.0991, "step": 2232 }, { "epoch": 1.5659186535764376, "grad_norm": 0.2484491616487503, "learning_rate": 3.922341187800811e-05, "loss": 0.0918, "step": 2233 }, { "epoch": 1.5666199158485274, "grad_norm": 0.2780219316482544, "learning_rate": 3.921427473295054e-05, "loss": 0.0944, "step": 2234 }, { "epoch": 1.567321178120617, "grad_norm": 0.27306824922561646, "learning_rate": 3.920513478126005e-05, "loss": 0.2548, "step": 2235 }, { "epoch": 1.5680224403927068, "grad_norm": 0.2738950848579407, "learning_rate": 3.9195992024741324e-05, "loss": 0.093, "step": 2236 }, { "epoch": 1.5687237026647967, "grad_norm": 0.28765398263931274, "learning_rate": 3.918684646519963e-05, "loss": 0.2579, "step": 2237 }, { "epoch": 1.5694249649368865, "grad_norm": 0.24332071840763092, "learning_rate": 3.9177698104440766e-05, "loss": 0.0901, "step": 2238 }, { "epoch": 1.5701262272089762, "grad_norm": 0.27449116110801697, "learning_rate": 3.91685469442711e-05, "loss": 0.2533, "step": 2239 }, { "epoch": 1.5708274894810659, "grad_norm": 0.26406389474868774, "learning_rate": 3.915939298649753e-05, "loss": 0.0907, "step": 2240 }, { "epoch": 1.5715287517531555, "grad_norm": 1.1362838745117188, "learning_rate": 3.915023623292754e-05, "loss": 0.2446, "step": 2241 }, { "epoch": 1.5722300140252454, "grad_norm": 0.29112550616264343, "learning_rate": 3.9141076685369125e-05, "loss": 0.0908, "step": 2242 }, { "epoch": 1.5729312762973353, "grad_norm": 0.289530485868454, "learning_rate": 3.913191434563087e-05, "loss": 0.2567, "step": 2243 }, { "epoch": 1.573632538569425, "grad_norm": 0.2629472315311432, "learning_rate": 3.912274921552189e-05, "loss": 0.0907, "step": 2244 }, { "epoch": 1.5743338008415146, "grad_norm": 0.24276921153068542, "learning_rate": 3.911358129685184e-05, "loss": 0.0899, "step": 2245 }, { "epoch": 1.5750350631136045, "grad_norm": 0.2957235872745514, "learning_rate": 3.9104410591430976e-05, "loss": 0.0927, "step": 2246 }, { "epoch": 1.5757363253856944, "grad_norm": 0.2761714458465576, "learning_rate": 3.909523710107004e-05, "loss": 0.2528, "step": 2247 }, { "epoch": 1.576437587657784, "grad_norm": 0.2988482415676117, "learning_rate": 3.908606082758035e-05, "loss": 0.2537, "step": 2248 }, { "epoch": 1.5771388499298737, "grad_norm": 0.7750107049942017, "learning_rate": 3.90768817727738e-05, "loss": 0.4208, "step": 2249 }, { "epoch": 1.5778401122019634, "grad_norm": 0.24056494235992432, "learning_rate": 3.9067699938462804e-05, "loss": 0.0889, "step": 2250 }, { "epoch": 1.5785413744740533, "grad_norm": 0.3206344246864319, "learning_rate": 3.905851532646032e-05, "loss": 0.0897, "step": 2251 }, { "epoch": 1.5792426367461432, "grad_norm": 0.7739734649658203, "learning_rate": 3.9049327938579875e-05, "loss": 0.4209, "step": 2252 }, { "epoch": 1.5799438990182328, "grad_norm": 0.24191449582576752, "learning_rate": 3.904013777663553e-05, "loss": 0.0892, "step": 2253 }, { "epoch": 1.5806451612903225, "grad_norm": 1.2687175273895264, "learning_rate": 3.90309448424419e-05, "loss": 0.5872, "step": 2254 }, { "epoch": 1.5813464235624124, "grad_norm": 1.3067768812179565, "learning_rate": 3.902174913781414e-05, "loss": 0.4161, "step": 2255 }, { "epoch": 1.582047685834502, "grad_norm": 0.2423582226037979, "learning_rate": 3.901255066456797e-05, "loss": 0.0895, "step": 2256 }, { "epoch": 1.582748948106592, "grad_norm": 0.7662090063095093, "learning_rate": 3.900334942451963e-05, "loss": 0.419, "step": 2257 }, { "epoch": 1.5834502103786816, "grad_norm": 0.26799434423446655, "learning_rate": 3.8994145419485935e-05, "loss": 0.2497, "step": 2258 }, { "epoch": 1.5841514726507713, "grad_norm": 0.27292075753211975, "learning_rate": 3.898493865128422e-05, "loss": 0.0934, "step": 2259 }, { "epoch": 1.5848527349228612, "grad_norm": 1.2496873140335083, "learning_rate": 3.897572912173238e-05, "loss": 0.5751, "step": 2260 }, { "epoch": 1.585553997194951, "grad_norm": 0.26979315280914307, "learning_rate": 3.896651683264886e-05, "loss": 0.252, "step": 2261 }, { "epoch": 1.5862552594670407, "grad_norm": 0.3139861226081848, "learning_rate": 3.895730178585263e-05, "loss": 0.0955, "step": 2262 }, { "epoch": 1.5869565217391304, "grad_norm": 0.26708847284317017, "learning_rate": 3.8948083983163225e-05, "loss": 0.2504, "step": 2263 }, { "epoch": 1.58765778401122, "grad_norm": 0.28017091751098633, "learning_rate": 3.8938863426400715e-05, "loss": 0.0954, "step": 2264 }, { "epoch": 1.58835904628331, "grad_norm": 0.25559327006340027, "learning_rate": 3.892964011738571e-05, "loss": 0.2509, "step": 2265 }, { "epoch": 1.5890603085553998, "grad_norm": 0.28447529673576355, "learning_rate": 3.8920414057939384e-05, "loss": 0.0964, "step": 2266 }, { "epoch": 1.5897615708274895, "grad_norm": 0.2848474681377411, "learning_rate": 3.891118524988342e-05, "loss": 0.0963, "step": 2267 }, { "epoch": 1.5904628330995791, "grad_norm": 0.2873401641845703, "learning_rate": 3.890195369504006e-05, "loss": 0.0948, "step": 2268 }, { "epoch": 1.591164095371669, "grad_norm": 0.26477891206741333, "learning_rate": 3.8892719395232105e-05, "loss": 0.2492, "step": 2269 }, { "epoch": 1.591865357643759, "grad_norm": 0.4410552978515625, "learning_rate": 3.888348235228287e-05, "loss": 0.099, "step": 2270 }, { "epoch": 1.5925666199158486, "grad_norm": 0.31063157320022583, "learning_rate": 3.887424256801624e-05, "loss": 0.0945, "step": 2271 }, { "epoch": 1.5932678821879382, "grad_norm": 0.2562982738018036, "learning_rate": 3.886500004425662e-05, "loss": 0.249, "step": 2272 }, { "epoch": 1.593969144460028, "grad_norm": 0.27826476097106934, "learning_rate": 3.885575478282894e-05, "loss": 0.0935, "step": 2273 }, { "epoch": 1.5946704067321178, "grad_norm": 0.2529725432395935, "learning_rate": 3.8846506785558736e-05, "loss": 0.0935, "step": 2274 }, { "epoch": 1.5953716690042077, "grad_norm": 0.26347896456718445, "learning_rate": 3.883725605427201e-05, "loss": 0.2515, "step": 2275 }, { "epoch": 1.5960729312762973, "grad_norm": 0.2621895670890808, "learning_rate": 3.8828002590795344e-05, "loss": 0.2498, "step": 2276 }, { "epoch": 1.596774193548387, "grad_norm": 0.27900898456573486, "learning_rate": 3.881874639695584e-05, "loss": 0.0932, "step": 2277 }, { "epoch": 1.597475455820477, "grad_norm": 0.25436538457870483, "learning_rate": 3.880948747458117e-05, "loss": 0.0931, "step": 2278 }, { "epoch": 1.5981767180925668, "grad_norm": 0.7727996110916138, "learning_rate": 3.88002258254995e-05, "loss": 0.4152, "step": 2279 }, { "epoch": 1.5988779803646564, "grad_norm": 0.25149011611938477, "learning_rate": 3.8790961451539585e-05, "loss": 0.0922, "step": 2280 }, { "epoch": 1.5995792426367461, "grad_norm": 0.2529125213623047, "learning_rate": 3.8781694354530673e-05, "loss": 0.0925, "step": 2281 }, { "epoch": 1.6002805049088358, "grad_norm": 0.27275747060775757, "learning_rate": 3.8772424536302564e-05, "loss": 0.0916, "step": 2282 }, { "epoch": 1.6009817671809257, "grad_norm": 0.26830190420150757, "learning_rate": 3.876315199868561e-05, "loss": 0.0907, "step": 2283 }, { "epoch": 1.6016830294530155, "grad_norm": 1.1401536464691162, "learning_rate": 3.87538767435107e-05, "loss": 0.2499, "step": 2284 }, { "epoch": 1.6023842917251052, "grad_norm": 0.25086838006973267, "learning_rate": 3.874459877260923e-05, "loss": 0.0917, "step": 2285 }, { "epoch": 1.6030855539971949, "grad_norm": 0.27577632665634155, "learning_rate": 3.873531808781315e-05, "loss": 0.2504, "step": 2286 }, { "epoch": 1.6037868162692848, "grad_norm": 0.2799355983734131, "learning_rate": 3.872603469095496e-05, "loss": 0.2479, "step": 2287 }, { "epoch": 1.6044880785413744, "grad_norm": 0.29574277997016907, "learning_rate": 3.871674858386768e-05, "loss": 0.0881, "step": 2288 }, { "epoch": 1.6051893408134643, "grad_norm": 0.25113701820373535, "learning_rate": 3.870745976838486e-05, "loss": 0.0921, "step": 2289 }, { "epoch": 1.605890603085554, "grad_norm": 0.280291885137558, "learning_rate": 3.86981682463406e-05, "loss": 0.2483, "step": 2290 }, { "epoch": 1.6065918653576436, "grad_norm": 0.265438973903656, "learning_rate": 3.868887401956952e-05, "loss": 0.0899, "step": 2291 }, { "epoch": 1.6072931276297335, "grad_norm": 0.2650667130947113, "learning_rate": 3.867957708990679e-05, "loss": 0.0891, "step": 2292 }, { "epoch": 1.6079943899018234, "grad_norm": 0.7690279483795166, "learning_rate": 3.8670277459188106e-05, "loss": 0.4124, "step": 2293 }, { "epoch": 1.608695652173913, "grad_norm": 1.8419525623321533, "learning_rate": 3.8660975129249685e-05, "loss": 0.4508, "step": 2294 }, { "epoch": 1.6093969144460027, "grad_norm": 0.3418840765953064, "learning_rate": 3.86516701019283e-05, "loss": 0.0857, "step": 2295 }, { "epoch": 1.6100981767180924, "grad_norm": 0.7611752152442932, "learning_rate": 3.8642362379061234e-05, "loss": 0.4107, "step": 2296 }, { "epoch": 1.6107994389901823, "grad_norm": 0.27202120423316956, "learning_rate": 3.863305196248631e-05, "loss": 0.091, "step": 2297 }, { "epoch": 1.6115007012622722, "grad_norm": 0.2739139497280121, "learning_rate": 3.86237388540419e-05, "loss": 0.2515, "step": 2298 }, { "epoch": 1.6122019635343618, "grad_norm": 0.27541273832321167, "learning_rate": 3.861442305556688e-05, "loss": 0.091, "step": 2299 }, { "epoch": 1.6129032258064515, "grad_norm": 0.7493345737457275, "learning_rate": 3.8605104568900685e-05, "loss": 0.4083, "step": 2300 }, { "epoch": 1.6136044880785414, "grad_norm": 0.2562761604785919, "learning_rate": 3.859578339588326e-05, "loss": 0.0934, "step": 2301 }, { "epoch": 1.6143057503506313, "grad_norm": 0.274596244096756, "learning_rate": 3.858645953835507e-05, "loss": 0.0911, "step": 2302 }, { "epoch": 1.615007012622721, "grad_norm": 0.27097782492637634, "learning_rate": 3.857713299815715e-05, "loss": 0.2528, "step": 2303 }, { "epoch": 1.6157082748948106, "grad_norm": 0.323472261428833, "learning_rate": 3.8567803777131027e-05, "loss": 0.0914, "step": 2304 }, { "epoch": 1.6164095371669003, "grad_norm": 0.28179916739463806, "learning_rate": 3.8558471877118786e-05, "loss": 0.0942, "step": 2305 }, { "epoch": 1.6171107994389902, "grad_norm": 0.27056458592414856, "learning_rate": 3.854913729996301e-05, "loss": 0.2482, "step": 2306 }, { "epoch": 1.61781206171108, "grad_norm": 0.2666454315185547, "learning_rate": 3.853980004750684e-05, "loss": 0.252, "step": 2307 }, { "epoch": 1.6185133239831697, "grad_norm": 0.2762633264064789, "learning_rate": 3.8530460121593914e-05, "loss": 0.0897, "step": 2308 }, { "epoch": 1.6192145862552594, "grad_norm": 0.27904683351516724, "learning_rate": 3.8521117524068445e-05, "loss": 0.0918, "step": 2309 }, { "epoch": 1.6199158485273493, "grad_norm": 0.303520143032074, "learning_rate": 3.851177225677513e-05, "loss": 0.2491, "step": 2310 }, { "epoch": 1.6206171107994392, "grad_norm": 0.2689781188964844, "learning_rate": 3.85024243215592e-05, "loss": 0.2485, "step": 2311 }, { "epoch": 1.6213183730715288, "grad_norm": 0.27728700637817383, "learning_rate": 3.849307372026643e-05, "loss": 0.0909, "step": 2312 }, { "epoch": 1.6220196353436185, "grad_norm": 0.7580780386924744, "learning_rate": 3.84837204547431e-05, "loss": 0.4044, "step": 2313 }, { "epoch": 1.6227208976157081, "grad_norm": 0.27867722511291504, "learning_rate": 3.847436452683605e-05, "loss": 0.0916, "step": 2314 }, { "epoch": 1.623422159887798, "grad_norm": 0.27752572298049927, "learning_rate": 3.846500593839262e-05, "loss": 0.2538, "step": 2315 }, { "epoch": 1.624123422159888, "grad_norm": 0.3010219633579254, "learning_rate": 3.845564469126066e-05, "loss": 0.0863, "step": 2316 }, { "epoch": 1.6248246844319776, "grad_norm": 0.272114098072052, "learning_rate": 3.844628078728858e-05, "loss": 0.0899, "step": 2317 }, { "epoch": 1.6255259467040672, "grad_norm": 0.29275357723236084, "learning_rate": 3.8436914228325296e-05, "loss": 0.0846, "step": 2318 }, { "epoch": 1.6262272089761571, "grad_norm": 0.2545571029186249, "learning_rate": 3.8427545016220255e-05, "loss": 0.0923, "step": 2319 }, { "epoch": 1.6269284712482468, "grad_norm": 0.26934224367141724, "learning_rate": 3.841817315282342e-05, "loss": 0.0882, "step": 2320 }, { "epoch": 1.6276297335203367, "grad_norm": 0.26511111855506897, "learning_rate": 3.8408798639985276e-05, "loss": 0.0878, "step": 2321 }, { "epoch": 1.6283309957924264, "grad_norm": 0.2766767740249634, "learning_rate": 3.839942147955684e-05, "loss": 0.2495, "step": 2322 }, { "epoch": 1.629032258064516, "grad_norm": 0.25879207253456116, "learning_rate": 3.839004167338966e-05, "loss": 0.0856, "step": 2323 }, { "epoch": 1.629733520336606, "grad_norm": 0.2540234923362732, "learning_rate": 3.838065922333578e-05, "loss": 0.0912, "step": 2324 }, { "epoch": 1.6304347826086958, "grad_norm": 0.2874945402145386, "learning_rate": 3.837127413124778e-05, "loss": 0.2526, "step": 2325 }, { "epoch": 1.6311360448807855, "grad_norm": 0.30172988772392273, "learning_rate": 3.836188639897878e-05, "loss": 0.2455, "step": 2326 }, { "epoch": 1.6318373071528751, "grad_norm": 0.8014464974403381, "learning_rate": 3.8352496028382385e-05, "loss": 0.4114, "step": 2327 }, { "epoch": 1.6325385694249648, "grad_norm": 0.7974407076835632, "learning_rate": 3.834310302131274e-05, "loss": 0.41, "step": 2328 }, { "epoch": 1.6332398316970547, "grad_norm": 0.31164035201072693, "learning_rate": 3.833370737962453e-05, "loss": 0.2567, "step": 2329 }, { "epoch": 1.6339410939691446, "grad_norm": 3.1609301567077637, "learning_rate": 3.832430910517293e-05, "loss": 0.5764, "step": 2330 }, { "epoch": 1.6346423562412342, "grad_norm": 2.8725521564483643, "learning_rate": 3.831490819981364e-05, "loss": 0.3998, "step": 2331 }, { "epoch": 1.6353436185133239, "grad_norm": 0.26580455899238586, "learning_rate": 3.830550466540288e-05, "loss": 0.0788, "step": 2332 }, { "epoch": 1.6360448807854138, "grad_norm": 0.2923280894756317, "learning_rate": 3.8296098503797405e-05, "loss": 0.2463, "step": 2333 }, { "epoch": 1.6367461430575037, "grad_norm": 0.2805198132991791, "learning_rate": 3.8286689716854484e-05, "loss": 0.0812, "step": 2334 }, { "epoch": 1.6374474053295933, "grad_norm": 0.27078357338905334, "learning_rate": 3.827727830643188e-05, "loss": 0.2514, "step": 2335 }, { "epoch": 1.638148667601683, "grad_norm": 0.2596358358860016, "learning_rate": 3.82678642743879e-05, "loss": 0.0939, "step": 2336 }, { "epoch": 1.6388499298737726, "grad_norm": 2.108755350112915, "learning_rate": 3.8258447622581374e-05, "loss": 0.3383, "step": 2337 }, { "epoch": 1.6395511921458625, "grad_norm": 0.2935386896133423, "learning_rate": 3.824902835287161e-05, "loss": 0.2413, "step": 2338 }, { "epoch": 1.6402524544179524, "grad_norm": 0.2814261317253113, "learning_rate": 3.8239606467118486e-05, "loss": 0.093, "step": 2339 }, { "epoch": 1.640953716690042, "grad_norm": 0.26337987184524536, "learning_rate": 3.823018196718235e-05, "loss": 0.0956, "step": 2340 }, { "epoch": 1.6416549789621318, "grad_norm": 0.281667023897171, "learning_rate": 3.822075485492409e-05, "loss": 0.2442, "step": 2341 }, { "epoch": 1.6423562412342216, "grad_norm": 0.26460543274879456, "learning_rate": 3.821132513220511e-05, "loss": 0.096, "step": 2342 }, { "epoch": 1.6430575035063115, "grad_norm": 0.26323410868644714, "learning_rate": 3.820189280088732e-05, "loss": 0.096, "step": 2343 }, { "epoch": 1.6437587657784012, "grad_norm": 0.29081225395202637, "learning_rate": 3.819245786283317e-05, "loss": 0.096, "step": 2344 }, { "epoch": 1.6444600280504909, "grad_norm": 0.2925470173358917, "learning_rate": 3.818302031990556e-05, "loss": 0.0965, "step": 2345 }, { "epoch": 1.6451612903225805, "grad_norm": 0.26471275091171265, "learning_rate": 3.817358017396799e-05, "loss": 0.0964, "step": 2346 }, { "epoch": 1.6458625525946704, "grad_norm": 0.2847835123538971, "learning_rate": 3.816413742688443e-05, "loss": 0.0929, "step": 2347 }, { "epoch": 1.6465638148667603, "grad_norm": 0.253609836101532, "learning_rate": 3.815469208051934e-05, "loss": 0.249, "step": 2348 }, { "epoch": 1.64726507713885, "grad_norm": 0.2865881323814392, "learning_rate": 3.814524413673774e-05, "loss": 0.0941, "step": 2349 }, { "epoch": 1.6479663394109396, "grad_norm": 0.28489258885383606, "learning_rate": 3.813579359740514e-05, "loss": 0.0955, "step": 2350 }, { "epoch": 1.6486676016830295, "grad_norm": 1.3933829069137573, "learning_rate": 3.812634046438756e-05, "loss": 0.2651, "step": 2351 }, { "epoch": 1.6493688639551192, "grad_norm": 0.2600442171096802, "learning_rate": 3.8116884739551545e-05, "loss": 0.0952, "step": 2352 }, { "epoch": 1.650070126227209, "grad_norm": 0.25999823212623596, "learning_rate": 3.810742642476414e-05, "loss": 0.095, "step": 2353 }, { "epoch": 1.6507713884992987, "grad_norm": 0.3626475930213928, "learning_rate": 3.809796552189291e-05, "loss": 0.0952, "step": 2354 }, { "epoch": 1.6514726507713884, "grad_norm": 0.258493572473526, "learning_rate": 3.8088502032805917e-05, "loss": 0.0946, "step": 2355 }, { "epoch": 1.6521739130434783, "grad_norm": 0.26547497510910034, "learning_rate": 3.8079035959371744e-05, "loss": 0.2488, "step": 2356 }, { "epoch": 1.6528751753155682, "grad_norm": 0.27957192063331604, "learning_rate": 3.806956730345949e-05, "loss": 0.0946, "step": 2357 }, { "epoch": 1.6535764375876578, "grad_norm": 0.2922751307487488, "learning_rate": 3.806009606693876e-05, "loss": 0.2485, "step": 2358 }, { "epoch": 1.6542776998597475, "grad_norm": 0.27937477827072144, "learning_rate": 3.8050622251679657e-05, "loss": 0.094, "step": 2359 }, { "epoch": 1.6549789621318372, "grad_norm": 0.2786133289337158, "learning_rate": 3.8041145859552806e-05, "loss": 0.0947, "step": 2360 }, { "epoch": 1.655680224403927, "grad_norm": 0.7420443892478943, "learning_rate": 3.8031666892429343e-05, "loss": 0.4043, "step": 2361 }, { "epoch": 1.656381486676017, "grad_norm": 0.2759292423725128, "learning_rate": 3.802218535218089e-05, "loss": 0.0937, "step": 2362 }, { "epoch": 1.6570827489481066, "grad_norm": 0.27092841267585754, "learning_rate": 3.801270124067961e-05, "loss": 0.0923, "step": 2363 }, { "epoch": 1.6577840112201963, "grad_norm": 0.2529390752315521, "learning_rate": 3.800321455979815e-05, "loss": 0.0929, "step": 2364 }, { "epoch": 1.6584852734922861, "grad_norm": 0.272755891084671, "learning_rate": 3.799372531140968e-05, "loss": 0.2543, "step": 2365 }, { "epoch": 1.659186535764376, "grad_norm": 0.25223198533058167, "learning_rate": 3.798423349738784e-05, "loss": 0.0928, "step": 2366 }, { "epoch": 1.6598877980364657, "grad_norm": 0.25054189562797546, "learning_rate": 3.797473911960685e-05, "loss": 0.0921, "step": 2367 }, { "epoch": 1.6605890603085554, "grad_norm": 0.2502390146255493, "learning_rate": 3.796524217994135e-05, "loss": 0.0919, "step": 2368 }, { "epoch": 1.661290322580645, "grad_norm": 0.2878706753253937, "learning_rate": 3.795574268026655e-05, "loss": 0.2482, "step": 2369 }, { "epoch": 1.661991584852735, "grad_norm": 0.2732163071632385, "learning_rate": 3.7946240622458135e-05, "loss": 0.2523, "step": 2370 }, { "epoch": 1.6626928471248248, "grad_norm": 0.2490067481994629, "learning_rate": 3.793673600839231e-05, "loss": 0.0915, "step": 2371 }, { "epoch": 1.6633941093969145, "grad_norm": 0.24919527769088745, "learning_rate": 3.792722883994576e-05, "loss": 0.0917, "step": 2372 }, { "epoch": 1.6640953716690041, "grad_norm": 0.2668115794658661, "learning_rate": 3.791771911899571e-05, "loss": 0.0912, "step": 2373 }, { "epoch": 1.664796633941094, "grad_norm": 0.2604781687259674, "learning_rate": 3.790820684741986e-05, "loss": 0.0884, "step": 2374 }, { "epoch": 1.665497896213184, "grad_norm": 0.28585606813430786, "learning_rate": 3.789869202709643e-05, "loss": 0.2479, "step": 2375 }, { "epoch": 1.6661991584852736, "grad_norm": 0.24561753869056702, "learning_rate": 3.788917465990414e-05, "loss": 0.0901, "step": 2376 }, { "epoch": 1.6669004207573632, "grad_norm": 0.24665339291095734, "learning_rate": 3.787965474772219e-05, "loss": 0.0903, "step": 2377 }, { "epoch": 1.6676016830294529, "grad_norm": 0.25356394052505493, "learning_rate": 3.7870132292430335e-05, "loss": 0.0864, "step": 2378 }, { "epoch": 1.6683029453015428, "grad_norm": 0.25588178634643555, "learning_rate": 3.786060729590878e-05, "loss": 0.087, "step": 2379 }, { "epoch": 1.6690042075736327, "grad_norm": 0.2909731864929199, "learning_rate": 3.7851079760038256e-05, "loss": 0.2521, "step": 2380 }, { "epoch": 1.6697054698457223, "grad_norm": 0.3239637613296509, "learning_rate": 3.784154968669999e-05, "loss": 0.2468, "step": 2381 }, { "epoch": 1.670406732117812, "grad_norm": 0.28873640298843384, "learning_rate": 3.783201707777572e-05, "loss": 0.2499, "step": 2382 }, { "epoch": 1.6711079943899019, "grad_norm": 0.2424420267343521, "learning_rate": 3.782248193514766e-05, "loss": 0.0884, "step": 2383 }, { "epoch": 1.6718092566619915, "grad_norm": 0.2413787841796875, "learning_rate": 3.7812944260698546e-05, "loss": 0.0883, "step": 2384 }, { "epoch": 1.6725105189340814, "grad_norm": 0.26781734824180603, "learning_rate": 3.780340405631162e-05, "loss": 0.0809, "step": 2385 }, { "epoch": 1.673211781206171, "grad_norm": 0.296526700258255, "learning_rate": 3.7793861323870594e-05, "loss": 0.2524, "step": 2386 }, { "epoch": 1.6739130434782608, "grad_norm": 0.24467261135578156, "learning_rate": 3.7784316065259715e-05, "loss": 0.0829, "step": 2387 }, { "epoch": 1.6746143057503506, "grad_norm": 0.24152083694934845, "learning_rate": 3.7774768282363693e-05, "loss": 0.0822, "step": 2388 }, { "epoch": 1.6753155680224405, "grad_norm": 0.24142557382583618, "learning_rate": 3.776521797706777e-05, "loss": 0.0811, "step": 2389 }, { "epoch": 1.6760168302945302, "grad_norm": 0.24283429980278015, "learning_rate": 3.7755665151257654e-05, "loss": 0.0815, "step": 2390 }, { "epoch": 1.6767180925666199, "grad_norm": 0.23913495242595673, "learning_rate": 3.774610980681958e-05, "loss": 0.0862, "step": 2391 }, { "epoch": 1.6774193548387095, "grad_norm": 0.31462880969047546, "learning_rate": 3.773655194564026e-05, "loss": 0.2563, "step": 2392 }, { "epoch": 1.6781206171107994, "grad_norm": 0.23805411159992218, "learning_rate": 3.772699156960692e-05, "loss": 0.0797, "step": 2393 }, { "epoch": 1.6788218793828893, "grad_norm": 0.23698851466178894, "learning_rate": 3.771742868060726e-05, "loss": 0.0792, "step": 2394 }, { "epoch": 1.679523141654979, "grad_norm": 0.23700600862503052, "learning_rate": 3.770786328052949e-05, "loss": 0.0717, "step": 2395 }, { "epoch": 1.6802244039270686, "grad_norm": 2.8549983501434326, "learning_rate": 3.7698295371262324e-05, "loss": 0.4277, "step": 2396 }, { "epoch": 1.6809256661991585, "grad_norm": 0.2369629144668579, "learning_rate": 3.7688724954694964e-05, "loss": 0.0848, "step": 2397 }, { "epoch": 1.6816269284712484, "grad_norm": 0.8605071902275085, "learning_rate": 3.7679152032717094e-05, "loss": 0.4323, "step": 2398 }, { "epoch": 1.682328190743338, "grad_norm": 0.23460467159748077, "learning_rate": 3.76695766072189e-05, "loss": 0.0837, "step": 2399 }, { "epoch": 1.6830294530154277, "grad_norm": 0.23629875481128693, "learning_rate": 3.765999868009108e-05, "loss": 0.072, "step": 2400 }, { "epoch": 1.6837307152875174, "grad_norm": 0.23723338544368744, "learning_rate": 3.7650418253224806e-05, "loss": 0.0719, "step": 2401 }, { "epoch": 1.6844319775596073, "grad_norm": 0.2350204586982727, "learning_rate": 3.764083532851175e-05, "loss": 0.0842, "step": 2402 }, { "epoch": 1.6851332398316972, "grad_norm": 0.2354377657175064, "learning_rate": 3.763124990784407e-05, "loss": 0.0837, "step": 2403 }, { "epoch": 1.6858345021037868, "grad_norm": 3.136338949203491, "learning_rate": 3.7621661993114444e-05, "loss": 0.7711, "step": 2404 }, { "epoch": 1.6865357643758765, "grad_norm": 0.2332981824874878, "learning_rate": 3.7612071586216e-05, "loss": 0.0781, "step": 2405 }, { "epoch": 1.6872370266479664, "grad_norm": 0.3161124885082245, "learning_rate": 3.760247868904239e-05, "loss": 0.2608, "step": 2406 }, { "epoch": 1.6879382889200563, "grad_norm": 0.2405671328306198, "learning_rate": 3.759288330348775e-05, "loss": 0.0727, "step": 2407 }, { "epoch": 1.688639551192146, "grad_norm": 0.26296815276145935, "learning_rate": 3.7583285431446696e-05, "loss": 0.0691, "step": 2408 }, { "epoch": 1.6893408134642356, "grad_norm": 0.23487159609794617, "learning_rate": 3.7573685074814356e-05, "loss": 0.0843, "step": 2409 }, { "epoch": 1.6900420757363253, "grad_norm": 0.8418309688568115, "learning_rate": 3.756408223548633e-05, "loss": 0.4375, "step": 2410 }, { "epoch": 1.6907433380084151, "grad_norm": 0.23415590822696686, "learning_rate": 3.755447691535871e-05, "loss": 0.0839, "step": 2411 }, { "epoch": 1.691444600280505, "grad_norm": 0.3180890381336212, "learning_rate": 3.75448691163281e-05, "loss": 0.2607, "step": 2412 }, { "epoch": 1.6921458625525947, "grad_norm": 0.23453405499458313, "learning_rate": 3.753525884029157e-05, "loss": 0.0845, "step": 2413 }, { "epoch": 1.6928471248246844, "grad_norm": 0.2358517348766327, "learning_rate": 3.752564608914667e-05, "loss": 0.0781, "step": 2414 }, { "epoch": 1.6935483870967742, "grad_norm": 0.3165251910686493, "learning_rate": 3.7516030864791474e-05, "loss": 0.2588, "step": 2415 }, { "epoch": 1.694249649368864, "grad_norm": 0.23596401512622833, "learning_rate": 3.750641316912451e-05, "loss": 0.0846, "step": 2416 }, { "epoch": 1.6949509116409538, "grad_norm": 0.23395119607448578, "learning_rate": 3.7496793004044824e-05, "loss": 0.0842, "step": 2417 }, { "epoch": 1.6956521739130435, "grad_norm": 0.23862455785274506, "learning_rate": 3.748717037145192e-05, "loss": 0.0798, "step": 2418 }, { "epoch": 1.6963534361851331, "grad_norm": 0.2509998679161072, "learning_rate": 3.7477545273245807e-05, "loss": 0.0739, "step": 2419 }, { "epoch": 1.697054698457223, "grad_norm": 0.2357727289199829, "learning_rate": 3.746791771132698e-05, "loss": 0.084, "step": 2420 }, { "epoch": 1.697755960729313, "grad_norm": 0.32357001304626465, "learning_rate": 3.7458287687596415e-05, "loss": 0.2616, "step": 2421 }, { "epoch": 1.6984572230014026, "grad_norm": 0.23616521060466766, "learning_rate": 3.7448655203955576e-05, "loss": 0.0837, "step": 2422 }, { "epoch": 1.6991584852734922, "grad_norm": 0.32459557056427, "learning_rate": 3.743902026230641e-05, "loss": 0.2607, "step": 2423 }, { "epoch": 1.699859747545582, "grad_norm": 0.3443213105201721, "learning_rate": 3.742938286455137e-05, "loss": 0.2545, "step": 2424 }, { "epoch": 1.7005610098176718, "grad_norm": 0.2465660721063614, "learning_rate": 3.741974301259336e-05, "loss": 0.0735, "step": 2425 }, { "epoch": 1.7012622720897617, "grad_norm": 0.22731611132621765, "learning_rate": 3.741010070833579e-05, "loss": 0.0756, "step": 2426 }, { "epoch": 1.7019635343618513, "grad_norm": 0.8490071892738342, "learning_rate": 3.740045595368255e-05, "loss": 0.4365, "step": 2427 }, { "epoch": 1.702664796633941, "grad_norm": 0.2327738106250763, "learning_rate": 3.739080875053801e-05, "loss": 0.0827, "step": 2428 }, { "epoch": 1.7033660589060309, "grad_norm": 0.32564160227775574, "learning_rate": 3.7381159100807026e-05, "loss": 0.2618, "step": 2429 }, { "epoch": 1.7040673211781208, "grad_norm": 0.2440715879201889, "learning_rate": 3.7371507006394955e-05, "loss": 0.0724, "step": 2430 }, { "epoch": 1.7047685834502104, "grad_norm": 0.32882604002952576, "learning_rate": 3.73618524692076e-05, "loss": 0.2629, "step": 2431 }, { "epoch": 1.7054698457223, "grad_norm": 0.3377149701118469, "learning_rate": 3.7352195491151266e-05, "loss": 0.2565, "step": 2432 }, { "epoch": 1.7061711079943898, "grad_norm": 0.3211466372013092, "learning_rate": 3.734253607413275e-05, "loss": 0.2585, "step": 2433 }, { "epoch": 1.7068723702664796, "grad_norm": 2.751668930053711, "learning_rate": 3.7332874220059314e-05, "loss": 0.4178, "step": 2434 }, { "epoch": 1.7075736325385695, "grad_norm": 0.34669768810272217, "learning_rate": 3.732320993083871e-05, "loss": 0.2469, "step": 2435 }, { "epoch": 1.7082748948106592, "grad_norm": 0.23644858598709106, "learning_rate": 3.731354320837918e-05, "loss": 0.0787, "step": 2436 }, { "epoch": 1.7089761570827489, "grad_norm": 0.23603442311286926, "learning_rate": 3.730387405458941e-05, "loss": 0.0844, "step": 2437 }, { "epoch": 1.7096774193548387, "grad_norm": 0.24319280683994293, "learning_rate": 3.729420247137861e-05, "loss": 0.0814, "step": 2438 }, { "epoch": 1.7103786816269286, "grad_norm": 0.2517856955528259, "learning_rate": 3.7284528460656444e-05, "loss": 0.0748, "step": 2439 }, { "epoch": 1.7110799438990183, "grad_norm": 2.422008752822876, "learning_rate": 3.7274852024333054e-05, "loss": 0.5499, "step": 2440 }, { "epoch": 1.711781206171108, "grad_norm": 0.33350133895874023, "learning_rate": 3.7265173164319096e-05, "loss": 0.2477, "step": 2441 }, { "epoch": 1.7124824684431976, "grad_norm": 0.3005514442920685, "learning_rate": 3.7255491882525645e-05, "loss": 0.2571, "step": 2442 }, { "epoch": 1.7131837307152875, "grad_norm": 0.3235367238521576, "learning_rate": 3.7245808180864306e-05, "loss": 0.2548, "step": 2443 }, { "epoch": 1.7138849929873774, "grad_norm": 0.2430594563484192, "learning_rate": 3.7236122061247144e-05, "loss": 0.0819, "step": 2444 }, { "epoch": 1.714586255259467, "grad_norm": 0.29191550612449646, "learning_rate": 3.7226433525586685e-05, "loss": 0.2566, "step": 2445 }, { "epoch": 1.7152875175315567, "grad_norm": 0.2992602586746216, "learning_rate": 3.721674257579596e-05, "loss": 0.2534, "step": 2446 }, { "epoch": 1.7159887798036466, "grad_norm": 0.357247531414032, "learning_rate": 3.720704921378845e-05, "loss": 0.0721, "step": 2447 }, { "epoch": 1.7166900420757363, "grad_norm": 0.29099974036216736, "learning_rate": 3.719735344147815e-05, "loss": 0.2555, "step": 2448 }, { "epoch": 1.7173913043478262, "grad_norm": 0.2919072210788727, "learning_rate": 3.718765526077948e-05, "loss": 0.2555, "step": 2449 }, { "epoch": 1.7180925666199158, "grad_norm": 0.252594530582428, "learning_rate": 3.7177954673607386e-05, "loss": 0.0857, "step": 2450 }, { "epoch": 1.7187938288920055, "grad_norm": 2.0320324897766113, "learning_rate": 3.716825168187725e-05, "loss": 0.3372, "step": 2451 }, { "epoch": 1.7194950911640954, "grad_norm": 1.9129940271377563, "learning_rate": 3.715854628750495e-05, "loss": 0.3225, "step": 2452 }, { "epoch": 1.7201963534361853, "grad_norm": 0.27349144220352173, "learning_rate": 3.7148838492406825e-05, "loss": 0.083, "step": 2453 }, { "epoch": 1.720897615708275, "grad_norm": 0.7685236930847168, "learning_rate": 3.7139128298499704e-05, "loss": 0.4124, "step": 2454 }, { "epoch": 1.7215988779803646, "grad_norm": 0.24469006061553955, "learning_rate": 3.712941570770088e-05, "loss": 0.0898, "step": 2455 }, { "epoch": 1.7223001402524543, "grad_norm": 0.27469193935394287, "learning_rate": 3.711970072192812e-05, "loss": 0.2521, "step": 2456 }, { "epoch": 1.7230014025245441, "grad_norm": 0.28609195351600647, "learning_rate": 3.710998334309966e-05, "loss": 0.2529, "step": 2457 }, { "epoch": 1.723702664796634, "grad_norm": 0.2482578009366989, "learning_rate": 3.7100263573134225e-05, "loss": 0.0916, "step": 2458 }, { "epoch": 1.7244039270687237, "grad_norm": 0.7542937397956848, "learning_rate": 3.709054141395099e-05, "loss": 0.4127, "step": 2459 }, { "epoch": 1.7251051893408134, "grad_norm": 0.26824966073036194, "learning_rate": 3.708081686746962e-05, "loss": 0.2509, "step": 2460 }, { "epoch": 1.7258064516129032, "grad_norm": 0.27043405175209045, "learning_rate": 3.707108993561024e-05, "loss": 0.093, "step": 2461 }, { "epoch": 1.7265077138849931, "grad_norm": 0.26378414034843445, "learning_rate": 3.7061360620293436e-05, "loss": 0.2514, "step": 2462 }, { "epoch": 1.7272089761570828, "grad_norm": 0.2617020308971405, "learning_rate": 3.70516289234403e-05, "loss": 0.2507, "step": 2463 }, { "epoch": 1.7279102384291725, "grad_norm": 0.2756495177745819, "learning_rate": 3.704189484697236e-05, "loss": 0.0949, "step": 2464 }, { "epoch": 1.7286115007012621, "grad_norm": 0.26502692699432373, "learning_rate": 3.703215839281163e-05, "loss": 0.2506, "step": 2465 }, { "epoch": 1.729312762973352, "grad_norm": 0.2542143166065216, "learning_rate": 3.7022419562880585e-05, "loss": 0.0935, "step": 2466 }, { "epoch": 1.730014025245442, "grad_norm": 0.27401769161224365, "learning_rate": 3.701267835910218e-05, "loss": 0.0945, "step": 2467 }, { "epoch": 1.7307152875175316, "grad_norm": 0.3078174889087677, "learning_rate": 3.7002934783399825e-05, "loss": 0.0936, "step": 2468 }, { "epoch": 1.7314165497896212, "grad_norm": 0.2751760184764862, "learning_rate": 3.6993188837697415e-05, "loss": 0.0949, "step": 2469 }, { "epoch": 1.7321178120617111, "grad_norm": 0.2608521282672882, "learning_rate": 3.6983440523919285e-05, "loss": 0.2497, "step": 2470 }, { "epoch": 1.732819074333801, "grad_norm": 0.25364118814468384, "learning_rate": 3.697368984399028e-05, "loss": 0.0935, "step": 2471 }, { "epoch": 1.7335203366058907, "grad_norm": 0.2576127052307129, "learning_rate": 3.696393679983569e-05, "loss": 0.2497, "step": 2472 }, { "epoch": 1.7342215988779803, "grad_norm": 0.7459859251976013, "learning_rate": 3.695418139338124e-05, "loss": 0.4102, "step": 2473 }, { "epoch": 1.73492286115007, "grad_norm": 0.2704770565032959, "learning_rate": 3.694442362655317e-05, "loss": 0.0936, "step": 2474 }, { "epoch": 1.7356241234221599, "grad_norm": 0.26335614919662476, "learning_rate": 3.693466350127818e-05, "loss": 0.2518, "step": 2475 }, { "epoch": 1.7363253856942498, "grad_norm": 0.33727821707725525, "learning_rate": 3.69249010194834e-05, "loss": 0.0942, "step": 2476 }, { "epoch": 1.7370266479663394, "grad_norm": 1.2375340461730957, "learning_rate": 3.691513618309647e-05, "loss": 0.5645, "step": 2477 }, { "epoch": 1.737727910238429, "grad_norm": 0.3027742803096771, "learning_rate": 3.6905368994045456e-05, "loss": 0.09, "step": 2478 }, { "epoch": 1.738429172510519, "grad_norm": 2.002547025680542, "learning_rate": 3.689559945425892e-05, "loss": 0.3032, "step": 2479 }, { "epoch": 1.7391304347826086, "grad_norm": 0.27176961302757263, "learning_rate": 3.688582756566587e-05, "loss": 0.0939, "step": 2480 }, { "epoch": 1.7398316970546985, "grad_norm": 0.7476723790168762, "learning_rate": 3.687605333019577e-05, "loss": 0.407, "step": 2481 }, { "epoch": 1.7405329593267882, "grad_norm": 0.2683611810207367, "learning_rate": 3.686627674977858e-05, "loss": 0.0936, "step": 2482 }, { "epoch": 1.7412342215988779, "grad_norm": 0.27495163679122925, "learning_rate": 3.6856497826344696e-05, "loss": 0.0947, "step": 2483 }, { "epoch": 1.7419354838709677, "grad_norm": 0.2719372510910034, "learning_rate": 3.6846716561824965e-05, "loss": 0.0937, "step": 2484 }, { "epoch": 1.7426367461430576, "grad_norm": 0.27495214343070984, "learning_rate": 3.683693295815074e-05, "loss": 0.0951, "step": 2485 }, { "epoch": 1.7433380084151473, "grad_norm": 0.25716739892959595, "learning_rate": 3.6827147017253804e-05, "loss": 0.0947, "step": 2486 }, { "epoch": 1.744039270687237, "grad_norm": 0.2732238471508026, "learning_rate": 3.68173587410664e-05, "loss": 0.0945, "step": 2487 }, { "epoch": 1.7447405329593266, "grad_norm": 0.2577061355113983, "learning_rate": 3.6807568131521234e-05, "loss": 0.2491, "step": 2488 }, { "epoch": 1.7454417952314165, "grad_norm": 0.26700451970100403, "learning_rate": 3.67977751905515e-05, "loss": 0.251, "step": 2489 }, { "epoch": 1.7461430575035064, "grad_norm": 0.27066153287887573, "learning_rate": 3.6787979920090816e-05, "loss": 0.2506, "step": 2490 }, { "epoch": 1.746844319775596, "grad_norm": 0.7388224005699158, "learning_rate": 3.677818232207327e-05, "loss": 0.4024, "step": 2491 }, { "epoch": 1.7475455820476857, "grad_norm": 0.2671506106853485, "learning_rate": 3.676838239843343e-05, "loss": 0.2502, "step": 2492 }, { "epoch": 1.7482468443197756, "grad_norm": 0.2573460042476654, "learning_rate": 3.67585801511063e-05, "loss": 0.2493, "step": 2493 }, { "epoch": 1.7489481065918655, "grad_norm": 0.2566382884979248, "learning_rate": 3.6748775582027354e-05, "loss": 0.0943, "step": 2494 }, { "epoch": 1.7496493688639552, "grad_norm": 0.25588440895080566, "learning_rate": 3.6738968693132517e-05, "loss": 0.0939, "step": 2495 }, { "epoch": 1.7503506311360448, "grad_norm": 0.2809423804283142, "learning_rate": 3.6729159486358175e-05, "loss": 0.0952, "step": 2496 }, { "epoch": 1.7510518934081345, "grad_norm": 0.2617984116077423, "learning_rate": 3.6719347963641176e-05, "loss": 0.2491, "step": 2497 }, { "epoch": 1.7517531556802244, "grad_norm": 0.27159810066223145, "learning_rate": 3.670953412691882e-05, "loss": 0.0935, "step": 2498 }, { "epoch": 1.7524544179523143, "grad_norm": 0.31395500898361206, "learning_rate": 3.6699717978128863e-05, "loss": 0.0931, "step": 2499 }, { "epoch": 1.753155680224404, "grad_norm": 0.298294335603714, "learning_rate": 3.6689899519209526e-05, "loss": 0.0929, "step": 2500 }, { "epoch": 1.7538569424964936, "grad_norm": 0.266610324382782, "learning_rate": 3.668007875209948e-05, "loss": 0.2515, "step": 2501 }, { "epoch": 1.7545582047685835, "grad_norm": 0.7420229315757751, "learning_rate": 3.6670255678737845e-05, "loss": 0.407, "step": 2502 }, { "epoch": 1.7552594670406734, "grad_norm": 0.26865988969802856, "learning_rate": 3.666043030106422e-05, "loss": 0.2506, "step": 2503 }, { "epoch": 1.755960729312763, "grad_norm": 0.26420119404792786, "learning_rate": 3.665060262101862e-05, "loss": 0.2495, "step": 2504 }, { "epoch": 1.7566619915848527, "grad_norm": 0.754969596862793, "learning_rate": 3.664077264054156e-05, "loss": 0.4117, "step": 2505 }, { "epoch": 1.7573632538569424, "grad_norm": 1.1010891199111938, "learning_rate": 3.6630940361573975e-05, "loss": 0.2665, "step": 2506 }, { "epoch": 1.7580645161290323, "grad_norm": 0.33137574791908264, "learning_rate": 3.6621105786057256e-05, "loss": 0.0942, "step": 2507 }, { "epoch": 1.7587657784011221, "grad_norm": 0.7378925085067749, "learning_rate": 3.6611268915933276e-05, "loss": 0.4008, "step": 2508 }, { "epoch": 1.7594670406732118, "grad_norm": 0.25892359018325806, "learning_rate": 3.660142975314433e-05, "loss": 0.25, "step": 2509 }, { "epoch": 1.7601683029453015, "grad_norm": 0.28075993061065674, "learning_rate": 3.6591588299633186e-05, "loss": 0.0956, "step": 2510 }, { "epoch": 1.7608695652173914, "grad_norm": 0.2520832419395447, "learning_rate": 3.658174455734305e-05, "loss": 0.2478, "step": 2511 }, { "epoch": 1.761570827489481, "grad_norm": 0.2844184339046478, "learning_rate": 3.6571898528217575e-05, "loss": 0.0965, "step": 2512 }, { "epoch": 1.762272089761571, "grad_norm": 0.2860913574695587, "learning_rate": 3.6562050214200894e-05, "loss": 0.0972, "step": 2513 }, { "epoch": 1.7629733520336606, "grad_norm": 0.25583869218826294, "learning_rate": 3.6552199617237574e-05, "loss": 0.2486, "step": 2514 }, { "epoch": 1.7636746143057502, "grad_norm": 0.2892024517059326, "learning_rate": 3.654234673927261e-05, "loss": 0.0976, "step": 2515 }, { "epoch": 1.7643758765778401, "grad_norm": 0.286350816488266, "learning_rate": 3.6532491582251497e-05, "loss": 0.0967, "step": 2516 }, { "epoch": 1.76507713884993, "grad_norm": 0.26936355233192444, "learning_rate": 3.6522634148120145e-05, "loss": 0.0976, "step": 2517 }, { "epoch": 1.7657784011220197, "grad_norm": 0.288884699344635, "learning_rate": 3.651277443882491e-05, "loss": 0.097, "step": 2518 }, { "epoch": 1.7664796633941093, "grad_norm": 0.2502918243408203, "learning_rate": 3.650291245631262e-05, "loss": 0.2476, "step": 2519 }, { "epoch": 1.767180925666199, "grad_norm": 0.26614171266555786, "learning_rate": 3.649304820253052e-05, "loss": 0.0968, "step": 2520 }, { "epoch": 1.7678821879382889, "grad_norm": 0.25515520572662354, "learning_rate": 3.648318167942636e-05, "loss": 0.2442, "step": 2521 }, { "epoch": 1.7685834502103788, "grad_norm": 0.2652827203273773, "learning_rate": 3.6473312888948266e-05, "loss": 0.0967, "step": 2522 }, { "epoch": 1.7692847124824684, "grad_norm": 0.2852623164653778, "learning_rate": 3.6463441833044875e-05, "loss": 0.0965, "step": 2523 }, { "epoch": 1.769985974754558, "grad_norm": 0.7177554368972778, "learning_rate": 3.6453568513665227e-05, "loss": 0.3996, "step": 2524 }, { "epoch": 1.770687237026648, "grad_norm": 0.735159695148468, "learning_rate": 3.644369293275883e-05, "loss": 0.3969, "step": 2525 }, { "epoch": 1.7713884992987379, "grad_norm": 0.2581464946269989, "learning_rate": 3.643381509227564e-05, "loss": 0.2486, "step": 2526 }, { "epoch": 1.7720897615708275, "grad_norm": 1.1857367753982544, "learning_rate": 3.642393499416604e-05, "loss": 0.2708, "step": 2527 }, { "epoch": 1.7727910238429172, "grad_norm": 0.26511383056640625, "learning_rate": 3.641405264038089e-05, "loss": 0.0964, "step": 2528 }, { "epoch": 1.7734922861150069, "grad_norm": 0.2572481632232666, "learning_rate": 3.6404168032871464e-05, "loss": 0.2436, "step": 2529 }, { "epoch": 1.7741935483870968, "grad_norm": 0.2623668611049652, "learning_rate": 3.6394281173589495e-05, "loss": 0.2484, "step": 2530 }, { "epoch": 1.7748948106591866, "grad_norm": 0.253880113363266, "learning_rate": 3.6384392064487175e-05, "loss": 0.2462, "step": 2531 }, { "epoch": 1.7755960729312763, "grad_norm": 0.28718292713165283, "learning_rate": 3.637450070751711e-05, "loss": 0.0971, "step": 2532 }, { "epoch": 1.776297335203366, "grad_norm": 0.2902108430862427, "learning_rate": 3.636460710463237e-05, "loss": 0.0969, "step": 2533 }, { "epoch": 1.7769985974754559, "grad_norm": 0.7104911804199219, "learning_rate": 3.6354711257786464e-05, "loss": 0.3946, "step": 2534 }, { "epoch": 1.7776998597475457, "grad_norm": 0.2480180710554123, "learning_rate": 3.6344813168933356e-05, "loss": 0.2461, "step": 2535 }, { "epoch": 1.7784011220196354, "grad_norm": 0.3034035861492157, "learning_rate": 3.633491284002741e-05, "loss": 0.0995, "step": 2536 }, { "epoch": 1.779102384291725, "grad_norm": 0.25234273076057434, "learning_rate": 3.6325010273023496e-05, "loss": 0.2463, "step": 2537 }, { "epoch": 1.7798036465638147, "grad_norm": 0.3440544605255127, "learning_rate": 3.631510546987688e-05, "loss": 0.0988, "step": 2538 }, { "epoch": 1.7805049088359046, "grad_norm": 0.33672165870666504, "learning_rate": 3.630519843254328e-05, "loss": 0.0996, "step": 2539 }, { "epoch": 1.7812061711079945, "grad_norm": 0.30412840843200684, "learning_rate": 3.629528916297885e-05, "loss": 0.097, "step": 2540 }, { "epoch": 1.7819074333800842, "grad_norm": 0.3726062774658203, "learning_rate": 3.628537766314021e-05, "loss": 0.0982, "step": 2541 }, { "epoch": 1.7826086956521738, "grad_norm": 0.27324530482292175, "learning_rate": 3.627546393498439e-05, "loss": 0.098, "step": 2542 }, { "epoch": 1.7833099579242637, "grad_norm": 0.2760748565196991, "learning_rate": 3.6265547980468875e-05, "loss": 0.099, "step": 2543 }, { "epoch": 1.7840112201963534, "grad_norm": 0.27286580204963684, "learning_rate": 3.625562980155159e-05, "loss": 0.0977, "step": 2544 }, { "epoch": 1.7847124824684433, "grad_norm": 0.3176290988922119, "learning_rate": 3.62457094001909e-05, "loss": 0.0954, "step": 2545 }, { "epoch": 1.785413744740533, "grad_norm": 0.3421517610549927, "learning_rate": 3.62357867783456e-05, "loss": 0.0895, "step": 2546 }, { "epoch": 1.7861150070126226, "grad_norm": 0.2679896056652069, "learning_rate": 3.622586193797492e-05, "loss": 0.0959, "step": 2547 }, { "epoch": 1.7868162692847125, "grad_norm": 0.2790212333202362, "learning_rate": 3.621593488103855e-05, "loss": 0.0929, "step": 2548 }, { "epoch": 1.7875175315568024, "grad_norm": 0.2601375877857208, "learning_rate": 3.620600560949661e-05, "loss": 0.0938, "step": 2549 }, { "epoch": 1.788218793828892, "grad_norm": 0.27369073033332825, "learning_rate": 3.6196074125309634e-05, "loss": 0.0924, "step": 2550 }, { "epoch": 1.7889200561009817, "grad_norm": 0.2607261538505554, "learning_rate": 3.618614043043861e-05, "loss": 0.0938, "step": 2551 }, { "epoch": 1.7896213183730714, "grad_norm": 0.2917751669883728, "learning_rate": 3.617620452684498e-05, "loss": 0.2453, "step": 2552 }, { "epoch": 1.7903225806451613, "grad_norm": 0.2879749536514282, "learning_rate": 3.61662664164906e-05, "loss": 0.2478, "step": 2553 }, { "epoch": 1.7910238429172511, "grad_norm": 0.2641933560371399, "learning_rate": 3.615632610133776e-05, "loss": 0.0879, "step": 2554 }, { "epoch": 1.7917251051893408, "grad_norm": 0.25750765204429626, "learning_rate": 3.6146383583349196e-05, "loss": 0.0918, "step": 2555 }, { "epoch": 1.7924263674614305, "grad_norm": 0.2536839246749878, "learning_rate": 3.613643886448807e-05, "loss": 0.0906, "step": 2556 }, { "epoch": 1.7931276297335204, "grad_norm": 0.2908202111721039, "learning_rate": 3.6126491946718e-05, "loss": 0.0755, "step": 2557 }, { "epoch": 1.7938288920056102, "grad_norm": 0.30481237173080444, "learning_rate": 3.6116542832003005e-05, "loss": 0.2486, "step": 2558 }, { "epoch": 1.7945301542777, "grad_norm": 0.29838797450065613, "learning_rate": 3.6106591522307566e-05, "loss": 0.255, "step": 2559 }, { "epoch": 1.7952314165497896, "grad_norm": 0.2578081488609314, "learning_rate": 3.6096638019596575e-05, "loss": 0.0839, "step": 2560 }, { "epoch": 1.7959326788218792, "grad_norm": 0.2597649395465851, "learning_rate": 3.608668232583537e-05, "loss": 0.077, "step": 2561 }, { "epoch": 1.7966339410939691, "grad_norm": 0.86063152551651, "learning_rate": 3.607672444298973e-05, "loss": 0.4177, "step": 2562 }, { "epoch": 1.797335203366059, "grad_norm": 0.33349546790122986, "learning_rate": 3.606676437302585e-05, "loss": 0.2478, "step": 2563 }, { "epoch": 1.7980364656381487, "grad_norm": 0.2561742663383484, "learning_rate": 3.605680211791036e-05, "loss": 0.0761, "step": 2564 }, { "epoch": 1.7987377279102383, "grad_norm": 0.25079506635665894, "learning_rate": 3.6046837679610336e-05, "loss": 0.0875, "step": 2565 }, { "epoch": 1.7994389901823282, "grad_norm": 0.2486097663640976, "learning_rate": 3.6036871060093255e-05, "loss": 0.0868, "step": 2566 }, { "epoch": 1.800140252454418, "grad_norm": 0.2501959502696991, "learning_rate": 3.602690226132706e-05, "loss": 0.0871, "step": 2567 }, { "epoch": 1.8008415147265078, "grad_norm": 0.3316382169723511, "learning_rate": 3.6016931285280096e-05, "loss": 0.261, "step": 2568 }, { "epoch": 1.8015427769985974, "grad_norm": 0.24818192422389984, "learning_rate": 3.6006958133921155e-05, "loss": 0.0865, "step": 2569 }, { "epoch": 1.802244039270687, "grad_norm": 0.3297709822654724, "learning_rate": 3.599698280921945e-05, "loss": 0.2571, "step": 2570 }, { "epoch": 1.802945301542777, "grad_norm": 0.3195790648460388, "learning_rate": 3.598700531314463e-05, "loss": 0.2582, "step": 2571 }, { "epoch": 1.8036465638148669, "grad_norm": 0.23963873088359833, "learning_rate": 3.597702564766676e-05, "loss": 0.078, "step": 2572 }, { "epoch": 1.8043478260869565, "grad_norm": 0.25094297528266907, "learning_rate": 3.596704381475636e-05, "loss": 0.0804, "step": 2573 }, { "epoch": 1.8050490883590462, "grad_norm": 0.24491003155708313, "learning_rate": 3.595705981638434e-05, "loss": 0.0798, "step": 2574 }, { "epoch": 1.805750350631136, "grad_norm": 0.24954557418823242, "learning_rate": 3.594707365452206e-05, "loss": 0.0854, "step": 2575 }, { "epoch": 1.8064516129032258, "grad_norm": 2.9606411457061768, "learning_rate": 3.593708533114132e-05, "loss": 0.5972, "step": 2576 }, { "epoch": 1.8071528751753156, "grad_norm": 0.24500301480293274, "learning_rate": 3.592709484821432e-05, "loss": 0.0783, "step": 2577 }, { "epoch": 1.8078541374474053, "grad_norm": 0.2503689229488373, "learning_rate": 3.5917102207713696e-05, "loss": 0.0856, "step": 2578 }, { "epoch": 1.808555399719495, "grad_norm": 0.36720389127731323, "learning_rate": 3.590710741161252e-05, "loss": 0.2556, "step": 2579 }, { "epoch": 1.8092566619915849, "grad_norm": 0.2479264885187149, "learning_rate": 3.589711046188428e-05, "loss": 0.0724, "step": 2580 }, { "epoch": 1.8099579242636747, "grad_norm": 0.33964404463768005, "learning_rate": 3.588711136050289e-05, "loss": 0.2534, "step": 2581 }, { "epoch": 1.8106591865357644, "grad_norm": 0.3329477906227112, "learning_rate": 3.587711010944268e-05, "loss": 0.2591, "step": 2582 }, { "epoch": 1.811360448807854, "grad_norm": 0.24495403468608856, "learning_rate": 3.5867106710678434e-05, "loss": 0.0717, "step": 2583 }, { "epoch": 1.8120617110799437, "grad_norm": 0.33054840564727783, "learning_rate": 3.585710116618531e-05, "loss": 0.2602, "step": 2584 }, { "epoch": 1.8127629733520336, "grad_norm": 0.3538224697113037, "learning_rate": 3.5847093477938956e-05, "loss": 0.2627, "step": 2585 }, { "epoch": 1.8134642356241235, "grad_norm": 0.24861197173595428, "learning_rate": 3.583708364791538e-05, "loss": 0.0849, "step": 2586 }, { "epoch": 1.8141654978962132, "grad_norm": 0.25006285309791565, "learning_rate": 3.582707167809106e-05, "loss": 0.0858, "step": 2587 }, { "epoch": 1.8148667601683028, "grad_norm": 0.2480914145708084, "learning_rate": 3.581705757044286e-05, "loss": 0.0855, "step": 2588 }, { "epoch": 1.8155680224403927, "grad_norm": 0.34012505412101746, "learning_rate": 3.580704132694809e-05, "loss": 0.2529, "step": 2589 }, { "epoch": 1.8162692847124826, "grad_norm": 0.24760489165782928, "learning_rate": 3.579702294958448e-05, "loss": 0.0849, "step": 2590 }, { "epoch": 1.8169705469845723, "grad_norm": 0.24700473248958588, "learning_rate": 3.578700244033016e-05, "loss": 0.0785, "step": 2591 }, { "epoch": 1.817671809256662, "grad_norm": 0.2474672645330429, "learning_rate": 3.577697980116371e-05, "loss": 0.0854, "step": 2592 }, { "epoch": 1.8183730715287516, "grad_norm": 0.3379094898700714, "learning_rate": 3.576695503406411e-05, "loss": 0.2607, "step": 2593 }, { "epoch": 1.8190743338008415, "grad_norm": 0.24955923855304718, "learning_rate": 3.575692814101078e-05, "loss": 0.0727, "step": 2594 }, { "epoch": 1.8197755960729314, "grad_norm": 0.3470456600189209, "learning_rate": 3.574689912398353e-05, "loss": 0.2503, "step": 2595 }, { "epoch": 1.820476858345021, "grad_norm": 0.36206117272377014, "learning_rate": 3.5736867984962616e-05, "loss": 0.2545, "step": 2596 }, { "epoch": 1.8211781206171107, "grad_norm": 0.2470400631427765, "learning_rate": 3.5726834725928714e-05, "loss": 0.0848, "step": 2597 }, { "epoch": 1.8218793828892006, "grad_norm": 0.24721656739711761, "learning_rate": 3.5716799348862887e-05, "loss": 0.0776, "step": 2598 }, { "epoch": 1.8225806451612905, "grad_norm": 0.8876863718032837, "learning_rate": 3.570676185574665e-05, "loss": 0.4268, "step": 2599 }, { "epoch": 1.8232819074333801, "grad_norm": 0.2484787106513977, "learning_rate": 3.569672224856191e-05, "loss": 0.0843, "step": 2600 }, { "epoch": 1.8239831697054698, "grad_norm": 0.8601622581481934, "learning_rate": 3.5686680529291024e-05, "loss": 0.4323, "step": 2601 }, { "epoch": 1.8246844319775595, "grad_norm": 0.24929621815681458, "learning_rate": 3.567663669991674e-05, "loss": 0.0852, "step": 2602 }, { "epoch": 1.8253856942496494, "grad_norm": 0.24435053765773773, "learning_rate": 3.5666590762422224e-05, "loss": 0.0787, "step": 2603 }, { "epoch": 1.8260869565217392, "grad_norm": 0.2494080364704132, "learning_rate": 3.565654271879106e-05, "loss": 0.0858, "step": 2604 }, { "epoch": 1.826788218793829, "grad_norm": 0.3322739601135254, "learning_rate": 3.5646492571007275e-05, "loss": 0.2497, "step": 2605 }, { "epoch": 1.8274894810659186, "grad_norm": 0.24821241199970245, "learning_rate": 3.5636440321055254e-05, "loss": 0.0859, "step": 2606 }, { "epoch": 1.8281907433380085, "grad_norm": 0.25201326608657837, "learning_rate": 3.5626385970919865e-05, "loss": 0.0857, "step": 2607 }, { "epoch": 1.8288920056100981, "grad_norm": 0.23916952311992645, "learning_rate": 3.561632952258632e-05, "loss": 0.0775, "step": 2608 }, { "epoch": 1.829593267882188, "grad_norm": 0.35111141204833984, "learning_rate": 3.560627097804032e-05, "loss": 0.2481, "step": 2609 }, { "epoch": 1.8302945301542777, "grad_norm": 0.24143435060977936, "learning_rate": 3.559621033926791e-05, "loss": 0.0722, "step": 2610 }, { "epoch": 1.8309957924263673, "grad_norm": 0.2511986792087555, "learning_rate": 3.55861476082556e-05, "loss": 0.0847, "step": 2611 }, { "epoch": 1.8316970546984572, "grad_norm": 0.9116089940071106, "learning_rate": 3.557608278699029e-05, "loss": 0.4186, "step": 2612 }, { "epoch": 1.8323983169705471, "grad_norm": 0.24661694467067719, "learning_rate": 3.556601587745929e-05, "loss": 0.0785, "step": 2613 }, { "epoch": 1.8330995792426368, "grad_norm": 0.2424381673336029, "learning_rate": 3.555594688165033e-05, "loss": 0.0773, "step": 2614 }, { "epoch": 1.8338008415147264, "grad_norm": 0.25058606266975403, "learning_rate": 3.554587580155156e-05, "loss": 0.0857, "step": 2615 }, { "epoch": 1.834502103786816, "grad_norm": 0.2378440946340561, "learning_rate": 3.553580263915152e-05, "loss": 0.0763, "step": 2616 }, { "epoch": 1.835203366058906, "grad_norm": 0.24049502611160278, "learning_rate": 3.552572739643918e-05, "loss": 0.0769, "step": 2617 }, { "epoch": 1.8359046283309959, "grad_norm": 0.3547701835632324, "learning_rate": 3.551565007540391e-05, "loss": 0.2504, "step": 2618 }, { "epoch": 1.8366058906030855, "grad_norm": 0.2304314225912094, "learning_rate": 3.5505570678035494e-05, "loss": 0.06, "step": 2619 }, { "epoch": 1.8373071528751752, "grad_norm": 0.898367166519165, "learning_rate": 3.549548920632413e-05, "loss": 0.4363, "step": 2620 }, { "epoch": 1.838008415147265, "grad_norm": 0.2486027330160141, "learning_rate": 3.548540566226043e-05, "loss": 0.0773, "step": 2621 }, { "epoch": 1.838709677419355, "grad_norm": 0.24962961673736572, "learning_rate": 3.547532004783538e-05, "loss": 0.0835, "step": 2622 }, { "epoch": 1.8394109396914446, "grad_norm": 0.24876870214939117, "learning_rate": 3.546523236504044e-05, "loss": 0.0839, "step": 2623 }, { "epoch": 1.8401122019635343, "grad_norm": 0.38189825415611267, "learning_rate": 3.54551426158674e-05, "loss": 0.2325, "step": 2624 }, { "epoch": 1.840813464235624, "grad_norm": 0.2529781758785248, "learning_rate": 3.5445050802308524e-05, "loss": 0.0829, "step": 2625 }, { "epoch": 1.8415147265077139, "grad_norm": 5.070644378662109, "learning_rate": 3.543495692635645e-05, "loss": 0.8094, "step": 2626 }, { "epoch": 1.8422159887798037, "grad_norm": 0.3458581268787384, "learning_rate": 3.5424860990004246e-05, "loss": 0.2608, "step": 2627 }, { "epoch": 1.8429172510518934, "grad_norm": 0.37703490257263184, "learning_rate": 3.541476299524535e-05, "loss": 0.2448, "step": 2628 }, { "epoch": 1.843618513323983, "grad_norm": 0.22986379265785217, "learning_rate": 3.5404662944073635e-05, "loss": 0.061, "step": 2629 }, { "epoch": 1.844319775596073, "grad_norm": 0.245625838637352, "learning_rate": 3.5394560838483375e-05, "loss": 0.0771, "step": 2630 }, { "epoch": 1.8450210378681628, "grad_norm": 0.2522338628768921, "learning_rate": 3.538445668046925e-05, "loss": 0.0851, "step": 2631 }, { "epoch": 1.8457223001402525, "grad_norm": 0.8921801447868347, "learning_rate": 3.537435047202635e-05, "loss": 0.4366, "step": 2632 }, { "epoch": 1.8464235624123422, "grad_norm": 0.36080631613731384, "learning_rate": 3.536424221515015e-05, "loss": 0.2417, "step": 2633 }, { "epoch": 1.8471248246844318, "grad_norm": 0.24047181010246277, "learning_rate": 3.5354131911836544e-05, "loss": 0.0774, "step": 2634 }, { "epoch": 1.8478260869565217, "grad_norm": 0.3300210237503052, "learning_rate": 3.534401956408184e-05, "loss": 0.2586, "step": 2635 }, { "epoch": 1.8485273492286116, "grad_norm": 0.25114351511001587, "learning_rate": 3.533390517388272e-05, "loss": 0.0732, "step": 2636 }, { "epoch": 1.8492286115007013, "grad_norm": 0.38832151889801025, "learning_rate": 3.5323788743236306e-05, "loss": 0.2398, "step": 2637 }, { "epoch": 1.849929873772791, "grad_norm": 0.2448602318763733, "learning_rate": 3.53136702741401e-05, "loss": 0.0794, "step": 2638 }, { "epoch": 1.8506311360448808, "grad_norm": 0.3393131494522095, "learning_rate": 3.5303549768592e-05, "loss": 0.2343, "step": 2639 }, { "epoch": 1.8513323983169705, "grad_norm": 0.24369467794895172, "learning_rate": 3.529342722859034e-05, "loss": 0.0791, "step": 2640 }, { "epoch": 1.8520336605890604, "grad_norm": 0.2521948218345642, "learning_rate": 3.5283302656133805e-05, "loss": 0.0877, "step": 2641 }, { "epoch": 1.85273492286115, "grad_norm": 0.8556388020515442, "learning_rate": 3.527317605322153e-05, "loss": 0.4292, "step": 2642 }, { "epoch": 1.8534361851332397, "grad_norm": 0.31728100776672363, "learning_rate": 3.5263047421853016e-05, "loss": 0.2587, "step": 2643 }, { "epoch": 1.8541374474053296, "grad_norm": 0.8292129635810852, "learning_rate": 3.525291676402819e-05, "loss": 0.4249, "step": 2644 }, { "epoch": 1.8548387096774195, "grad_norm": 0.24397996068000793, "learning_rate": 3.524278408174736e-05, "loss": 0.0799, "step": 2645 }, { "epoch": 1.8555399719495091, "grad_norm": 0.24556703865528107, "learning_rate": 3.5232649377011254e-05, "loss": 0.0805, "step": 2646 }, { "epoch": 1.8562412342215988, "grad_norm": 0.25349724292755127, "learning_rate": 3.522251265182097e-05, "loss": 0.0884, "step": 2647 }, { "epoch": 1.8569424964936885, "grad_norm": 0.25562751293182373, "learning_rate": 3.521237390817803e-05, "loss": 0.0891, "step": 2648 }, { "epoch": 1.8576437587657784, "grad_norm": 0.24975010752677917, "learning_rate": 3.520223314808434e-05, "loss": 0.0819, "step": 2649 }, { "epoch": 1.8583450210378682, "grad_norm": 0.24719400703907013, "learning_rate": 3.519209037354222e-05, "loss": 0.0815, "step": 2650 }, { "epoch": 1.859046283309958, "grad_norm": 0.3152770698070526, "learning_rate": 3.5181945586554384e-05, "loss": 0.2583, "step": 2651 }, { "epoch": 1.8597475455820476, "grad_norm": 0.2560403347015381, "learning_rate": 3.5171798789123916e-05, "loss": 0.0891, "step": 2652 }, { "epoch": 1.8604488078541375, "grad_norm": 0.2540737986564636, "learning_rate": 3.516164998325434e-05, "loss": 0.0888, "step": 2653 }, { "epoch": 1.8611500701262274, "grad_norm": 0.8282027244567871, "learning_rate": 3.515149917094953e-05, "loss": 0.4214, "step": 2654 }, { "epoch": 1.861851332398317, "grad_norm": 0.25447383522987366, "learning_rate": 3.514134635421381e-05, "loss": 0.0895, "step": 2655 }, { "epoch": 1.8625525946704067, "grad_norm": 0.8234844207763672, "learning_rate": 3.5131191535051857e-05, "loss": 0.4102, "step": 2656 }, { "epoch": 1.8632538569424963, "grad_norm": 0.31151658296585083, "learning_rate": 3.512103471546876e-05, "loss": 0.2468, "step": 2657 }, { "epoch": 1.8639551192145862, "grad_norm": 0.24141313135623932, "learning_rate": 3.511087589746999e-05, "loss": 0.0732, "step": 2658 }, { "epoch": 1.8646563814866761, "grad_norm": 0.2498801052570343, "learning_rate": 3.5100715083061435e-05, "loss": 0.0827, "step": 2659 }, { "epoch": 1.8653576437587658, "grad_norm": 0.25772717595100403, "learning_rate": 3.5090552274249374e-05, "loss": 0.0905, "step": 2660 }, { "epoch": 1.8660589060308554, "grad_norm": 0.23793666064739227, "learning_rate": 3.5080387473040445e-05, "loss": 0.0737, "step": 2661 }, { "epoch": 1.8667601683029453, "grad_norm": 0.29985082149505615, "learning_rate": 3.507022068144172e-05, "loss": 0.2546, "step": 2662 }, { "epoch": 1.8674614305750352, "grad_norm": 0.24168255925178528, "learning_rate": 3.506005190146066e-05, "loss": 0.0812, "step": 2663 }, { "epoch": 1.8681626928471249, "grad_norm": 0.2399907112121582, "learning_rate": 3.504988113510509e-05, "loss": 0.0734, "step": 2664 }, { "epoch": 1.8688639551192145, "grad_norm": 0.24113157391548157, "learning_rate": 3.503970838438325e-05, "loss": 0.0807, "step": 2665 }, { "epoch": 1.8695652173913042, "grad_norm": 0.24595430493354797, "learning_rate": 3.5029533651303776e-05, "loss": 0.0804, "step": 2666 }, { "epoch": 1.870266479663394, "grad_norm": 0.2980153262615204, "learning_rate": 3.501935693787568e-05, "loss": 0.2531, "step": 2667 }, { "epoch": 1.870967741935484, "grad_norm": 0.31275293231010437, "learning_rate": 3.500917824610837e-05, "loss": 0.2561, "step": 2668 }, { "epoch": 1.8716690042075736, "grad_norm": 0.33599400520324707, "learning_rate": 3.499899757801164e-05, "loss": 0.2394, "step": 2669 }, { "epoch": 1.8723702664796633, "grad_norm": 0.25532102584838867, "learning_rate": 3.49888149355957e-05, "loss": 0.0894, "step": 2670 }, { "epoch": 1.8730715287517532, "grad_norm": 0.3267499804496765, "learning_rate": 3.497863032087111e-05, "loss": 0.2489, "step": 2671 }, { "epoch": 1.8737727910238429, "grad_norm": 0.2560516893863678, "learning_rate": 3.496844373584885e-05, "loss": 0.089, "step": 2672 }, { "epoch": 1.8744740532959328, "grad_norm": 0.2440955489873886, "learning_rate": 3.495825518254027e-05, "loss": 0.0799, "step": 2673 }, { "epoch": 1.8751753155680224, "grad_norm": 0.3197394907474518, "learning_rate": 3.4948064662957136e-05, "loss": 0.2464, "step": 2674 }, { "epoch": 1.875876577840112, "grad_norm": 0.24246841669082642, "learning_rate": 3.493787217911156e-05, "loss": 0.0798, "step": 2675 }, { "epoch": 1.876577840112202, "grad_norm": 0.2412116974592209, "learning_rate": 3.492767773301609e-05, "loss": 0.0799, "step": 2676 }, { "epoch": 1.8772791023842919, "grad_norm": 0.8496125936508179, "learning_rate": 3.491748132668362e-05, "loss": 0.4133, "step": 2677 }, { "epoch": 1.8779803646563815, "grad_norm": 0.3498612642288208, "learning_rate": 3.4907282962127446e-05, "loss": 0.2351, "step": 2678 }, { "epoch": 1.8786816269284712, "grad_norm": 0.31541717052459717, "learning_rate": 3.489708264136126e-05, "loss": 0.2422, "step": 2679 }, { "epoch": 1.8793828892005608, "grad_norm": 0.3254282474517822, "learning_rate": 3.4886880366399136e-05, "loss": 0.2464, "step": 2680 }, { "epoch": 1.8800841514726507, "grad_norm": 0.25707948207855225, "learning_rate": 3.4876676139255515e-05, "loss": 0.0899, "step": 2681 }, { "epoch": 1.8807854137447406, "grad_norm": 0.24601979553699493, "learning_rate": 3.4866469961945256e-05, "loss": 0.0806, "step": 2682 }, { "epoch": 1.8814866760168303, "grad_norm": 0.8129667043685913, "learning_rate": 3.4856261836483586e-05, "loss": 0.4168, "step": 2683 }, { "epoch": 1.88218793828892, "grad_norm": 0.2616741955280304, "learning_rate": 3.4846051764886104e-05, "loss": 0.0898, "step": 2684 }, { "epoch": 1.8828892005610098, "grad_norm": 0.2906718850135803, "learning_rate": 3.4835839749168817e-05, "loss": 0.2523, "step": 2685 }, { "epoch": 1.8835904628330997, "grad_norm": 0.24595122039318085, "learning_rate": 3.4825625791348096e-05, "loss": 0.0817, "step": 2686 }, { "epoch": 1.8842917251051894, "grad_norm": 0.28494709730148315, "learning_rate": 3.4815409893440714e-05, "loss": 0.2496, "step": 2687 }, { "epoch": 1.884992987377279, "grad_norm": 0.24298636615276337, "learning_rate": 3.480519205746381e-05, "loss": 0.0807, "step": 2688 }, { "epoch": 1.8856942496493687, "grad_norm": 5.377334117889404, "learning_rate": 3.4794972285434916e-05, "loss": 0.672, "step": 2689 }, { "epoch": 1.8863955119214586, "grad_norm": 0.22890619933605194, "learning_rate": 3.478475057937194e-05, "loss": 0.0721, "step": 2690 }, { "epoch": 1.8870967741935485, "grad_norm": 0.24881653487682343, "learning_rate": 3.4774526941293194e-05, "loss": 0.0827, "step": 2691 }, { "epoch": 1.8877980364656382, "grad_norm": 0.3200373351573944, "learning_rate": 3.476430137321733e-05, "loss": 0.2327, "step": 2692 }, { "epoch": 1.8884992987377278, "grad_norm": 5.309676647186279, "learning_rate": 3.4754073877163406e-05, "loss": 0.6718, "step": 2693 }, { "epoch": 1.8892005610098177, "grad_norm": 0.24857763946056366, "learning_rate": 3.4743844455150875e-05, "loss": 0.0837, "step": 2694 }, { "epoch": 1.8899018232819076, "grad_norm": 0.2639242708683014, "learning_rate": 3.473361310919954e-05, "loss": 0.0936, "step": 2695 }, { "epoch": 1.8906030855539973, "grad_norm": 0.28946802020072937, "learning_rate": 3.47233798413296e-05, "loss": 0.2423, "step": 2696 }, { "epoch": 1.891304347826087, "grad_norm": 0.2520594000816345, "learning_rate": 3.4713144653561635e-05, "loss": 0.0847, "step": 2697 }, { "epoch": 1.8920056100981766, "grad_norm": 0.29155951738357544, "learning_rate": 3.47029075479166e-05, "loss": 0.2401, "step": 2698 }, { "epoch": 1.8927068723702665, "grad_norm": 0.27380648255348206, "learning_rate": 3.469266852641582e-05, "loss": 0.0717, "step": 2699 }, { "epoch": 1.8934081346423564, "grad_norm": 0.28327181935310364, "learning_rate": 3.4682427591081016e-05, "loss": 0.2441, "step": 2700 }, { "epoch": 1.894109396914446, "grad_norm": 0.26635462045669556, "learning_rate": 3.4672184743934285e-05, "loss": 0.0941, "step": 2701 }, { "epoch": 1.8948106591865357, "grad_norm": 0.7571332454681396, "learning_rate": 3.4661939986998075e-05, "loss": 0.4057, "step": 2702 }, { "epoch": 1.8955119214586256, "grad_norm": 0.2575763463973999, "learning_rate": 3.465169332229525e-05, "loss": 0.0859, "step": 2703 }, { "epoch": 1.8962131837307152, "grad_norm": 0.2642214894294739, "learning_rate": 3.464144475184902e-05, "loss": 0.0945, "step": 2704 }, { "epoch": 1.8969144460028051, "grad_norm": 0.2555651068687439, "learning_rate": 3.4631194277683e-05, "loss": 0.086, "step": 2705 }, { "epoch": 1.8976157082748948, "grad_norm": 0.2642749547958374, "learning_rate": 3.462094190182114e-05, "loss": 0.0798, "step": 2706 }, { "epoch": 1.8983169705469845, "grad_norm": 0.2736316919326782, "learning_rate": 3.4610687626287794e-05, "loss": 0.2414, "step": 2707 }, { "epoch": 1.8990182328190743, "grad_norm": 0.2623199224472046, "learning_rate": 3.4600431453107705e-05, "loss": 0.0875, "step": 2708 }, { "epoch": 1.8997194950911642, "grad_norm": 0.26399534940719604, "learning_rate": 3.459017338430595e-05, "loss": 0.094, "step": 2709 }, { "epoch": 1.9004207573632539, "grad_norm": 0.7794820666313171, "learning_rate": 3.457991342190801e-05, "loss": 0.4091, "step": 2710 }, { "epoch": 1.9011220196353436, "grad_norm": 0.2616789937019348, "learning_rate": 3.456965156793974e-05, "loss": 0.0934, "step": 2711 }, { "epoch": 1.9018232819074332, "grad_norm": 0.2630029320716858, "learning_rate": 3.455938782442735e-05, "loss": 0.0942, "step": 2712 }, { "epoch": 1.902524544179523, "grad_norm": 0.2574983835220337, "learning_rate": 3.454912219339744e-05, "loss": 0.0864, "step": 2713 }, { "epoch": 1.903225806451613, "grad_norm": 0.75737464427948, "learning_rate": 3.4538854676876974e-05, "loss": 0.4055, "step": 2714 }, { "epoch": 1.9039270687237027, "grad_norm": 0.2605516314506531, "learning_rate": 3.4528585276893294e-05, "loss": 0.0934, "step": 2715 }, { "epoch": 1.9046283309957923, "grad_norm": 0.2620047926902771, "learning_rate": 3.45183139954741e-05, "loss": 0.0936, "step": 2716 }, { "epoch": 1.9053295932678822, "grad_norm": 0.2639998197555542, "learning_rate": 3.450804083464749e-05, "loss": 0.0938, "step": 2717 }, { "epoch": 1.906030855539972, "grad_norm": 1.2728328704833984, "learning_rate": 3.4497765796441897e-05, "loss": 0.5631, "step": 2718 }, { "epoch": 1.9067321178120618, "grad_norm": 0.2633334696292877, "learning_rate": 3.448748888288617e-05, "loss": 0.0797, "step": 2719 }, { "epoch": 1.9074333800841514, "grad_norm": 0.25650396943092346, "learning_rate": 3.447721009600949e-05, "loss": 0.0858, "step": 2720 }, { "epoch": 1.908134642356241, "grad_norm": 0.25907161831855774, "learning_rate": 3.446692943784141e-05, "loss": 0.2473, "step": 2721 }, { "epoch": 1.908835904628331, "grad_norm": 0.2824145555496216, "learning_rate": 3.4456646910411887e-05, "loss": 0.2477, "step": 2722 }, { "epoch": 1.9095371669004209, "grad_norm": 0.2747066617012024, "learning_rate": 3.4446362515751205e-05, "loss": 0.2522, "step": 2723 }, { "epoch": 1.9102384291725105, "grad_norm": 0.2560669779777527, "learning_rate": 3.4436076255890044e-05, "loss": 0.0848, "step": 2724 }, { "epoch": 1.9109396914446002, "grad_norm": 0.2645627558231354, "learning_rate": 3.442578813285944e-05, "loss": 0.0938, "step": 2725 }, { "epoch": 1.91164095371669, "grad_norm": 0.2800223231315613, "learning_rate": 3.4415498148690804e-05, "loss": 0.2499, "step": 2726 }, { "epoch": 1.91234221598878, "grad_norm": 0.2640516757965088, "learning_rate": 3.440520630541591e-05, "loss": 0.0938, "step": 2727 }, { "epoch": 1.9130434782608696, "grad_norm": 0.7719916701316833, "learning_rate": 3.439491260506689e-05, "loss": 0.4025, "step": 2728 }, { "epoch": 1.9137447405329593, "grad_norm": 0.2497432827949524, "learning_rate": 3.4384617049676276e-05, "loss": 0.0845, "step": 2729 }, { "epoch": 1.914446002805049, "grad_norm": 0.28799182176589966, "learning_rate": 3.437431964127692e-05, "loss": 0.2415, "step": 2730 }, { "epoch": 1.9151472650771388, "grad_norm": 0.2633853852748871, "learning_rate": 3.436402038190208e-05, "loss": 0.0938, "step": 2731 }, { "epoch": 1.9158485273492287, "grad_norm": 0.267171710729599, "learning_rate": 3.435371927358534e-05, "loss": 0.0944, "step": 2732 }, { "epoch": 1.9165497896213184, "grad_norm": 0.30439260601997375, "learning_rate": 3.43434163183607e-05, "loss": 0.2365, "step": 2733 }, { "epoch": 1.917251051893408, "grad_norm": 0.2729806900024414, "learning_rate": 3.433311151826247e-05, "loss": 0.2465, "step": 2734 }, { "epoch": 1.917952314165498, "grad_norm": 0.2662490904331207, "learning_rate": 3.432280487532538e-05, "loss": 0.094, "step": 2735 }, { "epoch": 1.9186535764375876, "grad_norm": 0.2640516459941864, "learning_rate": 3.4312496391584466e-05, "loss": 0.0937, "step": 2736 }, { "epoch": 1.9193548387096775, "grad_norm": 0.24219293892383575, "learning_rate": 3.4302186069075164e-05, "loss": 0.0748, "step": 2737 }, { "epoch": 1.9200561009817672, "grad_norm": 0.2533677816390991, "learning_rate": 3.429187390983327e-05, "loss": 0.0766, "step": 2738 }, { "epoch": 1.9207573632538568, "grad_norm": 0.257703959941864, "learning_rate": 3.428155991589493e-05, "loss": 0.0853, "step": 2739 }, { "epoch": 1.9214586255259467, "grad_norm": 0.2628351151943207, "learning_rate": 3.4271244089296685e-05, "loss": 0.093, "step": 2740 }, { "epoch": 1.9221598877980366, "grad_norm": 0.2631506621837616, "learning_rate": 3.4260926432075376e-05, "loss": 0.0927, "step": 2741 }, { "epoch": 1.9228611500701263, "grad_norm": 0.25519460439682007, "learning_rate": 3.425060694626826e-05, "loss": 0.0843, "step": 2742 }, { "epoch": 1.923562412342216, "grad_norm": 0.2538740634918213, "learning_rate": 3.4240285633912936e-05, "loss": 0.0838, "step": 2743 }, { "epoch": 1.9242636746143056, "grad_norm": 0.26258182525634766, "learning_rate": 3.422996249704737e-05, "loss": 0.0917, "step": 2744 }, { "epoch": 1.9249649368863955, "grad_norm": 0.262041836977005, "learning_rate": 3.421963753770987e-05, "loss": 0.0912, "step": 2745 }, { "epoch": 1.9256661991584854, "grad_norm": 0.24340017139911652, "learning_rate": 3.420931075793913e-05, "loss": 0.0812, "step": 2746 }, { "epoch": 1.926367461430575, "grad_norm": 0.24666719138622284, "learning_rate": 3.419898215977418e-05, "loss": 0.0814, "step": 2747 }, { "epoch": 1.9270687237026647, "grad_norm": 0.3154844045639038, "learning_rate": 3.418865174525443e-05, "loss": 0.2462, "step": 2748 }, { "epoch": 1.9277699859747546, "grad_norm": 0.25746414065361023, "learning_rate": 3.417831951641963e-05, "loss": 0.0892, "step": 2749 }, { "epoch": 1.9284712482468445, "grad_norm": 0.24960944056510925, "learning_rate": 3.41679854753099e-05, "loss": 0.0812, "step": 2750 }, { "epoch": 1.9291725105189341, "grad_norm": 0.24564659595489502, "learning_rate": 3.4157649623965716e-05, "loss": 0.0791, "step": 2751 }, { "epoch": 1.9298737727910238, "grad_norm": 0.3574863374233246, "learning_rate": 3.41473119644279e-05, "loss": 0.2488, "step": 2752 }, { "epoch": 1.9305750350631135, "grad_norm": 0.2394183874130249, "learning_rate": 3.413697249873765e-05, "loss": 0.0785, "step": 2753 }, { "epoch": 1.9312762973352033, "grad_norm": 0.2516705393791199, "learning_rate": 3.412663122893651e-05, "loss": 0.0856, "step": 2754 }, { "epoch": 1.9319775596072932, "grad_norm": 0.22228248417377472, "learning_rate": 3.411628815706638e-05, "loss": 0.068, "step": 2755 }, { "epoch": 1.932678821879383, "grad_norm": 0.25377506017684937, "learning_rate": 3.410594328516952e-05, "loss": 0.086, "step": 2756 }, { "epoch": 1.9333800841514726, "grad_norm": 6.4758758544921875, "learning_rate": 3.4095596615288536e-05, "loss": 0.9103, "step": 2757 }, { "epoch": 1.9340813464235624, "grad_norm": 0.24962982535362244, "learning_rate": 3.40852481494664e-05, "loss": 0.0845, "step": 2758 }, { "epoch": 1.9347826086956523, "grad_norm": 0.36575430631637573, "learning_rate": 3.4074897889746427e-05, "loss": 0.2532, "step": 2759 }, { "epoch": 1.935483870967742, "grad_norm": 0.3513936996459961, "learning_rate": 3.4064545838172314e-05, "loss": 0.2649, "step": 2760 }, { "epoch": 1.9361851332398317, "grad_norm": 6.195801258087158, "learning_rate": 3.405419199678807e-05, "loss": 0.7624, "step": 2761 }, { "epoch": 1.9368863955119213, "grad_norm": 0.23430095613002777, "learning_rate": 3.404383636763809e-05, "loss": 0.0753, "step": 2762 }, { "epoch": 1.9375876577840112, "grad_norm": 0.24858713150024414, "learning_rate": 3.40334789527671e-05, "loss": 0.085, "step": 2763 }, { "epoch": 1.938288920056101, "grad_norm": 0.34300515055656433, "learning_rate": 3.402311975422021e-05, "loss": 0.2498, "step": 2764 }, { "epoch": 1.9389901823281908, "grad_norm": 0.33354252576828003, "learning_rate": 3.4012758774042837e-05, "loss": 0.2584, "step": 2765 }, { "epoch": 1.9396914446002804, "grad_norm": 0.2475176304578781, "learning_rate": 3.4002396014280786e-05, "loss": 0.0847, "step": 2766 }, { "epoch": 1.9403927068723703, "grad_norm": 0.32928937673568726, "learning_rate": 3.3992031476980205e-05, "loss": 0.2621, "step": 2767 }, { "epoch": 1.94109396914446, "grad_norm": 0.24712644517421722, "learning_rate": 3.3981665164187584e-05, "loss": 0.0846, "step": 2768 }, { "epoch": 1.9417952314165499, "grad_norm": 0.237661674618721, "learning_rate": 3.397129707794977e-05, "loss": 0.0769, "step": 2769 }, { "epoch": 1.9424964936886395, "grad_norm": 0.3598715364933014, "learning_rate": 3.3960927220313957e-05, "loss": 0.2421, "step": 2770 }, { "epoch": 1.9431977559607292, "grad_norm": 0.8642219305038452, "learning_rate": 3.3950555593327704e-05, "loss": 0.4323, "step": 2771 }, { "epoch": 1.943899018232819, "grad_norm": 0.33017706871032715, "learning_rate": 3.3940182199038893e-05, "loss": 0.2615, "step": 2772 }, { "epoch": 1.944600280504909, "grad_norm": 4.397739410400391, "learning_rate": 3.392980703949577e-05, "loss": 0.5909, "step": 2773 }, { "epoch": 1.9453015427769986, "grad_norm": 0.3359335660934448, "learning_rate": 3.391943011674694e-05, "loss": 0.2467, "step": 2774 }, { "epoch": 1.9460028050490883, "grad_norm": 0.32860705256462097, "learning_rate": 3.390905143284132e-05, "loss": 0.2384, "step": 2775 }, { "epoch": 1.946704067321178, "grad_norm": 0.3180951178073883, "learning_rate": 3.389867098982823e-05, "loss": 0.2487, "step": 2776 }, { "epoch": 1.9474053295932678, "grad_norm": 0.2871747612953186, "learning_rate": 3.388828878975727e-05, "loss": 0.2528, "step": 2777 }, { "epoch": 1.9481065918653577, "grad_norm": 0.24554355442523956, "learning_rate": 3.3877904834678464e-05, "loss": 0.0817, "step": 2778 }, { "epoch": 1.9488078541374474, "grad_norm": 0.252697229385376, "learning_rate": 3.386751912664211e-05, "loss": 0.0897, "step": 2779 }, { "epoch": 1.949509116409537, "grad_norm": 0.29307445883750916, "learning_rate": 3.385713166769889e-05, "loss": 0.2552, "step": 2780 }, { "epoch": 1.950210378681627, "grad_norm": 0.2537166178226471, "learning_rate": 3.384674245989985e-05, "loss": 0.0904, "step": 2781 }, { "epoch": 1.9509116409537168, "grad_norm": 0.279345840215683, "learning_rate": 3.383635150529632e-05, "loss": 0.2508, "step": 2782 }, { "epoch": 1.9516129032258065, "grad_norm": 0.29612624645233154, "learning_rate": 3.3825958805940034e-05, "loss": 0.2467, "step": 2783 }, { "epoch": 1.9523141654978962, "grad_norm": 0.2623206675052643, "learning_rate": 3.381556436388305e-05, "loss": 0.0871, "step": 2784 }, { "epoch": 1.9530154277699858, "grad_norm": 0.2533935606479645, "learning_rate": 3.380516818117776e-05, "loss": 0.0907, "step": 2785 }, { "epoch": 1.9537166900420757, "grad_norm": 0.7876799702644348, "learning_rate": 3.379477025987689e-05, "loss": 0.406, "step": 2786 }, { "epoch": 1.9544179523141656, "grad_norm": 0.27839145064353943, "learning_rate": 3.378437060203357e-05, "loss": 0.2523, "step": 2787 }, { "epoch": 1.9551192145862553, "grad_norm": 0.25464239716529846, "learning_rate": 3.377396920970121e-05, "loss": 0.0855, "step": 2788 }, { "epoch": 1.955820476858345, "grad_norm": 0.25658783316612244, "learning_rate": 3.376356608493357e-05, "loss": 0.0927, "step": 2789 }, { "epoch": 1.9565217391304348, "grad_norm": 0.2562657296657562, "learning_rate": 3.3753161229784766e-05, "loss": 0.0919, "step": 2790 }, { "epoch": 1.9572230014025247, "grad_norm": 0.2666504383087158, "learning_rate": 3.3742754646309286e-05, "loss": 0.2518, "step": 2791 }, { "epoch": 1.9579242636746144, "grad_norm": 0.2871205806732178, "learning_rate": 3.3732346336561896e-05, "loss": 0.2467, "step": 2792 }, { "epoch": 1.958625525946704, "grad_norm": 0.257540225982666, "learning_rate": 3.3721936302597735e-05, "loss": 0.0929, "step": 2793 }, { "epoch": 1.9593267882187937, "grad_norm": 0.33511799573898315, "learning_rate": 3.37115245464723e-05, "loss": 0.0741, "step": 2794 }, { "epoch": 1.9600280504908836, "grad_norm": 0.27549949288368225, "learning_rate": 3.37011110702414e-05, "loss": 0.2537, "step": 2795 }, { "epoch": 1.9607293127629735, "grad_norm": 0.2568589150905609, "learning_rate": 3.3690695875961195e-05, "loss": 0.0925, "step": 2796 }, { "epoch": 1.9614305750350631, "grad_norm": 0.26523733139038086, "learning_rate": 3.3680278965688185e-05, "loss": 0.0792, "step": 2797 }, { "epoch": 1.9621318373071528, "grad_norm": 0.2561504542827606, "learning_rate": 3.36698603414792e-05, "loss": 0.0922, "step": 2798 }, { "epoch": 1.9628330995792427, "grad_norm": 0.3054198920726776, "learning_rate": 3.365944000539143e-05, "loss": 0.2476, "step": 2799 }, { "epoch": 1.9635343618513323, "grad_norm": 0.2883957624435425, "learning_rate": 3.364901795948237e-05, "loss": 0.252, "step": 2800 }, { "epoch": 1.9642356241234222, "grad_norm": 3.710678815841675, "learning_rate": 3.3638594205809874e-05, "loss": 0.4617, "step": 2801 }, { "epoch": 1.964936886395512, "grad_norm": 0.7906757593154907, "learning_rate": 3.362816874643214e-05, "loss": 0.4112, "step": 2802 }, { "epoch": 1.9656381486676016, "grad_norm": 0.2586943209171295, "learning_rate": 3.36177415834077e-05, "loss": 0.093, "step": 2803 }, { "epoch": 1.9663394109396914, "grad_norm": 0.26001113653182983, "learning_rate": 3.360731271879538e-05, "loss": 0.0931, "step": 2804 }, { "epoch": 1.9670406732117813, "grad_norm": 0.26154062151908875, "learning_rate": 3.359688215465442e-05, "loss": 0.0871, "step": 2805 }, { "epoch": 1.967741935483871, "grad_norm": 0.2589048743247986, "learning_rate": 3.358644989304433e-05, "loss": 0.0934, "step": 2806 }, { "epoch": 1.9684431977559607, "grad_norm": 0.2597263753414154, "learning_rate": 3.357601593602498e-05, "loss": 0.0875, "step": 2807 }, { "epoch": 1.9691444600280503, "grad_norm": 0.28766071796417236, "learning_rate": 3.356558028565657e-05, "loss": 0.2492, "step": 2808 }, { "epoch": 1.9698457223001402, "grad_norm": 0.29485711455345154, "learning_rate": 3.355514294399965e-05, "loss": 0.2397, "step": 2809 }, { "epoch": 1.97054698457223, "grad_norm": 0.2578175365924835, "learning_rate": 3.354470391311507e-05, "loss": 0.0873, "step": 2810 }, { "epoch": 1.9712482468443198, "grad_norm": 0.25394582748413086, "learning_rate": 3.353426319506405e-05, "loss": 0.086, "step": 2811 }, { "epoch": 1.9719495091164094, "grad_norm": 0.25732165575027466, "learning_rate": 3.352382079190813e-05, "loss": 0.0929, "step": 2812 }, { "epoch": 1.9726507713884993, "grad_norm": 0.2744271755218506, "learning_rate": 3.351337670570917e-05, "loss": 0.0818, "step": 2813 }, { "epoch": 1.9733520336605892, "grad_norm": 0.2570578455924988, "learning_rate": 3.3502930938529374e-05, "loss": 0.093, "step": 2814 }, { "epoch": 1.9740532959326789, "grad_norm": 0.2567186653614044, "learning_rate": 3.349248349243129e-05, "loss": 0.0922, "step": 2815 }, { "epoch": 1.9747545582047685, "grad_norm": 0.7767812609672546, "learning_rate": 3.3482034369477766e-05, "loss": 0.4015, "step": 2816 }, { "epoch": 1.9754558204768582, "grad_norm": 0.2580896317958832, "learning_rate": 3.347158357173201e-05, "loss": 0.0859, "step": 2817 }, { "epoch": 1.976157082748948, "grad_norm": 0.26000750064849854, "learning_rate": 3.3461131101257546e-05, "loss": 0.0862, "step": 2818 }, { "epoch": 1.976858345021038, "grad_norm": 0.2570190131664276, "learning_rate": 3.345067696011824e-05, "loss": 0.0925, "step": 2819 }, { "epoch": 1.9775596072931276, "grad_norm": 0.27939197421073914, "learning_rate": 3.344022115037826e-05, "loss": 0.2535, "step": 2820 }, { "epoch": 1.9782608695652173, "grad_norm": 0.28465476632118225, "learning_rate": 3.342976367410215e-05, "loss": 0.2443, "step": 2821 }, { "epoch": 1.9789621318373072, "grad_norm": 0.2854703366756439, "learning_rate": 3.341930453335474e-05, "loss": 0.2474, "step": 2822 }, { "epoch": 1.979663394109397, "grad_norm": 0.2580248713493347, "learning_rate": 3.3408843730201214e-05, "loss": 0.0858, "step": 2823 }, { "epoch": 1.9803646563814867, "grad_norm": 0.28614509105682373, "learning_rate": 3.339838126670706e-05, "loss": 0.2433, "step": 2824 }, { "epoch": 1.9810659186535764, "grad_norm": 0.29315993189811707, "learning_rate": 3.3387917144938124e-05, "loss": 0.2549, "step": 2825 }, { "epoch": 1.981767180925666, "grad_norm": 0.24929694831371307, "learning_rate": 3.337745136696057e-05, "loss": 0.084, "step": 2826 }, { "epoch": 1.982468443197756, "grad_norm": 0.2547084093093872, "learning_rate": 3.336698393484087e-05, "loss": 0.0919, "step": 2827 }, { "epoch": 1.9831697054698458, "grad_norm": 0.31145840883255005, "learning_rate": 3.335651485064583e-05, "loss": 0.2418, "step": 2828 }, { "epoch": 1.9838709677419355, "grad_norm": 0.25666263699531555, "learning_rate": 3.334604411644261e-05, "loss": 0.0762, "step": 2829 }, { "epoch": 1.9845722300140252, "grad_norm": 0.26737019419670105, "learning_rate": 3.333557173429866e-05, "loss": 0.0791, "step": 2830 }, { "epoch": 1.985273492286115, "grad_norm": 0.28846320509910583, "learning_rate": 3.3325097706281776e-05, "loss": 0.242, "step": 2831 }, { "epoch": 1.9859747545582047, "grad_norm": 0.2525695264339447, "learning_rate": 3.331462203446007e-05, "loss": 0.0773, "step": 2832 }, { "epoch": 1.9866760168302946, "grad_norm": 3.765403985977173, "learning_rate": 3.330414472090199e-05, "loss": 0.5212, "step": 2833 }, { "epoch": 1.9873772791023843, "grad_norm": 0.24735024571418762, "learning_rate": 3.329366576767628e-05, "loss": 0.0834, "step": 2834 }, { "epoch": 1.988078541374474, "grad_norm": 0.2534492015838623, "learning_rate": 3.328318517685204e-05, "loss": 0.0906, "step": 2835 }, { "epoch": 1.9887798036465638, "grad_norm": 0.2912791669368744, "learning_rate": 3.327270295049868e-05, "loss": 0.2427, "step": 2836 }, { "epoch": 1.9894810659186537, "grad_norm": 0.25244107842445374, "learning_rate": 3.326221909068594e-05, "loss": 0.0905, "step": 2837 }, { "epoch": 1.9901823281907434, "grad_norm": 0.25358426570892334, "learning_rate": 3.325173359948387e-05, "loss": 0.0908, "step": 2838 }, { "epoch": 1.990883590462833, "grad_norm": 0.25070786476135254, "learning_rate": 3.324124647896284e-05, "loss": 0.0906, "step": 2839 }, { "epoch": 1.9915848527349227, "grad_norm": 0.2600039839744568, "learning_rate": 3.3230757731193564e-05, "loss": 0.076, "step": 2840 }, { "epoch": 1.9922861150070126, "grad_norm": 0.25159725546836853, "learning_rate": 3.3220267358247056e-05, "loss": 0.0896, "step": 2841 }, { "epoch": 1.9929873772791025, "grad_norm": 1.3108097314834595, "learning_rate": 3.320977536219465e-05, "loss": 0.5805, "step": 2842 }, { "epoch": 1.9936886395511921, "grad_norm": 0.25189873576164246, "learning_rate": 3.319928174510802e-05, "loss": 0.0896, "step": 2843 }, { "epoch": 1.9943899018232818, "grad_norm": 0.2538452446460724, "learning_rate": 3.318878650905915e-05, "loss": 0.067, "step": 2844 }, { "epoch": 1.9950911640953717, "grad_norm": 0.2521633505821228, "learning_rate": 3.317828965612034e-05, "loss": 0.0894, "step": 2845 }, { "epoch": 1.9957924263674616, "grad_norm": 0.30617907643318176, "learning_rate": 3.31677911883642e-05, "loss": 0.2466, "step": 2846 }, { "epoch": 1.9964936886395512, "grad_norm": 0.24947340786457062, "learning_rate": 3.31572911078637e-05, "loss": 0.0888, "step": 2847 }, { "epoch": 1.997194950911641, "grad_norm": 0.8185268044471741, "learning_rate": 3.314678941669206e-05, "loss": 0.413, "step": 2848 }, { "epoch": 1.9978962131837306, "grad_norm": 0.30889466404914856, "learning_rate": 3.313628611692289e-05, "loss": 0.2481, "step": 2849 }, { "epoch": 1.9985974754558204, "grad_norm": 0.2472163587808609, "learning_rate": 3.312578121063006e-05, "loss": 0.0751, "step": 2850 }, { "epoch": 1.9992987377279103, "grad_norm": 0.25069352984428406, "learning_rate": 3.31152746998878e-05, "loss": 0.0892, "step": 2851 }, { "epoch": 2.0, "grad_norm": 0.2528415322303772, "learning_rate": 3.310476658677063e-05, "loss": 0.0901, "step": 2852 }, { "epoch": 2.0, "eval_f1 (minor class)": 0.0, "eval_loss": 0.1738959699869156, "eval_roc_auc": 0.5282848138787481, "eval_runtime": 233.3119, "eval_samples_per_second": 5.435, "eval_steps_per_second": 1.359, "step": 2852 }, { "epoch": 2.0007012622720897, "grad_norm": 0.24420195817947388, "learning_rate": 3.309425687335339e-05, "loss": 0.073, "step": 2853 }, { "epoch": 2.0014025245441793, "grad_norm": 0.24949035048484802, "learning_rate": 3.3083745561711254e-05, "loss": 0.0886, "step": 2854 }, { "epoch": 2.0021037868162694, "grad_norm": 0.24352121353149414, "learning_rate": 3.3073232653919696e-05, "loss": 0.0806, "step": 2855 }, { "epoch": 2.002805049088359, "grad_norm": 0.2450723946094513, "learning_rate": 3.3062718152054496e-05, "loss": 0.0747, "step": 2856 }, { "epoch": 2.0035063113604488, "grad_norm": 0.3329989016056061, "learning_rate": 3.305220205819176e-05, "loss": 0.241, "step": 2857 }, { "epoch": 2.0042075736325384, "grad_norm": 0.3233387768268585, "learning_rate": 3.304168437440793e-05, "loss": 0.2497, "step": 2858 }, { "epoch": 2.0049088359046285, "grad_norm": 0.30241668224334717, "learning_rate": 3.3031165102779724e-05, "loss": 0.2539, "step": 2859 }, { "epoch": 2.005610098176718, "grad_norm": 0.34913209080696106, "learning_rate": 3.302064424538419e-05, "loss": 0.2448, "step": 2860 }, { "epoch": 2.006311360448808, "grad_norm": 0.31483304500579834, "learning_rate": 3.30101218042987e-05, "loss": 0.247, "step": 2861 }, { "epoch": 2.0070126227208975, "grad_norm": 0.2703505754470825, "learning_rate": 3.299959778160092e-05, "loss": 0.0495, "step": 2862 }, { "epoch": 2.007713884992987, "grad_norm": 0.24164605140686035, "learning_rate": 3.298907217936883e-05, "loss": 0.0731, "step": 2863 }, { "epoch": 2.0084151472650773, "grad_norm": 0.31432145833969116, "learning_rate": 3.2978544999680736e-05, "loss": 0.2459, "step": 2864 }, { "epoch": 2.009116409537167, "grad_norm": 0.2501172125339508, "learning_rate": 3.296801624461525e-05, "loss": 0.0881, "step": 2865 }, { "epoch": 2.0098176718092566, "grad_norm": 0.24958059191703796, "learning_rate": 3.295748591625129e-05, "loss": 0.0872, "step": 2866 }, { "epoch": 2.0105189340813463, "grad_norm": 0.8566231727600098, "learning_rate": 3.2946954016668086e-05, "loss": 0.4055, "step": 2867 }, { "epoch": 2.0112201963534364, "grad_norm": 0.314657062292099, "learning_rate": 3.293642054794519e-05, "loss": 0.2582, "step": 2868 }, { "epoch": 2.011921458625526, "grad_norm": 0.2452186644077301, "learning_rate": 3.292588551216243e-05, "loss": 0.0809, "step": 2869 }, { "epoch": 2.0126227208976157, "grad_norm": 0.8482988476753235, "learning_rate": 3.291534891139998e-05, "loss": 0.4158, "step": 2870 }, { "epoch": 2.0133239831697054, "grad_norm": 0.23982878029346466, "learning_rate": 3.290481074773832e-05, "loss": 0.0799, "step": 2871 }, { "epoch": 2.014025245441795, "grad_norm": 0.2539716362953186, "learning_rate": 3.289427102325822e-05, "loss": 0.0887, "step": 2872 }, { "epoch": 2.014726507713885, "grad_norm": 0.23757599294185638, "learning_rate": 3.2883729740040764e-05, "loss": 0.0793, "step": 2873 }, { "epoch": 2.015427769985975, "grad_norm": 0.2535957396030426, "learning_rate": 3.2873186900167355e-05, "loss": 0.0885, "step": 2874 }, { "epoch": 2.0161290322580645, "grad_norm": 4.902713298797607, "learning_rate": 3.286264250571968e-05, "loss": 0.7855, "step": 2875 }, { "epoch": 2.016830294530154, "grad_norm": 0.23848296701908112, "learning_rate": 3.2852096558779754e-05, "loss": 0.0795, "step": 2876 }, { "epoch": 2.017531556802244, "grad_norm": 0.30523914098739624, "learning_rate": 3.28415490614299e-05, "loss": 0.2563, "step": 2877 }, { "epoch": 2.018232819074334, "grad_norm": 0.25232288241386414, "learning_rate": 3.283100001575274e-05, "loss": 0.0885, "step": 2878 }, { "epoch": 2.0189340813464236, "grad_norm": 0.31757330894470215, "learning_rate": 3.2820449423831186e-05, "loss": 0.2449, "step": 2879 }, { "epoch": 2.0196353436185133, "grad_norm": 0.8439903259277344, "learning_rate": 3.280989728774848e-05, "loss": 0.4007, "step": 2880 }, { "epoch": 2.020336605890603, "grad_norm": 0.311911940574646, "learning_rate": 3.279934360958816e-05, "loss": 0.2477, "step": 2881 }, { "epoch": 2.021037868162693, "grad_norm": 0.31469953060150146, "learning_rate": 3.278878839143407e-05, "loss": 0.2467, "step": 2882 }, { "epoch": 2.0217391304347827, "grad_norm": 0.24170814454555511, "learning_rate": 3.2778231635370346e-05, "loss": 0.0806, "step": 2883 }, { "epoch": 2.0224403927068724, "grad_norm": 0.31074008345603943, "learning_rate": 3.2767673343481456e-05, "loss": 0.2456, "step": 2884 }, { "epoch": 2.023141654978962, "grad_norm": 0.2459161877632141, "learning_rate": 3.275711351785213e-05, "loss": 0.0748, "step": 2885 }, { "epoch": 2.0238429172510517, "grad_norm": 0.25516635179519653, "learning_rate": 3.2746552160567446e-05, "loss": 0.0907, "step": 2886 }, { "epoch": 2.024544179523142, "grad_norm": 0.3127748370170593, "learning_rate": 3.2735989273712744e-05, "loss": 0.2477, "step": 2887 }, { "epoch": 2.0252454417952315, "grad_norm": 0.24473729729652405, "learning_rate": 3.272542485937369e-05, "loss": 0.0826, "step": 2888 }, { "epoch": 2.025946704067321, "grad_norm": 0.800141453742981, "learning_rate": 3.271485891963625e-05, "loss": 0.4173, "step": 2889 }, { "epoch": 2.026647966339411, "grad_norm": 0.2589477002620697, "learning_rate": 3.2704291456586686e-05, "loss": 0.092, "step": 2890 }, { "epoch": 2.027349228611501, "grad_norm": 0.24591121077537537, "learning_rate": 3.269372247231155e-05, "loss": 0.075, "step": 2891 }, { "epoch": 2.0280504908835906, "grad_norm": 0.30050012469291687, "learning_rate": 3.2683151968897724e-05, "loss": 0.2427, "step": 2892 }, { "epoch": 2.0287517531556802, "grad_norm": 0.25799015164375305, "learning_rate": 3.267257994843236e-05, "loss": 0.0916, "step": 2893 }, { "epoch": 2.02945301542777, "grad_norm": 0.2610483765602112, "learning_rate": 3.266200641300293e-05, "loss": 0.0921, "step": 2894 }, { "epoch": 2.0301542776998596, "grad_norm": 0.25079378485679626, "learning_rate": 3.2651431364697186e-05, "loss": 0.0663, "step": 2895 }, { "epoch": 2.0308555399719497, "grad_norm": 4.476100444793701, "learning_rate": 3.264085480560319e-05, "loss": 0.5518, "step": 2896 }, { "epoch": 2.0315568022440393, "grad_norm": 0.28685513138771057, "learning_rate": 3.2630276737809315e-05, "loss": 0.253, "step": 2897 }, { "epoch": 2.032258064516129, "grad_norm": 0.7752599716186523, "learning_rate": 3.2619697163404216e-05, "loss": 0.4116, "step": 2898 }, { "epoch": 2.0329593267882187, "grad_norm": 0.25406166911125183, "learning_rate": 3.260911608447684e-05, "loss": 0.0854, "step": 2899 }, { "epoch": 2.0336605890603083, "grad_norm": 0.25225579738616943, "learning_rate": 3.259853350311644e-05, "loss": 0.0843, "step": 2900 }, { "epoch": 2.0343618513323984, "grad_norm": 0.2648591995239258, "learning_rate": 3.258794942141257e-05, "loss": 0.2471, "step": 2901 }, { "epoch": 2.035063113604488, "grad_norm": 0.2618024945259094, "learning_rate": 3.257736384145506e-05, "loss": 0.0932, "step": 2902 }, { "epoch": 2.0357643758765778, "grad_norm": 0.28303423523902893, "learning_rate": 3.256677676533408e-05, "loss": 0.2407, "step": 2903 }, { "epoch": 2.0364656381486674, "grad_norm": 0.26240411400794983, "learning_rate": 3.255618819514004e-05, "loss": 0.0939, "step": 2904 }, { "epoch": 2.0371669004207575, "grad_norm": 0.2615704834461212, "learning_rate": 3.254559813296368e-05, "loss": 0.0793, "step": 2905 }, { "epoch": 2.037868162692847, "grad_norm": 0.263500839471817, "learning_rate": 3.2535006580896024e-05, "loss": 0.0935, "step": 2906 }, { "epoch": 2.038569424964937, "grad_norm": 0.2609994113445282, "learning_rate": 3.25244135410284e-05, "loss": 0.0935, "step": 2907 }, { "epoch": 2.0392706872370265, "grad_norm": 0.26627644896507263, "learning_rate": 3.251381901545242e-05, "loss": 0.2482, "step": 2908 }, { "epoch": 2.039971949509116, "grad_norm": 0.778956949710846, "learning_rate": 3.2503223006259974e-05, "loss": 0.4133, "step": 2909 }, { "epoch": 2.0406732117812063, "grad_norm": 0.27501824498176575, "learning_rate": 3.249262551554329e-05, "loss": 0.2409, "step": 2910 }, { "epoch": 2.041374474053296, "grad_norm": 0.2844730615615845, "learning_rate": 3.248202654539484e-05, "loss": 0.2427, "step": 2911 }, { "epoch": 2.0420757363253856, "grad_norm": 0.26213210821151733, "learning_rate": 3.2471426097907413e-05, "loss": 0.0934, "step": 2912 }, { "epoch": 2.0427769985974753, "grad_norm": 0.25519290566444397, "learning_rate": 3.2460824175174096e-05, "loss": 0.0859, "step": 2913 }, { "epoch": 2.0434782608695654, "grad_norm": 0.27215972542762756, "learning_rate": 3.2450220779288246e-05, "loss": 0.2413, "step": 2914 }, { "epoch": 2.044179523141655, "grad_norm": 0.26790010929107666, "learning_rate": 3.2439615912343526e-05, "loss": 0.0948, "step": 2915 }, { "epoch": 2.0448807854137447, "grad_norm": 0.26281920075416565, "learning_rate": 3.2429009576433875e-05, "loss": 0.0938, "step": 2916 }, { "epoch": 2.0455820476858344, "grad_norm": 0.2544053792953491, "learning_rate": 3.241840177365355e-05, "loss": 0.0861, "step": 2917 }, { "epoch": 2.046283309957924, "grad_norm": 0.26326197385787964, "learning_rate": 3.2407792506097066e-05, "loss": 0.0861, "step": 2918 }, { "epoch": 2.046984572230014, "grad_norm": 4.018224239349365, "learning_rate": 3.2397181775859246e-05, "loss": 0.6685, "step": 2919 }, { "epoch": 2.047685834502104, "grad_norm": 0.2836417257785797, "learning_rate": 3.23865695850352e-05, "loss": 0.2527, "step": 2920 }, { "epoch": 2.0483870967741935, "grad_norm": 3.0721871852874756, "learning_rate": 3.237595593572032e-05, "loss": 0.6997, "step": 2921 }, { "epoch": 2.049088359046283, "grad_norm": 0.7480080127716064, "learning_rate": 3.2365340830010285e-05, "loss": 0.4032, "step": 2922 }, { "epoch": 2.0497896213183733, "grad_norm": 0.26900431513786316, "learning_rate": 3.235472427000107e-05, "loss": 0.2416, "step": 2923 }, { "epoch": 2.050490883590463, "grad_norm": 0.2639055848121643, "learning_rate": 3.2344106257788945e-05, "loss": 0.089, "step": 2924 }, { "epoch": 2.0511921458625526, "grad_norm": 0.28799688816070557, "learning_rate": 3.233348679547043e-05, "loss": 0.2379, "step": 2925 }, { "epoch": 2.0518934081346423, "grad_norm": 0.27147817611694336, "learning_rate": 3.2322865885142375e-05, "loss": 0.2447, "step": 2926 }, { "epoch": 2.052594670406732, "grad_norm": 0.2620126008987427, "learning_rate": 3.231224352890189e-05, "loss": 0.2526, "step": 2927 }, { "epoch": 2.053295932678822, "grad_norm": 0.26090875267982483, "learning_rate": 3.230161972884638e-05, "loss": 0.2462, "step": 2928 }, { "epoch": 2.0539971949509117, "grad_norm": 0.2656753361225128, "learning_rate": 3.229099448707352e-05, "loss": 0.0894, "step": 2929 }, { "epoch": 2.0546984572230014, "grad_norm": 0.274080753326416, "learning_rate": 3.228036780568131e-05, "loss": 0.0978, "step": 2930 }, { "epoch": 2.055399719495091, "grad_norm": 0.2613712251186371, "learning_rate": 3.226973968676797e-05, "loss": 0.2396, "step": 2931 }, { "epoch": 2.056100981767181, "grad_norm": 0.2630743086338043, "learning_rate": 3.225911013243208e-05, "loss": 0.2429, "step": 2932 }, { "epoch": 2.056802244039271, "grad_norm": 2.800959587097168, "learning_rate": 3.224847914477243e-05, "loss": 0.6464, "step": 2933 }, { "epoch": 2.0575035063113605, "grad_norm": 0.24379713833332062, "learning_rate": 3.223784672588815e-05, "loss": 0.2482, "step": 2934 }, { "epoch": 2.05820476858345, "grad_norm": 0.2782478630542755, "learning_rate": 3.222721287787861e-05, "loss": 0.0929, "step": 2935 }, { "epoch": 2.05890603085554, "grad_norm": 0.2819986045360565, "learning_rate": 3.2216577602843496e-05, "loss": 0.0944, "step": 2936 }, { "epoch": 2.05960729312763, "grad_norm": 0.7211698889732361, "learning_rate": 3.2205940902882755e-05, "loss": 0.3909, "step": 2937 }, { "epoch": 2.0603085553997196, "grad_norm": 0.7222282290458679, "learning_rate": 3.219530278009663e-05, "loss": 0.3819, "step": 2938 }, { "epoch": 2.0610098176718092, "grad_norm": 0.7155467867851257, "learning_rate": 3.2184663236585615e-05, "loss": 0.3898, "step": 2939 }, { "epoch": 2.061711079943899, "grad_norm": 0.23690442740917206, "learning_rate": 3.2174022274450534e-05, "loss": 0.2471, "step": 2940 }, { "epoch": 2.0624123422159886, "grad_norm": 0.2940015494823456, "learning_rate": 3.216337989579244e-05, "loss": 0.0973, "step": 2941 }, { "epoch": 2.0631136044880787, "grad_norm": 1.6085069179534912, "learning_rate": 3.21527361027127e-05, "loss": 0.2768, "step": 2942 }, { "epoch": 2.0638148667601683, "grad_norm": 0.30063048005104065, "learning_rate": 3.214209089731296e-05, "loss": 0.0973, "step": 2943 }, { "epoch": 2.064516129032258, "grad_norm": 0.38897523283958435, "learning_rate": 3.2131444281695114e-05, "loss": 0.095, "step": 2944 }, { "epoch": 2.0652173913043477, "grad_norm": 1.1840753555297852, "learning_rate": 3.212079625796136e-05, "loss": 0.5234, "step": 2945 }, { "epoch": 2.065918653576438, "grad_norm": 0.22477175295352936, "learning_rate": 3.2110146828214174e-05, "loss": 0.2411, "step": 2946 }, { "epoch": 2.0666199158485274, "grad_norm": 0.31442180275917053, "learning_rate": 3.2099495994556305e-05, "loss": 0.1015, "step": 2947 }, { "epoch": 2.067321178120617, "grad_norm": 0.31514042615890503, "learning_rate": 3.2088843759090776e-05, "loss": 0.1011, "step": 2948 }, { "epoch": 2.0680224403927068, "grad_norm": 0.33047816157341003, "learning_rate": 3.207819012392088e-05, "loss": 0.1051, "step": 2949 }, { "epoch": 2.0687237026647964, "grad_norm": 0.29170483350753784, "learning_rate": 3.206753509115021e-05, "loss": 0.1034, "step": 2950 }, { "epoch": 2.0694249649368865, "grad_norm": 0.36705532670021057, "learning_rate": 3.205687866288262e-05, "loss": 0.1013, "step": 2951 }, { "epoch": 2.070126227208976, "grad_norm": 0.687567412853241, "learning_rate": 3.204622084122222e-05, "loss": 0.3802, "step": 2952 }, { "epoch": 2.070827489481066, "grad_norm": 0.31109321117401123, "learning_rate": 3.203556162827343e-05, "loss": 0.1009, "step": 2953 }, { "epoch": 2.0715287517531555, "grad_norm": 0.3127570152282715, "learning_rate": 3.2024901026140936e-05, "loss": 0.1011, "step": 2954 }, { "epoch": 2.0722300140252456, "grad_norm": 0.35082799196243286, "learning_rate": 3.201423903692969e-05, "loss": 0.0983, "step": 2955 }, { "epoch": 2.0729312762973353, "grad_norm": 0.30403587222099304, "learning_rate": 3.200357566274491e-05, "loss": 0.0995, "step": 2956 }, { "epoch": 2.073632538569425, "grad_norm": 0.31108370423316956, "learning_rate": 3.19929109056921e-05, "loss": 0.1008, "step": 2957 }, { "epoch": 2.0743338008415146, "grad_norm": 0.24237680435180664, "learning_rate": 3.198224476787704e-05, "loss": 0.2445, "step": 2958 }, { "epoch": 2.0750350631136043, "grad_norm": 0.2415393888950348, "learning_rate": 3.197157725140577e-05, "loss": 0.2381, "step": 2959 }, { "epoch": 2.0757363253856944, "grad_norm": 0.22858059406280518, "learning_rate": 3.196090835838462e-05, "loss": 0.245, "step": 2960 }, { "epoch": 2.076437587657784, "grad_norm": 0.287895530462265, "learning_rate": 3.195023809092017e-05, "loss": 0.1025, "step": 2961 }, { "epoch": 2.0771388499298737, "grad_norm": 0.3053743839263916, "learning_rate": 3.1939566451119294e-05, "loss": 0.0992, "step": 2962 }, { "epoch": 2.0778401122019634, "grad_norm": 0.30398598313331604, "learning_rate": 3.192889344108913e-05, "loss": 0.0994, "step": 2963 }, { "epoch": 2.078541374474053, "grad_norm": 0.29089120030403137, "learning_rate": 3.191821906293705e-05, "loss": 0.0964, "step": 2964 }, { "epoch": 2.079242636746143, "grad_norm": 0.29651010036468506, "learning_rate": 3.190754331877076e-05, "loss": 0.0961, "step": 2965 }, { "epoch": 2.079943899018233, "grad_norm": 0.2807411849498749, "learning_rate": 3.1896866210698195e-05, "loss": 0.094, "step": 2966 }, { "epoch": 2.0806451612903225, "grad_norm": 0.27979058027267456, "learning_rate": 3.188618774082756e-05, "loss": 0.0998, "step": 2967 }, { "epoch": 2.081346423562412, "grad_norm": 0.23912803828716278, "learning_rate": 3.187550791126735e-05, "loss": 0.2458, "step": 2968 }, { "epoch": 2.0820476858345023, "grad_norm": 0.24233561754226685, "learning_rate": 3.1864826724126317e-05, "loss": 0.2467, "step": 2969 }, { "epoch": 2.082748948106592, "grad_norm": 0.24962717294692993, "learning_rate": 3.185414418151346e-05, "loss": 0.2396, "step": 2970 }, { "epoch": 2.0834502103786816, "grad_norm": 0.27862560749053955, "learning_rate": 3.1843460285538084e-05, "loss": 0.0987, "step": 2971 }, { "epoch": 2.0841514726507713, "grad_norm": 0.2778940796852112, "learning_rate": 3.1832775038309745e-05, "loss": 0.0921, "step": 2972 }, { "epoch": 2.084852734922861, "grad_norm": 0.2786560654640198, "learning_rate": 3.182208844193824e-05, "loss": 0.091, "step": 2973 }, { "epoch": 2.085553997194951, "grad_norm": 0.2737058997154236, "learning_rate": 3.181140049853368e-05, "loss": 0.0978, "step": 2974 }, { "epoch": 2.0862552594670407, "grad_norm": 0.7326319813728333, "learning_rate": 3.1800711210206403e-05, "loss": 0.3999, "step": 2975 }, { "epoch": 2.0869565217391304, "grad_norm": 0.2930315136909485, "learning_rate": 3.179002057906704e-05, "loss": 0.0754, "step": 2976 }, { "epoch": 2.08765778401122, "grad_norm": 0.25222429633140564, "learning_rate": 3.177932860722647e-05, "loss": 0.2463, "step": 2977 }, { "epoch": 2.08835904628331, "grad_norm": 0.2706031799316406, "learning_rate": 3.176863529679583e-05, "loss": 0.097, "step": 2978 }, { "epoch": 2.0890603085554, "grad_norm": 0.2697451710700989, "learning_rate": 3.175794064988654e-05, "loss": 0.0966, "step": 2979 }, { "epoch": 2.0897615708274895, "grad_norm": 0.2707446813583374, "learning_rate": 3.174724466861028e-05, "loss": 0.096, "step": 2980 }, { "epoch": 2.090462833099579, "grad_norm": 0.274196594953537, "learning_rate": 3.173654735507899e-05, "loss": 0.2436, "step": 2981 }, { "epoch": 2.091164095371669, "grad_norm": 0.284830242395401, "learning_rate": 3.1725848711404865e-05, "loss": 0.2275, "step": 2982 }, { "epoch": 2.091865357643759, "grad_norm": 0.26700708270072937, "learning_rate": 3.171514873970038e-05, "loss": 0.0953, "step": 2983 }, { "epoch": 2.0925666199158486, "grad_norm": 0.2706657350063324, "learning_rate": 3.170444744207826e-05, "loss": 0.0959, "step": 2984 }, { "epoch": 2.0932678821879382, "grad_norm": 0.2654314637184143, "learning_rate": 3.1693744820651494e-05, "loss": 0.0949, "step": 2985 }, { "epoch": 2.093969144460028, "grad_norm": 0.2583238184452057, "learning_rate": 3.168304087753333e-05, "loss": 0.0872, "step": 2986 }, { "epoch": 2.094670406732118, "grad_norm": 0.2660521864891052, "learning_rate": 3.1672335614837277e-05, "loss": 0.0945, "step": 2987 }, { "epoch": 2.0953716690042077, "grad_norm": 0.2565268576145172, "learning_rate": 3.1661629034677124e-05, "loss": 0.0862, "step": 2988 }, { "epoch": 2.0960729312762973, "grad_norm": 0.25782519578933716, "learning_rate": 3.165092113916688e-05, "loss": 0.0778, "step": 2989 }, { "epoch": 2.096774193548387, "grad_norm": 0.2640823721885681, "learning_rate": 3.164021193042085e-05, "loss": 0.2489, "step": 2990 }, { "epoch": 2.0974754558204767, "grad_norm": 0.2753506302833557, "learning_rate": 3.162950141055359e-05, "loss": 0.2502, "step": 2991 }, { "epoch": 2.098176718092567, "grad_norm": 0.26302576065063477, "learning_rate": 3.1618789581679906e-05, "loss": 0.0927, "step": 2992 }, { "epoch": 2.0988779803646564, "grad_norm": 0.2520988881587982, "learning_rate": 3.160807644591487e-05, "loss": 0.0851, "step": 2993 }, { "epoch": 2.099579242636746, "grad_norm": 4.721608638763428, "learning_rate": 3.15973620053738e-05, "loss": 0.7487, "step": 2994 }, { "epoch": 2.1002805049088358, "grad_norm": 0.27202606201171875, "learning_rate": 3.158664626217229e-05, "loss": 0.2498, "step": 2995 }, { "epoch": 2.100981767180926, "grad_norm": 0.7720306515693665, "learning_rate": 3.157592921842618e-05, "loss": 0.4104, "step": 2996 }, { "epoch": 2.1016830294530155, "grad_norm": 4.099165916442871, "learning_rate": 3.156521087625156e-05, "loss": 0.5392, "step": 2997 }, { "epoch": 2.102384291725105, "grad_norm": 0.2614559829235077, "learning_rate": 3.155449123776479e-05, "loss": 0.0929, "step": 2998 }, { "epoch": 2.103085553997195, "grad_norm": 0.3066842555999756, "learning_rate": 3.154377030508249e-05, "loss": 0.2343, "step": 2999 }, { "epoch": 2.1037868162692845, "grad_norm": 3.541898250579834, "learning_rate": 3.153304808032152e-05, "loss": 0.4824, "step": 3000 }, { "epoch": 2.1044880785413747, "grad_norm": 0.26254481077194214, "learning_rate": 3.1522324565598994e-05, "loss": 0.0942, "step": 3001 }, { "epoch": 2.1051893408134643, "grad_norm": 0.2562066912651062, "learning_rate": 3.15115997630323e-05, "loss": 0.0873, "step": 3002 }, { "epoch": 2.105890603085554, "grad_norm": 0.27501776814460754, "learning_rate": 3.150087367473907e-05, "loss": 0.2431, "step": 3003 }, { "epoch": 2.1065918653576436, "grad_norm": 0.26855477690696716, "learning_rate": 3.149014630283717e-05, "loss": 0.2477, "step": 3004 }, { "epoch": 2.1072931276297333, "grad_norm": 1.754583716392517, "learning_rate": 3.147941764944476e-05, "loss": 0.7181, "step": 3005 }, { "epoch": 2.1079943899018234, "grad_norm": 0.2640641927719116, "learning_rate": 3.146868771668022e-05, "loss": 0.0952, "step": 3006 }, { "epoch": 2.108695652173913, "grad_norm": 0.2556000351905823, "learning_rate": 3.145795650666219e-05, "loss": 0.2503, "step": 3007 }, { "epoch": 2.1093969144460027, "grad_norm": 0.2546212673187256, "learning_rate": 3.144722402150958e-05, "loss": 0.2495, "step": 3008 }, { "epoch": 2.1100981767180924, "grad_norm": 0.2787416875362396, "learning_rate": 3.143649026334152e-05, "loss": 0.0927, "step": 3009 }, { "epoch": 2.1107994389901825, "grad_norm": 0.2655811905860901, "learning_rate": 3.1425755234277424e-05, "loss": 0.0963, "step": 3010 }, { "epoch": 2.111500701262272, "grad_norm": 0.7293593883514404, "learning_rate": 3.1415018936436935e-05, "loss": 0.398, "step": 3011 }, { "epoch": 2.112201963534362, "grad_norm": 0.24874433875083923, "learning_rate": 3.1404281371939955e-05, "loss": 0.2455, "step": 3012 }, { "epoch": 2.1129032258064515, "grad_norm": 0.2735118567943573, "learning_rate": 3.1393542542906627e-05, "loss": 0.0929, "step": 3013 }, { "epoch": 2.113604488078541, "grad_norm": 0.26950979232788086, "learning_rate": 3.1382802451457366e-05, "loss": 0.0977, "step": 3014 }, { "epoch": 2.1143057503506313, "grad_norm": 0.25420695543289185, "learning_rate": 3.1372061099712804e-05, "loss": 0.2446, "step": 3015 }, { "epoch": 2.115007012622721, "grad_norm": 0.2700229287147522, "learning_rate": 3.136131848979386e-05, "loss": 0.0979, "step": 3016 }, { "epoch": 2.1157082748948106, "grad_norm": 0.27567484974861145, "learning_rate": 3.135057462382166e-05, "loss": 0.0937, "step": 3017 }, { "epoch": 2.1164095371669003, "grad_norm": 0.26962733268737793, "learning_rate": 3.1339829503917614e-05, "loss": 0.0976, "step": 3018 }, { "epoch": 2.1171107994389904, "grad_norm": 2.616682529449463, "learning_rate": 3.132908313220335e-05, "loss": 0.3666, "step": 3019 }, { "epoch": 2.11781206171108, "grad_norm": 0.268144816160202, "learning_rate": 3.131833551080077e-05, "loss": 0.0977, "step": 3020 }, { "epoch": 2.1185133239831697, "grad_norm": 0.3004768490791321, "learning_rate": 3.130758664183201e-05, "loss": 0.0933, "step": 3021 }, { "epoch": 2.1192145862552594, "grad_norm": 0.2668248116970062, "learning_rate": 3.129683652741945e-05, "loss": 0.0974, "step": 3022 }, { "epoch": 2.119915848527349, "grad_norm": 0.2694752514362335, "learning_rate": 3.128608516968571e-05, "loss": 0.0977, "step": 3023 }, { "epoch": 2.120617110799439, "grad_norm": 0.25574809312820435, "learning_rate": 3.127533257075368e-05, "loss": 0.2472, "step": 3024 }, { "epoch": 2.121318373071529, "grad_norm": 0.2587175965309143, "learning_rate": 3.126457873274646e-05, "loss": 0.2433, "step": 3025 }, { "epoch": 2.1220196353436185, "grad_norm": 0.2770734429359436, "learning_rate": 3.125382365778743e-05, "loss": 0.0943, "step": 3026 }, { "epoch": 2.122720897615708, "grad_norm": 0.26448675990104675, "learning_rate": 3.1243067348000195e-05, "loss": 0.0967, "step": 3027 }, { "epoch": 2.123422159887798, "grad_norm": 0.2664903998374939, "learning_rate": 3.12323098055086e-05, "loss": 0.0974, "step": 3028 }, { "epoch": 2.124123422159888, "grad_norm": 0.2539169490337372, "learning_rate": 3.1221551032436746e-05, "loss": 0.2461, "step": 3029 }, { "epoch": 2.1248246844319776, "grad_norm": 0.24360914528369904, "learning_rate": 3.121079103090896e-05, "loss": 0.2463, "step": 3030 }, { "epoch": 2.1255259467040672, "grad_norm": 0.2635408639907837, "learning_rate": 3.120002980304985e-05, "loss": 0.0967, "step": 3031 }, { "epoch": 2.126227208976157, "grad_norm": 0.30294516682624817, "learning_rate": 3.1189267350984215e-05, "loss": 0.0915, "step": 3032 }, { "epoch": 2.126928471248247, "grad_norm": 0.27210527658462524, "learning_rate": 3.1178503676837114e-05, "loss": 0.0935, "step": 3033 }, { "epoch": 2.1276297335203367, "grad_norm": 0.2759735584259033, "learning_rate": 3.116773878273388e-05, "loss": 0.2399, "step": 3034 }, { "epoch": 2.1283309957924264, "grad_norm": 0.725764274597168, "learning_rate": 3.115697267080004e-05, "loss": 0.3958, "step": 3035 }, { "epoch": 2.129032258064516, "grad_norm": 0.245435893535614, "learning_rate": 3.114620534316138e-05, "loss": 0.2478, "step": 3036 }, { "epoch": 2.1297335203366057, "grad_norm": 0.2602481245994568, "learning_rate": 3.113543680194394e-05, "loss": 0.0958, "step": 3037 }, { "epoch": 2.130434782608696, "grad_norm": 0.2634546756744385, "learning_rate": 3.112466704927397e-05, "loss": 0.0964, "step": 3038 }, { "epoch": 2.1311360448807855, "grad_norm": 0.261849969625473, "learning_rate": 3.111389608727799e-05, "loss": 0.0961, "step": 3039 }, { "epoch": 2.131837307152875, "grad_norm": 0.26163268089294434, "learning_rate": 3.110312391808275e-05, "loss": 0.0959, "step": 3040 }, { "epoch": 2.132538569424965, "grad_norm": 0.2656483054161072, "learning_rate": 3.109235054381523e-05, "loss": 0.0908, "step": 3041 }, { "epoch": 2.133239831697055, "grad_norm": 0.26208001375198364, "learning_rate": 3.108157596660263e-05, "loss": 0.0954, "step": 3042 }, { "epoch": 2.1339410939691446, "grad_norm": 0.2798839509487152, "learning_rate": 3.1070800188572425e-05, "loss": 0.0865, "step": 3043 }, { "epoch": 2.134642356241234, "grad_norm": 0.26097390055656433, "learning_rate": 3.1060023211852305e-05, "loss": 0.0954, "step": 3044 }, { "epoch": 2.135343618513324, "grad_norm": 0.2643025815486908, "learning_rate": 3.1049245038570225e-05, "loss": 0.0912, "step": 3045 }, { "epoch": 2.1360448807854135, "grad_norm": 0.25441116094589233, "learning_rate": 3.1038465670854325e-05, "loss": 0.2528, "step": 3046 }, { "epoch": 2.1367461430575037, "grad_norm": 3.4127559661865234, "learning_rate": 3.102768511083303e-05, "loss": 0.5526, "step": 3047 }, { "epoch": 2.1374474053295933, "grad_norm": 0.2673371136188507, "learning_rate": 3.1016903360634956e-05, "loss": 0.2428, "step": 3048 }, { "epoch": 2.138148667601683, "grad_norm": 0.2584698796272278, "learning_rate": 3.100612042238901e-05, "loss": 0.0891, "step": 3049 }, { "epoch": 2.1388499298737726, "grad_norm": 0.2603025436401367, "learning_rate": 3.099533629822428e-05, "loss": 0.0897, "step": 3050 }, { "epoch": 2.1395511921458628, "grad_norm": 0.25841057300567627, "learning_rate": 3.0984550990270106e-05, "loss": 0.0947, "step": 3051 }, { "epoch": 2.1402524544179524, "grad_norm": 0.25327953696250916, "learning_rate": 3.097376450065609e-05, "loss": 0.2477, "step": 3052 }, { "epoch": 2.140953716690042, "grad_norm": 0.25812965631484985, "learning_rate": 3.096297683151201e-05, "loss": 0.0943, "step": 3053 }, { "epoch": 2.1416549789621318, "grad_norm": 0.2552507519721985, "learning_rate": 3.0952187984967935e-05, "loss": 0.0943, "step": 3054 }, { "epoch": 2.1423562412342214, "grad_norm": 0.2729126214981079, "learning_rate": 3.094139796315413e-05, "loss": 0.086, "step": 3055 }, { "epoch": 2.1430575035063115, "grad_norm": 0.2537807524204254, "learning_rate": 3.093060676820112e-05, "loss": 0.0934, "step": 3056 }, { "epoch": 2.143758765778401, "grad_norm": 0.2895185053348541, "learning_rate": 3.0919814402239616e-05, "loss": 0.0865, "step": 3057 }, { "epoch": 2.144460028050491, "grad_norm": 0.27170565724372864, "learning_rate": 3.0909020867400606e-05, "loss": 0.2538, "step": 3058 }, { "epoch": 2.1451612903225805, "grad_norm": 0.2553342282772064, "learning_rate": 3.0898226165815305e-05, "loss": 0.0939, "step": 3059 }, { "epoch": 2.1458625525946706, "grad_norm": 0.25467509031295776, "learning_rate": 3.088743029961512e-05, "loss": 0.0938, "step": 3060 }, { "epoch": 2.1465638148667603, "grad_norm": 0.26850539445877075, "learning_rate": 3.087663327093172e-05, "loss": 0.2472, "step": 3061 }, { "epoch": 2.14726507713885, "grad_norm": 0.27474281191825867, "learning_rate": 3.086583508189701e-05, "loss": 0.0856, "step": 3062 }, { "epoch": 2.1479663394109396, "grad_norm": 0.2635685205459595, "learning_rate": 3.08550357346431e-05, "loss": 0.2429, "step": 3063 }, { "epoch": 2.1486676016830293, "grad_norm": 0.25750866532325745, "learning_rate": 3.084423523130233e-05, "loss": 0.0894, "step": 3064 }, { "epoch": 2.1493688639551194, "grad_norm": 0.26660671830177307, "learning_rate": 3.08334335740073e-05, "loss": 0.2499, "step": 3065 }, { "epoch": 2.150070126227209, "grad_norm": 0.27600687742233276, "learning_rate": 3.082263076489081e-05, "loss": 0.2464, "step": 3066 }, { "epoch": 2.1507713884992987, "grad_norm": 2.0934150218963623, "learning_rate": 3.0811826806085884e-05, "loss": 0.3592, "step": 3067 }, { "epoch": 2.1514726507713884, "grad_norm": 0.24991218745708466, "learning_rate": 3.080102169972578e-05, "loss": 0.0924, "step": 3068 }, { "epoch": 2.1521739130434785, "grad_norm": 0.2526400685310364, "learning_rate": 3.0790215447944006e-05, "loss": 0.0928, "step": 3069 }, { "epoch": 2.152875175315568, "grad_norm": 0.3136932849884033, "learning_rate": 3.077940805287425e-05, "loss": 0.2394, "step": 3070 }, { "epoch": 2.153576437587658, "grad_norm": 0.2636643648147583, "learning_rate": 3.0768599516650456e-05, "loss": 0.2505, "step": 3071 }, { "epoch": 2.1542776998597475, "grad_norm": 0.2624208927154541, "learning_rate": 3.07577898414068e-05, "loss": 0.2512, "step": 3072 }, { "epoch": 2.154978962131837, "grad_norm": 0.2568816542625427, "learning_rate": 3.0746979029277665e-05, "loss": 0.0881, "step": 3073 }, { "epoch": 2.1556802244039273, "grad_norm": 0.25370892882347107, "learning_rate": 3.0736167082397666e-05, "loss": 0.0883, "step": 3074 }, { "epoch": 2.156381486676017, "grad_norm": 2.188642740249634, "learning_rate": 3.0725354002901626e-05, "loss": 0.6363, "step": 3075 }, { "epoch": 2.1570827489481066, "grad_norm": 0.2722480595111847, "learning_rate": 3.071453979292464e-05, "loss": 0.0844, "step": 3076 }, { "epoch": 2.1577840112201963, "grad_norm": 0.27097001671791077, "learning_rate": 3.0703724454601954e-05, "loss": 0.0853, "step": 3077 }, { "epoch": 2.158485273492286, "grad_norm": 0.25978100299835205, "learning_rate": 3.0692907990069095e-05, "loss": 0.0902, "step": 3078 }, { "epoch": 2.159186535764376, "grad_norm": 0.27643123269081116, "learning_rate": 3.0682090401461784e-05, "loss": 0.0858, "step": 3079 }, { "epoch": 2.1598877980364657, "grad_norm": 0.2615397572517395, "learning_rate": 3.0671271690915986e-05, "loss": 0.0891, "step": 3080 }, { "epoch": 2.1605890603085554, "grad_norm": 0.2678331136703491, "learning_rate": 3.066045186056786e-05, "loss": 0.2532, "step": 3081 }, { "epoch": 2.161290322580645, "grad_norm": 0.2544437646865845, "learning_rate": 3.0649630912553806e-05, "loss": 0.0886, "step": 3082 }, { "epoch": 2.161991584852735, "grad_norm": 0.2705734968185425, "learning_rate": 3.0638808849010434e-05, "loss": 0.2523, "step": 3083 }, { "epoch": 2.162692847124825, "grad_norm": 0.25732025504112244, "learning_rate": 3.06279856720746e-05, "loss": 0.0891, "step": 3084 }, { "epoch": 2.1633941093969145, "grad_norm": 0.24967151880264282, "learning_rate": 3.0617161383883316e-05, "loss": 0.0923, "step": 3085 }, { "epoch": 2.164095371669004, "grad_norm": 0.2837725877761841, "learning_rate": 3.0606335986573895e-05, "loss": 0.2495, "step": 3086 }, { "epoch": 2.164796633941094, "grad_norm": 0.25592005252838135, "learning_rate": 3.059550948228382e-05, "loss": 0.0882, "step": 3087 }, { "epoch": 2.165497896213184, "grad_norm": 0.25886836647987366, "learning_rate": 3.05846818731508e-05, "loss": 0.0885, "step": 3088 }, { "epoch": 2.1661991584852736, "grad_norm": 0.2653798758983612, "learning_rate": 3.057385316131276e-05, "loss": 0.084, "step": 3089 }, { "epoch": 2.166900420757363, "grad_norm": 0.272903174161911, "learning_rate": 3.056302334890786e-05, "loss": 0.2482, "step": 3090 }, { "epoch": 2.167601683029453, "grad_norm": 0.24879863858222961, "learning_rate": 3.0552192438074456e-05, "loss": 0.0918, "step": 3091 }, { "epoch": 2.1683029453015426, "grad_norm": 0.785268247127533, "learning_rate": 3.0541360430951135e-05, "loss": 0.3971, "step": 3092 }, { "epoch": 2.1690042075736327, "grad_norm": 0.25076743960380554, "learning_rate": 3.0530527329676684e-05, "loss": 0.0873, "step": 3093 }, { "epoch": 2.1697054698457223, "grad_norm": 0.24633722007274628, "learning_rate": 3.051969313639013e-05, "loss": 0.091, "step": 3094 }, { "epoch": 2.170406732117812, "grad_norm": 0.24740898609161377, "learning_rate": 3.05088578532307e-05, "loss": 0.0912, "step": 3095 }, { "epoch": 2.1711079943899017, "grad_norm": 0.2509586811065674, "learning_rate": 3.0498021482337836e-05, "loss": 0.0871, "step": 3096 }, { "epoch": 2.1718092566619918, "grad_norm": 0.2452961653470993, "learning_rate": 3.0487184025851206e-05, "loss": 0.0902, "step": 3097 }, { "epoch": 2.1725105189340814, "grad_norm": 0.27775683999061584, "learning_rate": 3.0476345485910674e-05, "loss": 0.253, "step": 3098 }, { "epoch": 2.173211781206171, "grad_norm": 0.24953772127628326, "learning_rate": 3.046550586465633e-05, "loss": 0.0861, "step": 3099 }, { "epoch": 2.1739130434782608, "grad_norm": 0.2777895927429199, "learning_rate": 3.045466516422848e-05, "loss": 0.255, "step": 3100 }, { "epoch": 2.1746143057503504, "grad_norm": 0.24647995829582214, "learning_rate": 3.0443823386767645e-05, "loss": 0.091, "step": 3101 }, { "epoch": 2.1753155680224405, "grad_norm": 1.2710784673690796, "learning_rate": 3.043298053441454e-05, "loss": 0.5817, "step": 3102 }, { "epoch": 2.17601683029453, "grad_norm": 0.2459142655134201, "learning_rate": 3.0422136609310108e-05, "loss": 0.085, "step": 3103 }, { "epoch": 2.17671809256662, "grad_norm": 0.2754307687282562, "learning_rate": 3.0411291613595505e-05, "loss": 0.0774, "step": 3104 }, { "epoch": 2.1774193548387095, "grad_norm": 1.9361356496810913, "learning_rate": 3.0400445549412093e-05, "loss": 0.3532, "step": 3105 }, { "epoch": 2.1781206171107996, "grad_norm": 0.2688940167427063, "learning_rate": 3.0389598418901437e-05, "loss": 0.2523, "step": 3106 }, { "epoch": 2.1788218793828893, "grad_norm": 0.2595377266407013, "learning_rate": 3.0378750224205333e-05, "loss": 0.0815, "step": 3107 }, { "epoch": 2.179523141654979, "grad_norm": 0.2856581509113312, "learning_rate": 3.036790096746578e-05, "loss": 0.2487, "step": 3108 }, { "epoch": 2.1802244039270686, "grad_norm": 0.286117285490036, "learning_rate": 3.035705065082496e-05, "loss": 0.2525, "step": 3109 }, { "epoch": 2.1809256661991583, "grad_norm": 0.2460705190896988, "learning_rate": 3.0346199276425308e-05, "loss": 0.0906, "step": 3110 }, { "epoch": 2.1816269284712484, "grad_norm": 0.2705188989639282, "learning_rate": 3.033534684640944e-05, "loss": 0.2537, "step": 3111 }, { "epoch": 2.182328190743338, "grad_norm": 0.2775940001010895, "learning_rate": 3.0324493362920182e-05, "loss": 0.0779, "step": 3112 }, { "epoch": 2.1830294530154277, "grad_norm": 0.2600190341472626, "learning_rate": 3.0313638828100576e-05, "loss": 0.081, "step": 3113 }, { "epoch": 2.1837307152875174, "grad_norm": 0.2484193742275238, "learning_rate": 3.0302783244093873e-05, "loss": 0.0862, "step": 3114 }, { "epoch": 2.1844319775596075, "grad_norm": 0.7692738771438599, "learning_rate": 3.0291926613043526e-05, "loss": 0.4076, "step": 3115 }, { "epoch": 2.185133239831697, "grad_norm": 0.2995883524417877, "learning_rate": 3.0281068937093186e-05, "loss": 0.2428, "step": 3116 }, { "epoch": 2.185834502103787, "grad_norm": 0.2676830589771271, "learning_rate": 3.027021021838673e-05, "loss": 0.2509, "step": 3117 }, { "epoch": 2.1865357643758765, "grad_norm": 0.24765528738498688, "learning_rate": 3.0259350459068232e-05, "loss": 0.091, "step": 3118 }, { "epoch": 2.187237026647966, "grad_norm": 0.24656471610069275, "learning_rate": 3.024848966128196e-05, "loss": 0.0907, "step": 3119 }, { "epoch": 2.1879382889200563, "grad_norm": 0.2760482132434845, "learning_rate": 3.02376278271724e-05, "loss": 0.0775, "step": 3120 }, { "epoch": 2.188639551192146, "grad_norm": 0.2561071217060089, "learning_rate": 3.022676495888424e-05, "loss": 0.0805, "step": 3121 }, { "epoch": 2.1893408134642356, "grad_norm": 0.2516658306121826, "learning_rate": 3.0215901058562383e-05, "loss": 0.0794, "step": 3122 }, { "epoch": 2.1900420757363253, "grad_norm": 0.25816720724105835, "learning_rate": 3.020503612835191e-05, "loss": 0.0812, "step": 3123 }, { "epoch": 2.1907433380084154, "grad_norm": 0.24552004039287567, "learning_rate": 3.0194170170398124e-05, "loss": 0.085, "step": 3124 }, { "epoch": 2.191444600280505, "grad_norm": 0.24449364840984344, "learning_rate": 3.0183303186846535e-05, "loss": 0.0841, "step": 3125 }, { "epoch": 2.1921458625525947, "grad_norm": 0.7815125584602356, "learning_rate": 3.017243517984284e-05, "loss": 0.4114, "step": 3126 }, { "epoch": 2.1928471248246844, "grad_norm": 0.24331749975681305, "learning_rate": 3.0161566151532937e-05, "loss": 0.0842, "step": 3127 }, { "epoch": 2.193548387096774, "grad_norm": 0.3091405928134918, "learning_rate": 3.015069610406296e-05, "loss": 0.2439, "step": 3128 }, { "epoch": 2.194249649368864, "grad_norm": 0.24624907970428467, "learning_rate": 3.0139825039579194e-05, "loss": 0.0901, "step": 3129 }, { "epoch": 2.194950911640954, "grad_norm": 0.24044494330883026, "learning_rate": 3.0128952960228158e-05, "loss": 0.0835, "step": 3130 }, { "epoch": 2.1956521739130435, "grad_norm": 0.2458820939064026, "learning_rate": 3.0118079868156558e-05, "loss": 0.0891, "step": 3131 }, { "epoch": 2.196353436185133, "grad_norm": 2.9821033477783203, "learning_rate": 3.0107205765511314e-05, "loss": 0.4602, "step": 3132 }, { "epoch": 2.1970546984572232, "grad_norm": 0.2835153043270111, "learning_rate": 3.0096330654439536e-05, "loss": 0.2526, "step": 3133 }, { "epoch": 2.197755960729313, "grad_norm": 0.23320205509662628, "learning_rate": 3.0085454537088524e-05, "loss": 0.0814, "step": 3134 }, { "epoch": 2.1984572230014026, "grad_norm": 0.2434430867433548, "learning_rate": 3.0074577415605798e-05, "loss": 0.0888, "step": 3135 }, { "epoch": 2.1991584852734922, "grad_norm": 0.29278597235679626, "learning_rate": 3.0063699292139047e-05, "loss": 0.248, "step": 3136 }, { "epoch": 2.199859747545582, "grad_norm": 0.24495556950569153, "learning_rate": 3.0052820168836193e-05, "loss": 0.077, "step": 3137 }, { "epoch": 2.200561009817672, "grad_norm": 0.24695591628551483, "learning_rate": 3.004194004784533e-05, "loss": 0.09, "step": 3138 }, { "epoch": 2.2012622720897617, "grad_norm": 0.2832016050815582, "learning_rate": 3.0031058931314755e-05, "loss": 0.2535, "step": 3139 }, { "epoch": 2.2019635343618513, "grad_norm": 0.24449680745601654, "learning_rate": 3.0020176821392964e-05, "loss": 0.0893, "step": 3140 }, { "epoch": 2.202664796633941, "grad_norm": 0.2843764126300812, "learning_rate": 3.0009293720228647e-05, "loss": 0.2531, "step": 3141 }, { "epoch": 2.2033660589060307, "grad_norm": 0.24176546931266785, "learning_rate": 2.9998409629970704e-05, "loss": 0.0835, "step": 3142 }, { "epoch": 2.2040673211781208, "grad_norm": 0.24176400899887085, "learning_rate": 2.9987524552768192e-05, "loss": 0.0835, "step": 3143 }, { "epoch": 2.2047685834502104, "grad_norm": 0.2392396777868271, "learning_rate": 2.997663849077041e-05, "loss": 0.083, "step": 3144 }, { "epoch": 2.2054698457223, "grad_norm": 0.29266607761383057, "learning_rate": 2.9965751446126815e-05, "loss": 0.2469, "step": 3145 }, { "epoch": 2.2061711079943898, "grad_norm": 0.2824028432369232, "learning_rate": 2.995486342098709e-05, "loss": 0.2551, "step": 3146 }, { "epoch": 2.20687237026648, "grad_norm": 0.3099268078804016, "learning_rate": 2.994397441750108e-05, "loss": 0.2503, "step": 3147 }, { "epoch": 2.2075736325385695, "grad_norm": 0.24286548793315887, "learning_rate": 2.9933084437818838e-05, "loss": 0.0881, "step": 3148 }, { "epoch": 2.208274894810659, "grad_norm": 0.2388656735420227, "learning_rate": 2.9922193484090622e-05, "loss": 0.0818, "step": 3149 }, { "epoch": 2.208976157082749, "grad_norm": 0.33256596326828003, "learning_rate": 2.9911301558466853e-05, "loss": 0.2519, "step": 3150 }, { "epoch": 2.2096774193548385, "grad_norm": 0.24242718517780304, "learning_rate": 2.990040866309817e-05, "loss": 0.0876, "step": 3151 }, { "epoch": 2.2103786816269286, "grad_norm": 0.24276739358901978, "learning_rate": 2.9889514800135383e-05, "loss": 0.0885, "step": 3152 }, { "epoch": 2.2110799438990183, "grad_norm": 0.2894802689552307, "learning_rate": 2.987861997172952e-05, "loss": 0.2551, "step": 3153 }, { "epoch": 2.211781206171108, "grad_norm": 0.24174396693706512, "learning_rate": 2.9867724180031775e-05, "loss": 0.0829, "step": 3154 }, { "epoch": 2.2124824684431976, "grad_norm": 0.7873226404190063, "learning_rate": 2.985682742719354e-05, "loss": 0.4219, "step": 3155 }, { "epoch": 2.2131837307152873, "grad_norm": 0.2882847189903259, "learning_rate": 2.9845929715366406e-05, "loss": 0.2548, "step": 3156 }, { "epoch": 2.2138849929873774, "grad_norm": 3.3852312564849854, "learning_rate": 2.9835031046702127e-05, "loss": 0.7934, "step": 3157 }, { "epoch": 2.214586255259467, "grad_norm": 0.23935598134994507, "learning_rate": 2.982413142335268e-05, "loss": 0.0823, "step": 3158 }, { "epoch": 2.2152875175315567, "grad_norm": 0.8010610342025757, "learning_rate": 2.981323084747021e-05, "loss": 0.4178, "step": 3159 }, { "epoch": 2.2159887798036464, "grad_norm": 0.24469855427742004, "learning_rate": 2.9802329321207057e-05, "loss": 0.0893, "step": 3160 }, { "epoch": 2.2166900420757365, "grad_norm": 0.24097175896167755, "learning_rate": 2.9791426846715735e-05, "loss": 0.0831, "step": 3161 }, { "epoch": 2.217391304347826, "grad_norm": 0.24577344954013824, "learning_rate": 2.9780523426148964e-05, "loss": 0.0898, "step": 3162 }, { "epoch": 2.218092566619916, "grad_norm": 0.26519542932510376, "learning_rate": 2.9769619061659647e-05, "loss": 0.2495, "step": 3163 }, { "epoch": 2.2187938288920055, "grad_norm": 0.2772503197193146, "learning_rate": 2.9758713755400868e-05, "loss": 0.2531, "step": 3164 }, { "epoch": 2.219495091164095, "grad_norm": 0.28984639048576355, "learning_rate": 2.9747807509525892e-05, "loss": 0.2489, "step": 3165 }, { "epoch": 2.2201963534361853, "grad_norm": 0.303091436624527, "learning_rate": 2.9736900326188176e-05, "loss": 0.2403, "step": 3166 }, { "epoch": 2.220897615708275, "grad_norm": 0.2486625611782074, "learning_rate": 2.9725992207541376e-05, "loss": 0.091, "step": 3167 }, { "epoch": 2.2215988779803646, "grad_norm": 0.25023579597473145, "learning_rate": 2.97150831557393e-05, "loss": 0.0859, "step": 3168 }, { "epoch": 2.2223001402524543, "grad_norm": 0.24378807842731476, "learning_rate": 2.9704173172935968e-05, "loss": 0.0846, "step": 3169 }, { "epoch": 2.2230014025245444, "grad_norm": 0.24665088951587677, "learning_rate": 2.969326226128558e-05, "loss": 0.0902, "step": 3170 }, { "epoch": 2.223702664796634, "grad_norm": 0.27337661385536194, "learning_rate": 2.9682350422942506e-05, "loss": 0.2522, "step": 3171 }, { "epoch": 2.2244039270687237, "grad_norm": 0.2483070343732834, "learning_rate": 2.9671437660061307e-05, "loss": 0.0909, "step": 3172 }, { "epoch": 2.2251051893408134, "grad_norm": 0.2500435709953308, "learning_rate": 2.9660523974796732e-05, "loss": 0.0917, "step": 3173 }, { "epoch": 2.225806451612903, "grad_norm": 0.7570028901100159, "learning_rate": 2.964960936930371e-05, "loss": 0.4113, "step": 3174 }, { "epoch": 2.226507713884993, "grad_norm": 0.24748851358890533, "learning_rate": 2.9638693845737338e-05, "loss": 0.0851, "step": 3175 }, { "epoch": 2.227208976157083, "grad_norm": 0.2510319948196411, "learning_rate": 2.9627777406252904e-05, "loss": 0.0862, "step": 3176 }, { "epoch": 2.2279102384291725, "grad_norm": 0.25917911529541016, "learning_rate": 2.9616860053005897e-05, "loss": 0.0799, "step": 3177 }, { "epoch": 2.228611500701262, "grad_norm": 0.24689973890781403, "learning_rate": 2.9605941788151943e-05, "loss": 0.0905, "step": 3178 }, { "epoch": 2.2293127629733522, "grad_norm": 0.24818791449069977, "learning_rate": 2.9595022613846885e-05, "loss": 0.0908, "step": 3179 }, { "epoch": 2.230014025245442, "grad_norm": 0.27647146582603455, "learning_rate": 2.9584102532246728e-05, "loss": 0.2529, "step": 3180 }, { "epoch": 2.2307152875175316, "grad_norm": 0.2768092453479767, "learning_rate": 2.957318154550767e-05, "loss": 0.2543, "step": 3181 }, { "epoch": 2.2314165497896212, "grad_norm": 0.311012864112854, "learning_rate": 2.956225965578607e-05, "loss": 0.2392, "step": 3182 }, { "epoch": 2.232117812061711, "grad_norm": 0.2436237931251526, "learning_rate": 2.955133686523847e-05, "loss": 0.0841, "step": 3183 }, { "epoch": 2.232819074333801, "grad_norm": 0.246089369058609, "learning_rate": 2.9540413176021603e-05, "loss": 0.0897, "step": 3184 }, { "epoch": 2.2335203366058907, "grad_norm": 0.24425390362739563, "learning_rate": 2.9529488590292365e-05, "loss": 0.0836, "step": 3185 }, { "epoch": 2.2342215988779803, "grad_norm": 0.2515621781349182, "learning_rate": 2.951856311020784e-05, "loss": 0.0857, "step": 3186 }, { "epoch": 2.23492286115007, "grad_norm": 0.23885856568813324, "learning_rate": 2.9507636737925282e-05, "loss": 0.0825, "step": 3187 }, { "epoch": 2.23562412342216, "grad_norm": 0.242494598031044, "learning_rate": 2.9496709475602115e-05, "loss": 0.0839, "step": 3188 }, { "epoch": 2.2363253856942498, "grad_norm": 0.27752619981765747, "learning_rate": 2.9485781325395946e-05, "loss": 0.2532, "step": 3189 }, { "epoch": 2.2370266479663394, "grad_norm": 0.2469165027141571, "learning_rate": 2.947485228946456e-05, "loss": 0.077, "step": 3190 }, { "epoch": 2.237727910238429, "grad_norm": 0.27810394763946533, "learning_rate": 2.9463922369965917e-05, "loss": 0.2532, "step": 3191 }, { "epoch": 2.2384291725105188, "grad_norm": 0.24987466633319855, "learning_rate": 2.9452991569058146e-05, "loss": 0.0775, "step": 3192 }, { "epoch": 2.239130434782609, "grad_norm": 0.33499568700790405, "learning_rate": 2.9442059888899548e-05, "loss": 0.235, "step": 3193 }, { "epoch": 2.2398316970546985, "grad_norm": 0.2464486062526703, "learning_rate": 2.9431127331648616e-05, "loss": 0.0843, "step": 3194 }, { "epoch": 2.240532959326788, "grad_norm": 0.24488112330436707, "learning_rate": 2.9420193899463983e-05, "loss": 0.0886, "step": 3195 }, { "epoch": 2.241234221598878, "grad_norm": 0.24135719239711761, "learning_rate": 2.9409259594504485e-05, "loss": 0.0878, "step": 3196 }, { "epoch": 2.241935483870968, "grad_norm": 0.28371569514274597, "learning_rate": 2.9398324418929114e-05, "loss": 0.2527, "step": 3197 }, { "epoch": 2.2426367461430576, "grad_norm": 0.236890971660614, "learning_rate": 2.938738837489705e-05, "loss": 0.0806, "step": 3198 }, { "epoch": 2.2433380084151473, "grad_norm": 0.23936542868614197, "learning_rate": 2.9376451464567622e-05, "loss": 0.0818, "step": 3199 }, { "epoch": 2.244039270687237, "grad_norm": 0.31454306840896606, "learning_rate": 2.9365513690100345e-05, "loss": 0.2499, "step": 3200 }, { "epoch": 2.2447405329593266, "grad_norm": 0.24224290251731873, "learning_rate": 2.935457505365491e-05, "loss": 0.0822, "step": 3201 }, { "epoch": 2.2454417952314167, "grad_norm": 8.363534927368164, "learning_rate": 2.9343635557391153e-05, "loss": 1.1277, "step": 3202 }, { "epoch": 2.2461430575035064, "grad_norm": 0.23554395139217377, "learning_rate": 2.9332695203469106e-05, "loss": 0.0736, "step": 3203 }, { "epoch": 2.246844319775596, "grad_norm": 0.2432403415441513, "learning_rate": 2.9321753994048962e-05, "loss": 0.0875, "step": 3204 }, { "epoch": 2.2475455820476857, "grad_norm": 0.23337849974632263, "learning_rate": 2.9310811931291076e-05, "loss": 0.08, "step": 3205 }, { "epoch": 2.2482468443197754, "grad_norm": 0.2377144992351532, "learning_rate": 2.9299869017355985e-05, "loss": 0.0738, "step": 3206 }, { "epoch": 2.2489481065918655, "grad_norm": 0.23394615948200226, "learning_rate": 2.9288925254404376e-05, "loss": 0.0796, "step": 3207 }, { "epoch": 2.249649368863955, "grad_norm": 0.237512469291687, "learning_rate": 2.9277980644597126e-05, "loss": 0.0805, "step": 3208 }, { "epoch": 2.250350631136045, "grad_norm": 0.33879992365837097, "learning_rate": 2.9267035190095256e-05, "loss": 0.2643, "step": 3209 }, { "epoch": 2.2510518934081345, "grad_norm": 0.2262951284646988, "learning_rate": 2.925608889305997e-05, "loss": 0.077, "step": 3210 }, { "epoch": 2.251753155680224, "grad_norm": 0.22910989820957184, "learning_rate": 2.924514175565263e-05, "loss": 0.0714, "step": 3211 }, { "epoch": 2.2524544179523143, "grad_norm": 0.23053182661533356, "learning_rate": 2.923419378003477e-05, "loss": 0.0714, "step": 3212 }, { "epoch": 2.253155680224404, "grad_norm": 3.5504751205444336, "learning_rate": 2.9223244968368085e-05, "loss": 0.6774, "step": 3213 }, { "epoch": 2.2538569424964936, "grad_norm": 0.3210553824901581, "learning_rate": 2.9212295322814435e-05, "loss": 0.2519, "step": 3214 }, { "epoch": 2.2545582047685833, "grad_norm": 0.31174418330192566, "learning_rate": 2.9201344845535862e-05, "loss": 0.2605, "step": 3215 }, { "epoch": 2.2552594670406734, "grad_norm": 0.23331011831760406, "learning_rate": 2.9190393538694528e-05, "loss": 0.08, "step": 3216 }, { "epoch": 2.255960729312763, "grad_norm": 0.2311280369758606, "learning_rate": 2.9179441404452807e-05, "loss": 0.0789, "step": 3217 }, { "epoch": 2.2566619915848527, "grad_norm": 0.8290640711784363, "learning_rate": 2.916848844497321e-05, "loss": 0.4269, "step": 3218 }, { "epoch": 2.2573632538569424, "grad_norm": 0.2395746409893036, "learning_rate": 2.915753466241843e-05, "loss": 0.0864, "step": 3219 }, { "epoch": 2.258064516129032, "grad_norm": 0.2393052726984024, "learning_rate": 2.914658005895129e-05, "loss": 0.086, "step": 3220 }, { "epoch": 2.258765778401122, "grad_norm": 0.23015016317367554, "learning_rate": 2.9135624636734804e-05, "loss": 0.0785, "step": 3221 }, { "epoch": 2.259467040673212, "grad_norm": 2.85205340385437, "learning_rate": 2.9124668397932147e-05, "loss": 0.4535, "step": 3222 }, { "epoch": 2.2601683029453015, "grad_norm": 0.3011172413825989, "learning_rate": 2.911371134470664e-05, "loss": 0.2484, "step": 3223 }, { "epoch": 2.260869565217391, "grad_norm": 0.2377028465270996, "learning_rate": 2.9102753479221768e-05, "loss": 0.0815, "step": 3224 }, { "epoch": 2.2615708274894812, "grad_norm": 0.2369820773601532, "learning_rate": 2.9091794803641183e-05, "loss": 0.0812, "step": 3225 }, { "epoch": 2.262272089761571, "grad_norm": 0.24408359825611115, "learning_rate": 2.9080835320128703e-05, "loss": 0.0758, "step": 3226 }, { "epoch": 2.2629733520336606, "grad_norm": 0.23874174058437347, "learning_rate": 2.9069875030848282e-05, "loss": 0.0814, "step": 3227 }, { "epoch": 2.2636746143057502, "grad_norm": 0.23578445613384247, "learning_rate": 2.9058913937964055e-05, "loss": 0.081, "step": 3228 }, { "epoch": 2.26437587657784, "grad_norm": 0.31011393666267395, "learning_rate": 2.9047952043640316e-05, "loss": 0.2537, "step": 3229 }, { "epoch": 2.26507713884993, "grad_norm": 0.305155485868454, "learning_rate": 2.90369893500415e-05, "loss": 0.2497, "step": 3230 }, { "epoch": 2.2657784011220197, "grad_norm": 2.32224440574646, "learning_rate": 2.9026025859332213e-05, "loss": 0.3961, "step": 3231 }, { "epoch": 2.2664796633941093, "grad_norm": 0.2422204613685608, "learning_rate": 2.901506157367721e-05, "loss": 0.0829, "step": 3232 }, { "epoch": 2.267180925666199, "grad_norm": 0.23742663860321045, "learning_rate": 2.9004096495241416e-05, "loss": 0.0865, "step": 3233 }, { "epoch": 2.267882187938289, "grad_norm": 2.332080125808716, "learning_rate": 2.8993130626189903e-05, "loss": 0.3945, "step": 3234 }, { "epoch": 2.2685834502103788, "grad_norm": 0.7876961827278137, "learning_rate": 2.89821639686879e-05, "loss": 0.4225, "step": 3235 }, { "epoch": 2.2692847124824684, "grad_norm": 0.27103760838508606, "learning_rate": 2.897119652490079e-05, "loss": 0.0742, "step": 3236 }, { "epoch": 2.269985974754558, "grad_norm": 0.2931753098964691, "learning_rate": 2.8960228296994106e-05, "loss": 0.2574, "step": 3237 }, { "epoch": 2.2706872370266478, "grad_norm": 0.24213437736034393, "learning_rate": 2.8949259287133563e-05, "loss": 0.0834, "step": 3238 }, { "epoch": 2.271388499298738, "grad_norm": 0.2912946939468384, "learning_rate": 2.8938289497484995e-05, "loss": 0.2578, "step": 3239 }, { "epoch": 2.2720897615708275, "grad_norm": 0.24045662581920624, "learning_rate": 2.8927318930214414e-05, "loss": 0.0882, "step": 3240 }, { "epoch": 2.272791023842917, "grad_norm": 0.28547996282577515, "learning_rate": 2.8916347587487973e-05, "loss": 0.256, "step": 3241 }, { "epoch": 2.273492286115007, "grad_norm": 0.2406712919473648, "learning_rate": 2.8905375471471984e-05, "loss": 0.0879, "step": 3242 }, { "epoch": 2.274193548387097, "grad_norm": 0.2709948718547821, "learning_rate": 2.8894402584332914e-05, "loss": 0.083, "step": 3243 }, { "epoch": 2.2748948106591866, "grad_norm": 0.2913144528865814, "learning_rate": 2.8883428928237368e-05, "loss": 0.2516, "step": 3244 }, { "epoch": 2.2755960729312763, "grad_norm": 0.2615739703178406, "learning_rate": 2.887245450535212e-05, "loss": 0.0818, "step": 3245 }, { "epoch": 2.276297335203366, "grad_norm": 0.2613160312175751, "learning_rate": 2.8861479317844086e-05, "loss": 0.0825, "step": 3246 }, { "epoch": 2.2769985974754556, "grad_norm": 0.2406364530324936, "learning_rate": 2.8850503367880344e-05, "loss": 0.0883, "step": 3247 }, { "epoch": 2.2776998597475457, "grad_norm": 0.23832285404205322, "learning_rate": 2.88395266576281e-05, "loss": 0.0875, "step": 3248 }, { "epoch": 2.2784011220196354, "grad_norm": 0.2515736222267151, "learning_rate": 2.8828549189254734e-05, "loss": 0.0868, "step": 3249 }, { "epoch": 2.279102384291725, "grad_norm": 0.7791668176651001, "learning_rate": 2.881757096492777e-05, "loss": 0.4147, "step": 3250 }, { "epoch": 2.2798036465638147, "grad_norm": 0.2409602403640747, "learning_rate": 2.880659198681487e-05, "loss": 0.0887, "step": 3251 }, { "epoch": 2.280504908835905, "grad_norm": 1.67662513256073, "learning_rate": 2.8795612257083843e-05, "loss": 0.3285, "step": 3252 }, { "epoch": 2.2812061711079945, "grad_norm": 0.2887144088745117, "learning_rate": 2.8784631777902675e-05, "loss": 0.2496, "step": 3253 }, { "epoch": 2.281907433380084, "grad_norm": 0.24009181559085846, "learning_rate": 2.8773650551439467e-05, "loss": 0.0882, "step": 3254 }, { "epoch": 2.282608695652174, "grad_norm": 0.3048720359802246, "learning_rate": 2.876266857986249e-05, "loss": 0.2499, "step": 3255 }, { "epoch": 2.2833099579242635, "grad_norm": 0.2863018214702606, "learning_rate": 2.8751685865340138e-05, "loss": 0.258, "step": 3256 }, { "epoch": 2.2840112201963536, "grad_norm": 0.7741057276725769, "learning_rate": 2.8740702410040992e-05, "loss": 0.4223, "step": 3257 }, { "epoch": 2.2847124824684433, "grad_norm": 0.27372369170188904, "learning_rate": 2.8729718216133727e-05, "loss": 0.0826, "step": 3258 }, { "epoch": 2.285413744740533, "grad_norm": 0.2525983154773712, "learning_rate": 2.871873328578721e-05, "loss": 0.0873, "step": 3259 }, { "epoch": 2.2861150070126226, "grad_norm": 0.2993473708629608, "learning_rate": 2.870774762117042e-05, "loss": 0.2529, "step": 3260 }, { "epoch": 2.2868162692847127, "grad_norm": 0.24352528154850006, "learning_rate": 2.8696761224452512e-05, "loss": 0.0897, "step": 3261 }, { "epoch": 2.2875175315568024, "grad_norm": 0.24279065430164337, "learning_rate": 2.8685774097802752e-05, "loss": 0.0893, "step": 3262 }, { "epoch": 2.288218793828892, "grad_norm": 0.28333544731140137, "learning_rate": 2.867478624339057e-05, "loss": 0.2533, "step": 3263 }, { "epoch": 2.2889200561009817, "grad_norm": 0.24356992542743683, "learning_rate": 2.8663797663385555e-05, "loss": 0.09, "step": 3264 }, { "epoch": 2.2896213183730714, "grad_norm": 1.8143318891525269, "learning_rate": 2.86528083599574e-05, "loss": 0.4756, "step": 3265 }, { "epoch": 2.2903225806451615, "grad_norm": 0.2448534369468689, "learning_rate": 2.8641818335275973e-05, "loss": 0.0904, "step": 3266 }, { "epoch": 2.291023842917251, "grad_norm": 0.24328617751598358, "learning_rate": 2.8630827591511267e-05, "loss": 0.09, "step": 3267 }, { "epoch": 2.291725105189341, "grad_norm": 0.2762684226036072, "learning_rate": 2.8619836130833422e-05, "loss": 0.2535, "step": 3268 }, { "epoch": 2.2924263674614305, "grad_norm": 0.24432772397994995, "learning_rate": 2.8608843955412728e-05, "loss": 0.0905, "step": 3269 }, { "epoch": 2.2931276297335206, "grad_norm": 0.2590939998626709, "learning_rate": 2.85978510674196e-05, "loss": 0.089, "step": 3270 }, { "epoch": 2.2938288920056102, "grad_norm": 0.28397807478904724, "learning_rate": 2.8586857469024618e-05, "loss": 0.0889, "step": 3271 }, { "epoch": 2.2945301542777, "grad_norm": 0.25818800926208496, "learning_rate": 2.857586316239847e-05, "loss": 0.0884, "step": 3272 }, { "epoch": 2.2952314165497896, "grad_norm": 2.243450403213501, "learning_rate": 2.8564868149712003e-05, "loss": 0.4638, "step": 3273 }, { "epoch": 2.2959326788218792, "grad_norm": 0.24573294818401337, "learning_rate": 2.8553872433136215e-05, "loss": 0.0909, "step": 3274 }, { "epoch": 2.296633941093969, "grad_norm": 0.28789275884628296, "learning_rate": 2.854287601484221e-05, "loss": 0.0878, "step": 3275 }, { "epoch": 2.297335203366059, "grad_norm": 0.24234077334403992, "learning_rate": 2.8531878897001264e-05, "loss": 0.0896, "step": 3276 }, { "epoch": 2.2980364656381487, "grad_norm": 0.3196568191051483, "learning_rate": 2.8520881081784773e-05, "loss": 0.2496, "step": 3277 }, { "epoch": 2.2987377279102383, "grad_norm": 0.7678713798522949, "learning_rate": 2.8509882571364266e-05, "loss": 0.4182, "step": 3278 }, { "epoch": 2.299438990182328, "grad_norm": 0.257572740316391, "learning_rate": 2.849888336791144e-05, "loss": 0.089, "step": 3279 }, { "epoch": 2.300140252454418, "grad_norm": 0.2769939601421356, "learning_rate": 2.848788347359808e-05, "loss": 0.2547, "step": 3280 }, { "epoch": 2.3008415147265078, "grad_norm": 0.26053041219711304, "learning_rate": 2.8476882890596146e-05, "loss": 0.0893, "step": 3281 }, { "epoch": 2.3015427769985974, "grad_norm": 0.2598724067211151, "learning_rate": 2.8465881621077723e-05, "loss": 0.0888, "step": 3282 }, { "epoch": 2.302244039270687, "grad_norm": 0.24306893348693848, "learning_rate": 2.845487966721503e-05, "loss": 0.0902, "step": 3283 }, { "epoch": 2.3029453015427768, "grad_norm": 0.2805124819278717, "learning_rate": 2.8443877031180426e-05, "loss": 0.2542, "step": 3284 }, { "epoch": 2.303646563814867, "grad_norm": 0.2878645360469818, "learning_rate": 2.843287371514639e-05, "loss": 0.0868, "step": 3285 }, { "epoch": 2.3043478260869565, "grad_norm": 0.24293234944343567, "learning_rate": 2.8421869721285548e-05, "loss": 0.0901, "step": 3286 }, { "epoch": 2.305049088359046, "grad_norm": 0.24373207986354828, "learning_rate": 2.8410865051770664e-05, "loss": 0.0901, "step": 3287 }, { "epoch": 2.305750350631136, "grad_norm": 0.24231962859630585, "learning_rate": 2.839985970877463e-05, "loss": 0.0897, "step": 3288 }, { "epoch": 2.306451612903226, "grad_norm": 0.2734346389770508, "learning_rate": 2.838885369447046e-05, "loss": 0.0895, "step": 3289 }, { "epoch": 2.3071528751753156, "grad_norm": 0.28825628757476807, "learning_rate": 2.837784701103131e-05, "loss": 0.2536, "step": 3290 }, { "epoch": 2.3078541374474053, "grad_norm": 0.27931585907936096, "learning_rate": 2.8366839660630483e-05, "loss": 0.2528, "step": 3291 }, { "epoch": 2.308555399719495, "grad_norm": 1.4054780006408691, "learning_rate": 2.8355831645441388e-05, "loss": 0.2853, "step": 3292 }, { "epoch": 2.3092566619915846, "grad_norm": 1.2973827123641968, "learning_rate": 2.834482296763758e-05, "loss": 0.2744, "step": 3293 }, { "epoch": 2.3099579242636747, "grad_norm": 2.4039371013641357, "learning_rate": 2.8333813629392736e-05, "loss": 0.6223, "step": 3294 }, { "epoch": 2.3106591865357644, "grad_norm": 0.24294790625572205, "learning_rate": 2.8322803632880678e-05, "loss": 0.09, "step": 3295 }, { "epoch": 2.311360448807854, "grad_norm": 0.29712212085723877, "learning_rate": 2.8311792980275337e-05, "loss": 0.257, "step": 3296 }, { "epoch": 2.3120617110799437, "grad_norm": 0.27136024832725525, "learning_rate": 2.8300781673750792e-05, "loss": 0.251, "step": 3297 }, { "epoch": 2.312762973352034, "grad_norm": 0.2430090755224228, "learning_rate": 2.8289769715481246e-05, "loss": 0.0903, "step": 3298 }, { "epoch": 2.3134642356241235, "grad_norm": 0.27329957485198975, "learning_rate": 2.8278757107641035e-05, "loss": 0.2544, "step": 3299 }, { "epoch": 2.314165497896213, "grad_norm": 0.7537764310836792, "learning_rate": 2.8267743852404598e-05, "loss": 0.4153, "step": 3300 }, { "epoch": 2.314866760168303, "grad_norm": 1.1999328136444092, "learning_rate": 2.8256729951946536e-05, "loss": 0.2445, "step": 3301 }, { "epoch": 2.3155680224403925, "grad_norm": 0.3165384531021118, "learning_rate": 2.8245715408441565e-05, "loss": 0.094, "step": 3302 }, { "epoch": 2.3162692847124826, "grad_norm": 0.7472425103187561, "learning_rate": 2.8234700224064513e-05, "loss": 0.4122, "step": 3303 }, { "epoch": 2.3169705469845723, "grad_norm": 0.2814728319644928, "learning_rate": 2.8223684400990352e-05, "loss": 0.0955, "step": 3304 }, { "epoch": 2.317671809256662, "grad_norm": 0.28327780961990356, "learning_rate": 2.8212667941394178e-05, "loss": 0.0953, "step": 3305 }, { "epoch": 2.3183730715287516, "grad_norm": 0.26035556197166443, "learning_rate": 2.8201650847451217e-05, "loss": 0.2506, "step": 3306 }, { "epoch": 2.3190743338008417, "grad_norm": 0.2915675640106201, "learning_rate": 2.8190633121336797e-05, "loss": 0.0969, "step": 3307 }, { "epoch": 2.3197755960729314, "grad_norm": 0.26168426871299744, "learning_rate": 2.8179614765226392e-05, "loss": 0.2535, "step": 3308 }, { "epoch": 2.320476858345021, "grad_norm": 0.2516910433769226, "learning_rate": 2.8168595781295608e-05, "loss": 0.0931, "step": 3309 }, { "epoch": 2.3211781206171107, "grad_norm": 1.1921794414520264, "learning_rate": 2.8157576171720145e-05, "loss": 0.5732, "step": 3310 }, { "epoch": 2.3218793828892004, "grad_norm": 0.2589487135410309, "learning_rate": 2.814655593867585e-05, "loss": 0.256, "step": 3311 }, { "epoch": 2.3225806451612905, "grad_norm": 0.36331671476364136, "learning_rate": 2.8135535084338692e-05, "loss": 0.1045, "step": 3312 }, { "epoch": 2.32328190743338, "grad_norm": 0.2546813488006592, "learning_rate": 2.812451361088475e-05, "loss": 0.0941, "step": 3313 }, { "epoch": 2.32398316970547, "grad_norm": 0.7341532111167908, "learning_rate": 2.8113491520490243e-05, "loss": 0.405, "step": 3314 }, { "epoch": 2.3246844319775595, "grad_norm": 0.7131009101867676, "learning_rate": 2.8102468815331485e-05, "loss": 0.4147, "step": 3315 }, { "epoch": 2.3253856942496496, "grad_norm": 0.25395461916923523, "learning_rate": 2.8091445497584944e-05, "loss": 0.0941, "step": 3316 }, { "epoch": 2.3260869565217392, "grad_norm": 0.25758183002471924, "learning_rate": 2.808042156942719e-05, "loss": 0.2532, "step": 3317 }, { "epoch": 2.326788218793829, "grad_norm": 0.29513123631477356, "learning_rate": 2.8069397033034906e-05, "loss": 0.0994, "step": 3318 }, { "epoch": 2.3274894810659186, "grad_norm": 0.2560534179210663, "learning_rate": 2.805837189058492e-05, "loss": 0.0949, "step": 3319 }, { "epoch": 2.3281907433380082, "grad_norm": 0.2565765678882599, "learning_rate": 2.8047346144254154e-05, "loss": 0.0945, "step": 3320 }, { "epoch": 2.3288920056100983, "grad_norm": 0.2537195682525635, "learning_rate": 2.8036319796219668e-05, "loss": 0.0944, "step": 3321 }, { "epoch": 2.329593267882188, "grad_norm": 0.25610077381134033, "learning_rate": 2.802529284865863e-05, "loss": 0.0944, "step": 3322 }, { "epoch": 2.3302945301542777, "grad_norm": 0.3485375940799713, "learning_rate": 2.8014265303748334e-05, "loss": 0.1024, "step": 3323 }, { "epoch": 2.3309957924263673, "grad_norm": 0.40052253007888794, "learning_rate": 2.8003237163666174e-05, "loss": 0.1062, "step": 3324 }, { "epoch": 2.3316970546984574, "grad_norm": 0.25973019003868103, "learning_rate": 2.7992208430589688e-05, "loss": 0.2528, "step": 3325 }, { "epoch": 2.332398316970547, "grad_norm": 0.2538161873817444, "learning_rate": 2.7981179106696514e-05, "loss": 0.0941, "step": 3326 }, { "epoch": 2.333099579242637, "grad_norm": 0.25208914279937744, "learning_rate": 2.7970149194164413e-05, "loss": 0.0935, "step": 3327 }, { "epoch": 2.3338008415147264, "grad_norm": 0.2781520187854767, "learning_rate": 2.7959118695171267e-05, "loss": 0.0955, "step": 3328 }, { "epoch": 2.334502103786816, "grad_norm": 0.25988805294036865, "learning_rate": 2.794808761189505e-05, "loss": 0.2495, "step": 3329 }, { "epoch": 2.335203366058906, "grad_norm": 0.31274545192718506, "learning_rate": 2.7937055946513878e-05, "loss": 0.096, "step": 3330 }, { "epoch": 2.335904628330996, "grad_norm": 0.2592732906341553, "learning_rate": 2.792602370120598e-05, "loss": 0.2506, "step": 3331 }, { "epoch": 2.3366058906030855, "grad_norm": 0.2676469385623932, "learning_rate": 2.791499087814967e-05, "loss": 0.093, "step": 3332 }, { "epoch": 2.337307152875175, "grad_norm": 0.3186766803264618, "learning_rate": 2.790395747952343e-05, "loss": 0.0952, "step": 3333 }, { "epoch": 2.3380084151472653, "grad_norm": 0.2588379681110382, "learning_rate": 2.7892923507505792e-05, "loss": 0.2478, "step": 3334 }, { "epoch": 2.338709677419355, "grad_norm": 0.28545838594436646, "learning_rate": 2.7881888964275445e-05, "loss": 0.2524, "step": 3335 }, { "epoch": 2.3394109396914446, "grad_norm": 0.24935035407543182, "learning_rate": 2.787085385201118e-05, "loss": 0.0921, "step": 3336 }, { "epoch": 2.3401122019635343, "grad_norm": 0.24920083582401276, "learning_rate": 2.7859818172891906e-05, "loss": 0.0924, "step": 3337 }, { "epoch": 2.340813464235624, "grad_norm": 0.249175027012825, "learning_rate": 2.7848781929096618e-05, "loss": 0.0923, "step": 3338 }, { "epoch": 2.3415147265077136, "grad_norm": 0.2606315612792969, "learning_rate": 2.7837745122804455e-05, "loss": 0.2505, "step": 3339 }, { "epoch": 2.3422159887798037, "grad_norm": 0.27229925990104675, "learning_rate": 2.782670775619466e-05, "loss": 0.0913, "step": 3340 }, { "epoch": 2.3429172510518934, "grad_norm": 0.27982521057128906, "learning_rate": 2.7815669831446556e-05, "loss": 0.0895, "step": 3341 }, { "epoch": 2.343618513323983, "grad_norm": 0.24631983041763306, "learning_rate": 2.780463135073962e-05, "loss": 0.0911, "step": 3342 }, { "epoch": 2.3443197755960727, "grad_norm": 1.6089601516723633, "learning_rate": 2.7793592316253408e-05, "loss": 0.2804, "step": 3343 }, { "epoch": 2.345021037868163, "grad_norm": 0.24516960978507996, "learning_rate": 2.7782552730167614e-05, "loss": 0.0905, "step": 3344 }, { "epoch": 2.3457223001402525, "grad_norm": 0.2560283839702606, "learning_rate": 2.7771512594662e-05, "loss": 0.0889, "step": 3345 }, { "epoch": 2.346423562412342, "grad_norm": 0.7644343376159668, "learning_rate": 2.7760471911916474e-05, "loss": 0.4154, "step": 3346 }, { "epoch": 2.347124824684432, "grad_norm": 0.2735362648963928, "learning_rate": 2.774943068411104e-05, "loss": 0.2506, "step": 3347 }, { "epoch": 2.3478260869565215, "grad_norm": 0.24449487030506134, "learning_rate": 2.7738388913425794e-05, "loss": 0.0907, "step": 3348 }, { "epoch": 2.3485273492286116, "grad_norm": 0.2596885859966278, "learning_rate": 2.772734660204096e-05, "loss": 0.0901, "step": 3349 }, { "epoch": 2.3492286115007013, "grad_norm": 0.24283604323863983, "learning_rate": 2.7716303752136864e-05, "loss": 0.0901, "step": 3350 }, { "epoch": 2.349929873772791, "grad_norm": 0.2563019096851349, "learning_rate": 2.7705260365893938e-05, "loss": 0.0889, "step": 3351 }, { "epoch": 2.3506311360448806, "grad_norm": 0.7605876922607422, "learning_rate": 2.7694216445492703e-05, "loss": 0.4151, "step": 3352 }, { "epoch": 2.3513323983169707, "grad_norm": 1.5021928548812866, "learning_rate": 2.7683171993113815e-05, "loss": 0.4453, "step": 3353 }, { "epoch": 2.3520336605890604, "grad_norm": 0.7563725113868713, "learning_rate": 2.767212701093802e-05, "loss": 0.4176, "step": 3354 }, { "epoch": 2.35273492286115, "grad_norm": 0.2598414421081543, "learning_rate": 2.7661081501146152e-05, "loss": 0.0896, "step": 3355 }, { "epoch": 2.3534361851332397, "grad_norm": 0.2862800359725952, "learning_rate": 2.7650035465919184e-05, "loss": 0.0898, "step": 3356 }, { "epoch": 2.3541374474053294, "grad_norm": 0.2783491909503937, "learning_rate": 2.7638988907438168e-05, "loss": 0.0879, "step": 3357 }, { "epoch": 2.3548387096774195, "grad_norm": 0.277108371257782, "learning_rate": 2.762794182788427e-05, "loss": 0.2537, "step": 3358 }, { "epoch": 2.355539971949509, "grad_norm": 0.2652290165424347, "learning_rate": 2.761689422943874e-05, "loss": 0.0917, "step": 3359 }, { "epoch": 2.356241234221599, "grad_norm": 0.2614194452762604, "learning_rate": 2.7605846114282964e-05, "loss": 0.0903, "step": 3360 }, { "epoch": 2.3569424964936885, "grad_norm": 0.27480942010879517, "learning_rate": 2.7594797484598402e-05, "loss": 0.2526, "step": 3361 }, { "epoch": 2.3576437587657786, "grad_norm": 0.2476104497909546, "learning_rate": 2.7583748342566622e-05, "loss": 0.0918, "step": 3362 }, { "epoch": 2.3583450210378682, "grad_norm": 0.2603505551815033, "learning_rate": 2.7572698690369303e-05, "loss": 0.0902, "step": 3363 }, { "epoch": 2.359046283309958, "grad_norm": 1.1314260959625244, "learning_rate": 2.756164853018821e-05, "loss": 0.2525, "step": 3364 }, { "epoch": 2.3597475455820476, "grad_norm": 0.2641344964504242, "learning_rate": 2.755059786420523e-05, "loss": 0.0916, "step": 3365 }, { "epoch": 2.3604488078541372, "grad_norm": 1.2542564868927002, "learning_rate": 2.7539546694602327e-05, "loss": 0.4039, "step": 3366 }, { "epoch": 2.3611500701262274, "grad_norm": 0.339470237493515, "learning_rate": 2.7528495023561562e-05, "loss": 0.0928, "step": 3367 }, { "epoch": 2.361851332398317, "grad_norm": 0.26740363240242004, "learning_rate": 2.751744285326513e-05, "loss": 0.2541, "step": 3368 }, { "epoch": 2.3625525946704067, "grad_norm": 1.2319340705871582, "learning_rate": 2.7506390185895283e-05, "loss": 0.5779, "step": 3369 }, { "epoch": 2.3632538569424963, "grad_norm": 0.2692048251628876, "learning_rate": 2.7495337023634393e-05, "loss": 0.2561, "step": 3370 }, { "epoch": 2.3639551192145865, "grad_norm": 0.34568557143211365, "learning_rate": 2.748428336866493e-05, "loss": 0.0936, "step": 3371 }, { "epoch": 2.364656381486676, "grad_norm": 0.2687840461730957, "learning_rate": 2.747322922316946e-05, "loss": 0.0931, "step": 3372 }, { "epoch": 2.365357643758766, "grad_norm": 0.2749999463558197, "learning_rate": 2.7462174589330632e-05, "loss": 0.2543, "step": 3373 }, { "epoch": 2.3660589060308554, "grad_norm": 0.25041043758392334, "learning_rate": 2.7451119469331206e-05, "loss": 0.0928, "step": 3374 }, { "epoch": 2.366760168302945, "grad_norm": 0.2706189453601837, "learning_rate": 2.7440063865354037e-05, "loss": 0.0925, "step": 3375 }, { "epoch": 2.367461430575035, "grad_norm": 0.25863829255104065, "learning_rate": 2.7429007779582077e-05, "loss": 0.25, "step": 3376 }, { "epoch": 2.368162692847125, "grad_norm": 0.25123509764671326, "learning_rate": 2.7417951214198363e-05, "loss": 0.0929, "step": 3377 }, { "epoch": 2.3688639551192145, "grad_norm": 0.7404088973999023, "learning_rate": 2.740689417138603e-05, "loss": 0.4135, "step": 3378 }, { "epoch": 2.369565217391304, "grad_norm": 0.250693678855896, "learning_rate": 2.739583665332832e-05, "loss": 0.0927, "step": 3379 }, { "epoch": 2.3702664796633943, "grad_norm": 0.249741330742836, "learning_rate": 2.7384778662208555e-05, "loss": 0.0926, "step": 3380 }, { "epoch": 2.370967741935484, "grad_norm": 0.26362600922584534, "learning_rate": 2.7373720200210145e-05, "loss": 0.2517, "step": 3381 }, { "epoch": 2.3716690042075736, "grad_norm": 0.24981042742729187, "learning_rate": 2.7362661269516622e-05, "loss": 0.0927, "step": 3382 }, { "epoch": 2.3723702664796633, "grad_norm": 0.2663029432296753, "learning_rate": 2.7351601872311572e-05, "loss": 0.2527, "step": 3383 }, { "epoch": 2.373071528751753, "grad_norm": 0.2517035901546478, "learning_rate": 2.7340542010778696e-05, "loss": 0.0929, "step": 3384 }, { "epoch": 2.373772791023843, "grad_norm": 0.262564480304718, "learning_rate": 2.7329481687101795e-05, "loss": 0.2525, "step": 3385 }, { "epoch": 2.3744740532959328, "grad_norm": 0.26001301407814026, "learning_rate": 2.7318420903464732e-05, "loss": 0.2494, "step": 3386 }, { "epoch": 2.3751753155680224, "grad_norm": 0.30874037742614746, "learning_rate": 2.730735966205148e-05, "loss": 0.0943, "step": 3387 }, { "epoch": 2.375876577840112, "grad_norm": 0.25282320380210876, "learning_rate": 2.7296297965046114e-05, "loss": 0.0931, "step": 3388 }, { "epoch": 2.376577840112202, "grad_norm": 0.25160208344459534, "learning_rate": 2.7285235814632775e-05, "loss": 0.0929, "step": 3389 }, { "epoch": 2.377279102384292, "grad_norm": 0.7393691539764404, "learning_rate": 2.72741732129957e-05, "loss": 0.4079, "step": 3390 }, { "epoch": 2.3779803646563815, "grad_norm": 0.25194478034973145, "learning_rate": 2.7263110162319228e-05, "loss": 0.0931, "step": 3391 }, { "epoch": 2.378681626928471, "grad_norm": 0.7442572116851807, "learning_rate": 2.7252046664787773e-05, "loss": 0.4103, "step": 3392 }, { "epoch": 2.379382889200561, "grad_norm": 0.260816752910614, "learning_rate": 2.724098272258584e-05, "loss": 0.2507, "step": 3393 }, { "epoch": 2.380084151472651, "grad_norm": 0.7378200888633728, "learning_rate": 2.7229918337898025e-05, "loss": 0.4086, "step": 3394 }, { "epoch": 2.3807854137447406, "grad_norm": 0.31285110116004944, "learning_rate": 2.7218853512909003e-05, "loss": 0.0953, "step": 3395 }, { "epoch": 2.3814866760168303, "grad_norm": 0.25480741262435913, "learning_rate": 2.7207788249803563e-05, "loss": 0.252, "step": 3396 }, { "epoch": 2.38218793828892, "grad_norm": 0.3145519495010376, "learning_rate": 2.7196722550766535e-05, "loss": 0.0948, "step": 3397 }, { "epoch": 2.38288920056101, "grad_norm": 0.2848888337612152, "learning_rate": 2.7185656417982874e-05, "loss": 0.0942, "step": 3398 }, { "epoch": 2.3835904628330997, "grad_norm": 0.2627923786640167, "learning_rate": 2.717458985363761e-05, "loss": 0.2512, "step": 3399 }, { "epoch": 2.3842917251051894, "grad_norm": 0.2552478313446045, "learning_rate": 2.7163522859915846e-05, "loss": 0.0939, "step": 3400 }, { "epoch": 2.384992987377279, "grad_norm": 0.25450825691223145, "learning_rate": 2.715245543900278e-05, "loss": 0.0941, "step": 3401 }, { "epoch": 2.3856942496493687, "grad_norm": 0.27099254727363586, "learning_rate": 2.7141387593083696e-05, "loss": 0.0925, "step": 3402 }, { "epoch": 2.3863955119214584, "grad_norm": 0.2548091411590576, "learning_rate": 2.713031932434396e-05, "loss": 0.0937, "step": 3403 }, { "epoch": 2.3870967741935485, "grad_norm": 0.25370854139328003, "learning_rate": 2.711925063496902e-05, "loss": 0.0934, "step": 3404 }, { "epoch": 2.387798036465638, "grad_norm": 0.25299450755119324, "learning_rate": 2.7108181527144406e-05, "loss": 0.0931, "step": 3405 }, { "epoch": 2.388499298737728, "grad_norm": 0.2715561091899872, "learning_rate": 2.7097112003055737e-05, "loss": 0.0927, "step": 3406 }, { "epoch": 2.3892005610098175, "grad_norm": 0.27220383286476135, "learning_rate": 2.70860420648887e-05, "loss": 0.0931, "step": 3407 }, { "epoch": 2.3899018232819076, "grad_norm": 0.265610933303833, "learning_rate": 2.7074971714829083e-05, "loss": 0.0914, "step": 3408 }, { "epoch": 2.3906030855539973, "grad_norm": 0.25414496660232544, "learning_rate": 2.7063900955062737e-05, "loss": 0.0934, "step": 3409 }, { "epoch": 2.391304347826087, "grad_norm": 0.2856035828590393, "learning_rate": 2.705282978777562e-05, "loss": 0.0895, "step": 3410 }, { "epoch": 2.3920056100981766, "grad_norm": 0.27411580085754395, "learning_rate": 2.704175821515373e-05, "loss": 0.2531, "step": 3411 }, { "epoch": 2.3927068723702662, "grad_norm": 0.27150759100914, "learning_rate": 2.703068623938318e-05, "loss": 0.2488, "step": 3412 }, { "epoch": 2.3934081346423564, "grad_norm": 0.26109057664871216, "learning_rate": 2.701961386265015e-05, "loss": 0.0902, "step": 3413 }, { "epoch": 2.394109396914446, "grad_norm": 0.2817646265029907, "learning_rate": 2.70085410871409e-05, "loss": 0.0882, "step": 3414 }, { "epoch": 2.3948106591865357, "grad_norm": 0.24891383945941925, "learning_rate": 2.6997467915041765e-05, "loss": 0.0912, "step": 3415 }, { "epoch": 2.3955119214586253, "grad_norm": 0.24950651824474335, "learning_rate": 2.6986394348539163e-05, "loss": 0.0915, "step": 3416 }, { "epoch": 2.3962131837307155, "grad_norm": 0.26313304901123047, "learning_rate": 2.6975320389819596e-05, "loss": 0.0891, "step": 3417 }, { "epoch": 2.396914446002805, "grad_norm": 0.27867040038108826, "learning_rate": 2.6964246041069623e-05, "loss": 0.2505, "step": 3418 }, { "epoch": 2.397615708274895, "grad_norm": 0.28651952743530273, "learning_rate": 2.6953171304475905e-05, "loss": 0.2559, "step": 3419 }, { "epoch": 2.3983169705469845, "grad_norm": 0.2874240279197693, "learning_rate": 2.6942096182225162e-05, "loss": 0.2521, "step": 3420 }, { "epoch": 2.399018232819074, "grad_norm": 0.2539462447166443, "learning_rate": 2.6931020676504193e-05, "loss": 0.087, "step": 3421 }, { "epoch": 2.3997194950911642, "grad_norm": 0.7756835222244263, "learning_rate": 2.6919944789499884e-05, "loss": 0.4165, "step": 3422 }, { "epoch": 2.400420757363254, "grad_norm": 0.2909822165966034, "learning_rate": 2.6908868523399176e-05, "loss": 0.2465, "step": 3423 }, { "epoch": 2.4011220196353436, "grad_norm": 0.25753602385520935, "learning_rate": 2.6897791880389112e-05, "loss": 0.0882, "step": 3424 }, { "epoch": 2.401823281907433, "grad_norm": 0.2990642189979553, "learning_rate": 2.6886714862656788e-05, "loss": 0.2542, "step": 3425 }, { "epoch": 2.4025245441795233, "grad_norm": 0.25306564569473267, "learning_rate": 2.687563747238937e-05, "loss": 0.0864, "step": 3426 }, { "epoch": 2.403225806451613, "grad_norm": 0.2461203634738922, "learning_rate": 2.6864559711774116e-05, "loss": 0.0901, "step": 3427 }, { "epoch": 2.4039270687237027, "grad_norm": 0.7917917966842651, "learning_rate": 2.6853481582998352e-05, "loss": 0.4135, "step": 3428 }, { "epoch": 2.4046283309957923, "grad_norm": 0.24704532325267792, "learning_rate": 2.6842403088249466e-05, "loss": 0.0902, "step": 3429 }, { "epoch": 2.405329593267882, "grad_norm": 0.28426000475883484, "learning_rate": 2.6831324229714943e-05, "loss": 0.2501, "step": 3430 }, { "epoch": 2.406030855539972, "grad_norm": 0.24648572504520416, "learning_rate": 2.68202450095823e-05, "loss": 0.0903, "step": 3431 }, { "epoch": 2.4067321178120618, "grad_norm": 0.2877906560897827, "learning_rate": 2.6809165430039158e-05, "loss": 0.249, "step": 3432 }, { "epoch": 2.4074333800841514, "grad_norm": 0.2689806818962097, "learning_rate": 2.67980854932732e-05, "loss": 0.0826, "step": 3433 }, { "epoch": 2.408134642356241, "grad_norm": 0.2468959540128708, "learning_rate": 2.678700520147218e-05, "loss": 0.09, "step": 3434 }, { "epoch": 2.408835904628331, "grad_norm": 2.1860318183898926, "learning_rate": 2.6775924556823916e-05, "loss": 0.3482, "step": 3435 }, { "epoch": 2.409537166900421, "grad_norm": 0.2724279761314392, "learning_rate": 2.67648435615163e-05, "loss": 0.0836, "step": 3436 }, { "epoch": 2.4102384291725105, "grad_norm": 0.2671017348766327, "learning_rate": 2.6753762217737294e-05, "loss": 0.0881, "step": 3437 }, { "epoch": 2.4109396914446, "grad_norm": 1.6152141094207764, "learning_rate": 2.674268052767494e-05, "loss": 0.3148, "step": 3438 }, { "epoch": 2.41164095371669, "grad_norm": 0.2477981448173523, "learning_rate": 2.6731598493517317e-05, "loss": 0.0905, "step": 3439 }, { "epoch": 2.41234221598878, "grad_norm": 0.27955251932144165, "learning_rate": 2.6720516117452604e-05, "loss": 0.2483, "step": 3440 }, { "epoch": 2.4130434782608696, "grad_norm": 0.27903661131858826, "learning_rate": 2.6709433401669037e-05, "loss": 0.2466, "step": 3441 }, { "epoch": 2.4137447405329593, "grad_norm": 0.2557506859302521, "learning_rate": 2.6698350348354905e-05, "loss": 0.0869, "step": 3442 }, { "epoch": 2.414446002805049, "grad_norm": 0.25666478276252747, "learning_rate": 2.6687266959698587e-05, "loss": 0.0876, "step": 3443 }, { "epoch": 2.415147265077139, "grad_norm": 1.5221115350723267, "learning_rate": 2.667618323788852e-05, "loss": 0.3076, "step": 3444 }, { "epoch": 2.4158485273492287, "grad_norm": 0.24632646143436432, "learning_rate": 2.666509918511319e-05, "loss": 0.0905, "step": 3445 }, { "epoch": 2.4165497896213184, "grad_norm": 0.25943470001220703, "learning_rate": 2.665401480356117e-05, "loss": 0.0892, "step": 3446 }, { "epoch": 2.417251051893408, "grad_norm": 0.28152838349342346, "learning_rate": 2.664293009542109e-05, "loss": 0.0868, "step": 3447 }, { "epoch": 2.4179523141654977, "grad_norm": 0.24668672680854797, "learning_rate": 2.6631845062881654e-05, "loss": 0.0906, "step": 3448 }, { "epoch": 2.418653576437588, "grad_norm": 0.2768864035606384, "learning_rate": 2.6620759708131604e-05, "loss": 0.0865, "step": 3449 }, { "epoch": 2.4193548387096775, "grad_norm": 0.2741192579269409, "learning_rate": 2.6609674033359765e-05, "loss": 0.2563, "step": 3450 }, { "epoch": 2.420056100981767, "grad_norm": 0.27183738350868225, "learning_rate": 2.659858804075504e-05, "loss": 0.2501, "step": 3451 }, { "epoch": 2.420757363253857, "grad_norm": 0.26937195658683777, "learning_rate": 2.6587501732506353e-05, "loss": 0.2523, "step": 3452 }, { "epoch": 2.421458625525947, "grad_norm": 0.3011323809623718, "learning_rate": 2.6576415110802734e-05, "loss": 0.2552, "step": 3453 }, { "epoch": 2.4221598877980366, "grad_norm": 0.2463838905096054, "learning_rate": 2.6565328177833237e-05, "loss": 0.0902, "step": 3454 }, { "epoch": 2.4228611500701263, "grad_norm": 0.7737625241279602, "learning_rate": 2.6554240935787018e-05, "loss": 0.4199, "step": 3455 }, { "epoch": 2.423562412342216, "grad_norm": 0.28476402163505554, "learning_rate": 2.6543153386853254e-05, "loss": 0.0878, "step": 3456 }, { "epoch": 2.4242636746143056, "grad_norm": 0.2784368097782135, "learning_rate": 2.6532065533221207e-05, "loss": 0.2568, "step": 3457 }, { "epoch": 2.4249649368863957, "grad_norm": 0.24886301159858704, "learning_rate": 2.6520977377080196e-05, "loss": 0.0915, "step": 3458 }, { "epoch": 2.4256661991584854, "grad_norm": 0.260124146938324, "learning_rate": 2.650988892061959e-05, "loss": 0.0889, "step": 3459 }, { "epoch": 2.426367461430575, "grad_norm": 0.27296581864356995, "learning_rate": 2.6498800166028826e-05, "loss": 0.2505, "step": 3460 }, { "epoch": 2.4270687237026647, "grad_norm": 1.3001978397369385, "learning_rate": 2.64877111154974e-05, "loss": 0.267, "step": 3461 }, { "epoch": 2.427769985974755, "grad_norm": 0.2628309428691864, "learning_rate": 2.6476621771214866e-05, "loss": 0.0896, "step": 3462 }, { "epoch": 2.4284712482468445, "grad_norm": 0.24738667905330658, "learning_rate": 2.646553213537083e-05, "loss": 0.0914, "step": 3463 }, { "epoch": 2.429172510518934, "grad_norm": 0.2460295557975769, "learning_rate": 2.6454442210154956e-05, "loss": 0.0908, "step": 3464 }, { "epoch": 2.429873772791024, "grad_norm": 0.24620983004570007, "learning_rate": 2.6443351997756976e-05, "loss": 0.0909, "step": 3465 }, { "epoch": 2.4305750350631135, "grad_norm": 0.2634069323539734, "learning_rate": 2.6432261500366666e-05, "loss": 0.0907, "step": 3466 }, { "epoch": 2.431276297335203, "grad_norm": 0.25937342643737793, "learning_rate": 2.6421170720173865e-05, "loss": 0.0891, "step": 3467 }, { "epoch": 2.4319775596072932, "grad_norm": 0.27732354402542114, "learning_rate": 2.641007965936847e-05, "loss": 0.2571, "step": 3468 }, { "epoch": 2.432678821879383, "grad_norm": 1.1729481220245361, "learning_rate": 2.6398988320140432e-05, "loss": 0.2696, "step": 3469 }, { "epoch": 2.4333800841514726, "grad_norm": 0.30842918157577515, "learning_rate": 2.6387896704679744e-05, "loss": 0.0915, "step": 3470 }, { "epoch": 2.434081346423562, "grad_norm": 0.24746251106262207, "learning_rate": 2.6376804815176474e-05, "loss": 0.0913, "step": 3471 }, { "epoch": 2.4347826086956523, "grad_norm": 0.24529927968978882, "learning_rate": 2.6365712653820727e-05, "loss": 0.0904, "step": 3472 }, { "epoch": 2.435483870967742, "grad_norm": 0.26784807443618774, "learning_rate": 2.6354620222802677e-05, "loss": 0.0913, "step": 3473 }, { "epoch": 2.4361851332398317, "grad_norm": 0.26838889718055725, "learning_rate": 2.634352752431255e-05, "loss": 0.2511, "step": 3474 }, { "epoch": 2.4368863955119213, "grad_norm": 0.2459881603717804, "learning_rate": 2.63324345605406e-05, "loss": 0.0909, "step": 3475 }, { "epoch": 2.437587657784011, "grad_norm": 0.24473479390144348, "learning_rate": 2.6321341333677158e-05, "loss": 0.0908, "step": 3476 }, { "epoch": 2.438288920056101, "grad_norm": 0.24616147577762604, "learning_rate": 2.6310247845912607e-05, "loss": 0.091, "step": 3477 }, { "epoch": 2.4389901823281908, "grad_norm": 0.24565932154655457, "learning_rate": 2.6299154099437366e-05, "loss": 0.0908, "step": 3478 }, { "epoch": 2.4396914446002804, "grad_norm": 0.7556620836257935, "learning_rate": 2.628806009644193e-05, "loss": 0.4194, "step": 3479 }, { "epoch": 2.44039270687237, "grad_norm": 0.24295413494110107, "learning_rate": 2.627696583911681e-05, "loss": 0.0901, "step": 3480 }, { "epoch": 2.44109396914446, "grad_norm": 0.24392081797122955, "learning_rate": 2.62658713296526e-05, "loss": 0.0904, "step": 3481 }, { "epoch": 2.44179523141655, "grad_norm": 0.27104490995407104, "learning_rate": 2.625477657023992e-05, "loss": 0.2522, "step": 3482 }, { "epoch": 2.4424964936886395, "grad_norm": 0.27151843905448914, "learning_rate": 2.624368156306946e-05, "loss": 0.2513, "step": 3483 }, { "epoch": 2.443197755960729, "grad_norm": 0.26976433396339417, "learning_rate": 2.6232586310331942e-05, "loss": 0.2512, "step": 3484 }, { "epoch": 2.443899018232819, "grad_norm": 0.24463427066802979, "learning_rate": 2.6221490814218138e-05, "loss": 0.0902, "step": 3485 }, { "epoch": 2.444600280504909, "grad_norm": 0.2667744755744934, "learning_rate": 2.6210395076918888e-05, "loss": 0.091, "step": 3486 }, { "epoch": 2.4453015427769986, "grad_norm": 0.26209452748298645, "learning_rate": 2.619929910062504e-05, "loss": 0.0902, "step": 3487 }, { "epoch": 2.4460028050490883, "grad_norm": 0.2430185228586197, "learning_rate": 2.6188202887527536e-05, "loss": 0.0899, "step": 3488 }, { "epoch": 2.446704067321178, "grad_norm": 0.3017234802246094, "learning_rate": 2.6177106439817328e-05, "loss": 0.2599, "step": 3489 }, { "epoch": 2.447405329593268, "grad_norm": 0.24507279694080353, "learning_rate": 2.616600975968544e-05, "loss": 0.0905, "step": 3490 }, { "epoch": 2.4481065918653577, "grad_norm": 0.2585224509239197, "learning_rate": 2.615491284932292e-05, "loss": 0.0896, "step": 3491 }, { "epoch": 2.4488078541374474, "grad_norm": 0.34275227785110474, "learning_rate": 2.614381571092088e-05, "loss": 0.0916, "step": 3492 }, { "epoch": 2.449509116409537, "grad_norm": 0.24329149723052979, "learning_rate": 2.613271834667047e-05, "loss": 0.0897, "step": 3493 }, { "epoch": 2.4502103786816267, "grad_norm": 0.2419966459274292, "learning_rate": 2.6121620758762877e-05, "loss": 0.0896, "step": 3494 }, { "epoch": 2.450911640953717, "grad_norm": 0.31693077087402344, "learning_rate": 2.6110522949389337e-05, "loss": 0.2548, "step": 3495 }, { "epoch": 2.4516129032258065, "grad_norm": 1.1733355522155762, "learning_rate": 2.609942492074114e-05, "loss": 0.2643, "step": 3496 }, { "epoch": 2.452314165497896, "grad_norm": 0.2845483124256134, "learning_rate": 2.6088326675009605e-05, "loss": 0.2554, "step": 3497 }, { "epoch": 2.453015427769986, "grad_norm": 0.2887209355831146, "learning_rate": 2.60772282143861e-05, "loss": 0.0878, "step": 3498 }, { "epoch": 2.453716690042076, "grad_norm": 0.2816424071788788, "learning_rate": 2.606612954106204e-05, "loss": 0.2548, "step": 3499 }, { "epoch": 2.4544179523141656, "grad_norm": 0.24056391417980194, "learning_rate": 2.6055030657228875e-05, "loss": 0.089, "step": 3500 }, { "epoch": 2.4551192145862553, "grad_norm": 0.2556704878807068, "learning_rate": 2.6043931565078096e-05, "loss": 0.0876, "step": 3501 }, { "epoch": 2.455820476858345, "grad_norm": 0.31175515055656433, "learning_rate": 2.6032832266801233e-05, "loss": 0.0901, "step": 3502 }, { "epoch": 2.4565217391304346, "grad_norm": 0.29321202635765076, "learning_rate": 2.6021732764589875e-05, "loss": 0.0884, "step": 3503 }, { "epoch": 2.4572230014025247, "grad_norm": 0.25785592198371887, "learning_rate": 2.6010633060635624e-05, "loss": 0.0885, "step": 3504 }, { "epoch": 2.4579242636746144, "grad_norm": 0.7760153412818909, "learning_rate": 2.5999533157130146e-05, "loss": 0.4185, "step": 3505 }, { "epoch": 2.458625525946704, "grad_norm": 0.2570074498653412, "learning_rate": 2.5988433056265126e-05, "loss": 0.0881, "step": 3506 }, { "epoch": 2.4593267882187937, "grad_norm": 0.3236589729785919, "learning_rate": 2.5977332760232315e-05, "loss": 0.0866, "step": 3507 }, { "epoch": 2.460028050490884, "grad_norm": 0.2387881577014923, "learning_rate": 2.5966232271223463e-05, "loss": 0.088, "step": 3508 }, { "epoch": 2.4607293127629735, "grad_norm": 0.26764115691185, "learning_rate": 2.5955131591430394e-05, "loss": 0.0877, "step": 3509 }, { "epoch": 2.461430575035063, "grad_norm": 0.3037567436695099, "learning_rate": 2.594403072304496e-05, "loss": 0.2513, "step": 3510 }, { "epoch": 2.462131837307153, "grad_norm": 0.23899395763874054, "learning_rate": 2.5932929668259037e-05, "loss": 0.0878, "step": 3511 }, { "epoch": 2.4628330995792425, "grad_norm": 0.2505635917186737, "learning_rate": 2.5921828429264544e-05, "loss": 0.0858, "step": 3512 }, { "epoch": 2.4635343618513326, "grad_norm": 0.2534661293029785, "learning_rate": 2.5910727008253455e-05, "loss": 0.0861, "step": 3513 }, { "epoch": 2.4642356241234222, "grad_norm": 1.3295605182647705, "learning_rate": 2.5899625407417755e-05, "loss": 0.2845, "step": 3514 }, { "epoch": 2.464936886395512, "grad_norm": 0.24984674155712128, "learning_rate": 2.5888523628949472e-05, "loss": 0.085, "step": 3515 }, { "epoch": 2.4656381486676016, "grad_norm": 0.296974241733551, "learning_rate": 2.5877421675040676e-05, "loss": 0.258, "step": 3516 }, { "epoch": 2.4663394109396917, "grad_norm": 0.24816545844078064, "learning_rate": 2.586631954788347e-05, "loss": 0.0848, "step": 3517 }, { "epoch": 2.4670406732117813, "grad_norm": 0.2364434450864792, "learning_rate": 2.5855217249669977e-05, "loss": 0.0867, "step": 3518 }, { "epoch": 2.467741935483871, "grad_norm": 0.30199581384658813, "learning_rate": 2.5844114782592372e-05, "loss": 0.2592, "step": 3519 }, { "epoch": 2.4684431977559607, "grad_norm": 0.23630648851394653, "learning_rate": 2.5833012148842857e-05, "loss": 0.0864, "step": 3520 }, { "epoch": 2.4691444600280503, "grad_norm": 0.23630903661251068, "learning_rate": 2.582190935061366e-05, "loss": 0.0868, "step": 3521 }, { "epoch": 2.4698457223001404, "grad_norm": 0.23885880410671234, "learning_rate": 2.5810806390097063e-05, "loss": 0.0873, "step": 3522 }, { "epoch": 2.47054698457223, "grad_norm": 1.576619029045105, "learning_rate": 2.5799703269485343e-05, "loss": 0.4524, "step": 3523 }, { "epoch": 2.4712482468443198, "grad_norm": 0.2814185917377472, "learning_rate": 2.5788599990970848e-05, "loss": 0.0839, "step": 3524 }, { "epoch": 2.4719495091164094, "grad_norm": 0.23653890192508698, "learning_rate": 2.5777496556745924e-05, "loss": 0.0868, "step": 3525 }, { "epoch": 2.4726507713884995, "grad_norm": 0.3181670308113098, "learning_rate": 2.5766392969002974e-05, "loss": 0.2561, "step": 3526 }, { "epoch": 2.473352033660589, "grad_norm": 0.23690682649612427, "learning_rate": 2.575528922993442e-05, "loss": 0.0871, "step": 3527 }, { "epoch": 2.474053295932679, "grad_norm": 0.3329913020133972, "learning_rate": 2.5744185341732714e-05, "loss": 0.2536, "step": 3528 }, { "epoch": 2.4747545582047685, "grad_norm": 0.2370801568031311, "learning_rate": 2.5733081306590335e-05, "loss": 0.0869, "step": 3529 }, { "epoch": 2.475455820476858, "grad_norm": 0.24401795864105225, "learning_rate": 2.5721977126699794e-05, "loss": 0.0837, "step": 3530 }, { "epoch": 2.476157082748948, "grad_norm": 0.2534356117248535, "learning_rate": 2.5710872804253634e-05, "loss": 0.0855, "step": 3531 }, { "epoch": 2.476858345021038, "grad_norm": 0.23517289757728577, "learning_rate": 2.5699768341444412e-05, "loss": 0.0858, "step": 3532 }, { "epoch": 2.4775596072931276, "grad_norm": 0.23578320443630219, "learning_rate": 2.568866374046473e-05, "loss": 0.0862, "step": 3533 }, { "epoch": 2.4782608695652173, "grad_norm": 0.25577041506767273, "learning_rate": 2.567755900350722e-05, "loss": 0.0863, "step": 3534 }, { "epoch": 2.478962131837307, "grad_norm": 0.3121081292629242, "learning_rate": 2.5666454132764517e-05, "loss": 0.2537, "step": 3535 }, { "epoch": 2.479663394109397, "grad_norm": 0.30251410603523254, "learning_rate": 2.5655349130429307e-05, "loss": 0.081, "step": 3536 }, { "epoch": 2.4803646563814867, "grad_norm": 0.2329084426164627, "learning_rate": 2.564424399869428e-05, "loss": 0.0852, "step": 3537 }, { "epoch": 2.4810659186535764, "grad_norm": 0.270344614982605, "learning_rate": 2.5633138739752177e-05, "loss": 0.0825, "step": 3538 }, { "epoch": 2.481767180925666, "grad_norm": 0.3202342092990875, "learning_rate": 2.5622033355795742e-05, "loss": 0.2522, "step": 3539 }, { "epoch": 2.4824684431977557, "grad_norm": 0.2937227189540863, "learning_rate": 2.561092784901775e-05, "loss": 0.0786, "step": 3540 }, { "epoch": 2.483169705469846, "grad_norm": 0.23281534016132355, "learning_rate": 2.5599822221611007e-05, "loss": 0.0843, "step": 3541 }, { "epoch": 2.4838709677419355, "grad_norm": 0.26701444387435913, "learning_rate": 2.558871647576835e-05, "loss": 0.0804, "step": 3542 }, { "epoch": 2.484572230014025, "grad_norm": 0.23120924830436707, "learning_rate": 2.5577610613682602e-05, "loss": 0.0838, "step": 3543 }, { "epoch": 2.485273492286115, "grad_norm": 0.2417425960302353, "learning_rate": 2.556650463754665e-05, "loss": 0.0811, "step": 3544 }, { "epoch": 2.485974754558205, "grad_norm": 0.8413553237915039, "learning_rate": 2.5555398549553388e-05, "loss": 0.4394, "step": 3545 }, { "epoch": 2.4866760168302946, "grad_norm": 0.2517756223678589, "learning_rate": 2.5544292351895728e-05, "loss": 0.0754, "step": 3546 }, { "epoch": 2.4873772791023843, "grad_norm": 0.23679965734481812, "learning_rate": 2.553318604676661e-05, "loss": 0.0797, "step": 3547 }, { "epoch": 2.488078541374474, "grad_norm": 0.22961515188217163, "learning_rate": 2.552207963635899e-05, "loss": 0.0825, "step": 3548 }, { "epoch": 2.4887798036465636, "grad_norm": 0.23084484040737152, "learning_rate": 2.5510973122865856e-05, "loss": 0.0826, "step": 3549 }, { "epoch": 2.4894810659186537, "grad_norm": 0.3385494351387024, "learning_rate": 2.5499866508480197e-05, "loss": 0.2657, "step": 3550 }, { "epoch": 2.4901823281907434, "grad_norm": 3.2712345123291016, "learning_rate": 2.548875979539504e-05, "loss": 0.7636, "step": 3551 }, { "epoch": 2.490883590462833, "grad_norm": 0.2474018931388855, "learning_rate": 2.547765298580343e-05, "loss": 0.0741, "step": 3552 }, { "epoch": 2.4915848527349227, "grad_norm": 0.4151202440261841, "learning_rate": 2.5466546081898413e-05, "loss": 0.2691, "step": 3553 }, { "epoch": 2.492286115007013, "grad_norm": 0.23625291883945465, "learning_rate": 2.5455439085873068e-05, "loss": 0.0784, "step": 3554 }, { "epoch": 2.4929873772791025, "grad_norm": 0.24587447941303253, "learning_rate": 2.5444331999920495e-05, "loss": 0.0737, "step": 3555 }, { "epoch": 2.493688639551192, "grad_norm": 0.33179211616516113, "learning_rate": 2.5433224826233814e-05, "loss": 0.2591, "step": 3556 }, { "epoch": 2.494389901823282, "grad_norm": 0.32936322689056396, "learning_rate": 2.5422117567006143e-05, "loss": 0.2621, "step": 3557 }, { "epoch": 2.4950911640953715, "grad_norm": 1.8478659391403198, "learning_rate": 2.541101022443063e-05, "loss": 0.3433, "step": 3558 }, { "epoch": 2.4957924263674616, "grad_norm": 0.23049978911876678, "learning_rate": 2.5399902800700454e-05, "loss": 0.083, "step": 3559 }, { "epoch": 2.4964936886395512, "grad_norm": 0.2416684776544571, "learning_rate": 2.5388795298008776e-05, "loss": 0.0802, "step": 3560 }, { "epoch": 2.497194950911641, "grad_norm": 0.23885950446128845, "learning_rate": 2.5377687718548797e-05, "loss": 0.0794, "step": 3561 }, { "epoch": 2.4978962131837306, "grad_norm": 0.2383614033460617, "learning_rate": 2.536658006451374e-05, "loss": 0.0802, "step": 3562 }, { "epoch": 2.4985974754558207, "grad_norm": 0.33618223667144775, "learning_rate": 2.5355472338096814e-05, "loss": 0.2549, "step": 3563 }, { "epoch": 2.4992987377279103, "grad_norm": 1.7421714067459106, "learning_rate": 2.5344364541491266e-05, "loss": 0.3408, "step": 3564 }, { "epoch": 2.5, "grad_norm": 0.22994421422481537, "learning_rate": 2.533325667689035e-05, "loss": 0.0823, "step": 3565 }, { "epoch": 2.5007012622720897, "grad_norm": 0.2446928471326828, "learning_rate": 2.5322148746487335e-05, "loss": 0.0808, "step": 3566 }, { "epoch": 2.5014025245441793, "grad_norm": 1.5366231203079224, "learning_rate": 2.531104075247549e-05, "loss": 0.294, "step": 3567 }, { "epoch": 2.5021037868162694, "grad_norm": 0.3659988343715668, "learning_rate": 2.5299932697048118e-05, "loss": 0.2622, "step": 3568 }, { "epoch": 2.502805049088359, "grad_norm": 0.23249758780002594, "learning_rate": 2.528882458239852e-05, "loss": 0.084, "step": 3569 }, { "epoch": 2.5035063113604488, "grad_norm": 0.24892528355121613, "learning_rate": 2.527771641072001e-05, "loss": 0.083, "step": 3570 }, { "epoch": 2.5042075736325384, "grad_norm": 0.31039315462112427, "learning_rate": 2.5266608184205927e-05, "loss": 0.2573, "step": 3571 }, { "epoch": 2.5049088359046285, "grad_norm": 0.2476627230644226, "learning_rate": 2.525549990504959e-05, "loss": 0.0834, "step": 3572 }, { "epoch": 2.505610098176718, "grad_norm": 0.3449893295764923, "learning_rate": 2.5244391575444366e-05, "loss": 0.2608, "step": 3573 }, { "epoch": 2.506311360448808, "grad_norm": 0.8083913922309875, "learning_rate": 2.5233283197583603e-05, "loss": 0.4319, "step": 3574 }, { "epoch": 2.5070126227208975, "grad_norm": 0.2339041382074356, "learning_rate": 2.5222174773660673e-05, "loss": 0.0852, "step": 3575 }, { "epoch": 2.507713884992987, "grad_norm": 0.30803608894348145, "learning_rate": 2.5211066305868958e-05, "loss": 0.2602, "step": 3576 }, { "epoch": 2.5084151472650773, "grad_norm": 0.308057576417923, "learning_rate": 2.519995779640183e-05, "loss": 0.2594, "step": 3577 }, { "epoch": 2.509116409537167, "grad_norm": 0.30321547389030457, "learning_rate": 2.5188849247452696e-05, "loss": 0.2589, "step": 3578 }, { "epoch": 2.5098176718092566, "grad_norm": 0.33508414030075073, "learning_rate": 2.5177740661214956e-05, "loss": 0.2653, "step": 3579 }, { "epoch": 2.5105189340813463, "grad_norm": 0.23457840085029602, "learning_rate": 2.5166632039882016e-05, "loss": 0.0858, "step": 3580 }, { "epoch": 2.5112201963534364, "grad_norm": 0.3085915148258209, "learning_rate": 2.5155523385647294e-05, "loss": 0.2581, "step": 3581 }, { "epoch": 2.511921458625526, "grad_norm": 1.2742539644241333, "learning_rate": 2.5144414700704212e-05, "loss": 0.258, "step": 3582 }, { "epoch": 2.5126227208976157, "grad_norm": 0.235944926738739, "learning_rate": 2.5133305987246203e-05, "loss": 0.086, "step": 3583 }, { "epoch": 2.5133239831697054, "grad_norm": 0.2364029884338379, "learning_rate": 2.5122197247466693e-05, "loss": 0.0864, "step": 3584 }, { "epoch": 2.514025245441795, "grad_norm": 0.260657399892807, "learning_rate": 2.511108848355913e-05, "loss": 0.0878, "step": 3585 }, { "epoch": 2.5147265077138847, "grad_norm": 0.2990444600582123, "learning_rate": 2.5099979697716952e-05, "loss": 0.2602, "step": 3586 }, { "epoch": 2.515427769985975, "grad_norm": 0.30976614356040955, "learning_rate": 2.508887089213362e-05, "loss": 0.0896, "step": 3587 }, { "epoch": 2.5161290322580645, "grad_norm": 0.2638438642024994, "learning_rate": 2.507776206900257e-05, "loss": 0.0884, "step": 3588 }, { "epoch": 2.516830294530154, "grad_norm": 0.260149210691452, "learning_rate": 2.506665323051727e-05, "loss": 0.0874, "step": 3589 }, { "epoch": 2.5175315568022443, "grad_norm": 0.2383551150560379, "learning_rate": 2.5055544378871178e-05, "loss": 0.0871, "step": 3590 }, { "epoch": 2.518232819074334, "grad_norm": 0.23596780002117157, "learning_rate": 2.5044435516257748e-05, "loss": 0.0868, "step": 3591 }, { "epoch": 2.5189340813464236, "grad_norm": 0.23727767169475555, "learning_rate": 2.5033326644870446e-05, "loss": 0.0875, "step": 3592 }, { "epoch": 2.5196353436185133, "grad_norm": 0.23800122737884521, "learning_rate": 2.5022217766902745e-05, "loss": 0.0874, "step": 3593 }, { "epoch": 2.520336605890603, "grad_norm": 0.26009130477905273, "learning_rate": 2.501110888454811e-05, "loss": 0.0872, "step": 3594 }, { "epoch": 2.5210378681626926, "grad_norm": 0.23665904998779297, "learning_rate": 2.5e-05, "loss": 0.087, "step": 3595 }, { "epoch": 2.5217391304347827, "grad_norm": 0.2367362529039383, "learning_rate": 2.4988891115451894e-05, "loss": 0.087, "step": 3596 }, { "epoch": 2.5224403927068724, "grad_norm": 0.30338725447654724, "learning_rate": 2.4977782233097257e-05, "loss": 0.2551, "step": 3597 }, { "epoch": 2.523141654978962, "grad_norm": 0.25893768668174744, "learning_rate": 2.4966673355129556e-05, "loss": 0.0865, "step": 3598 }, { "epoch": 2.523842917251052, "grad_norm": 0.2606193423271179, "learning_rate": 2.4955564483742268e-05, "loss": 0.0854, "step": 3599 }, { "epoch": 2.524544179523142, "grad_norm": 0.2350812703371048, "learning_rate": 2.4944455621128835e-05, "loss": 0.0857, "step": 3600 }, { "epoch": 2.5252454417952315, "grad_norm": 0.2981743812561035, "learning_rate": 2.4933346769482737e-05, "loss": 0.2576, "step": 3601 }, { "epoch": 2.525946704067321, "grad_norm": 1.425345778465271, "learning_rate": 2.4922237930997434e-05, "loss": 0.4314, "step": 3602 }, { "epoch": 2.526647966339411, "grad_norm": 0.2512207329273224, "learning_rate": 2.4911129107866385e-05, "loss": 0.085, "step": 3603 }, { "epoch": 2.5273492286115005, "grad_norm": 0.27768924832344055, "learning_rate": 2.4900020302283043e-05, "loss": 0.0844, "step": 3604 }, { "epoch": 2.5280504908835906, "grad_norm": 0.25225791335105896, "learning_rate": 2.488891151644087e-05, "loss": 0.085, "step": 3605 }, { "epoch": 2.5287517531556802, "grad_norm": 0.26386016607284546, "learning_rate": 2.4877802752533316e-05, "loss": 0.0875, "step": 3606 }, { "epoch": 2.52945301542777, "grad_norm": 0.23359009623527527, "learning_rate": 2.486669401275381e-05, "loss": 0.0851, "step": 3607 }, { "epoch": 2.53015427769986, "grad_norm": 0.31599336862564087, "learning_rate": 2.4855585299295794e-05, "loss": 0.2604, "step": 3608 }, { "epoch": 2.5308555399719497, "grad_norm": 0.25586971640586853, "learning_rate": 2.484447661435271e-05, "loss": 0.0854, "step": 3609 }, { "epoch": 2.5315568022440393, "grad_norm": 0.2565004527568817, "learning_rate": 2.4833367960117987e-05, "loss": 0.0857, "step": 3610 }, { "epoch": 2.532258064516129, "grad_norm": 0.30212971568107605, "learning_rate": 2.4822259338785046e-05, "loss": 0.2582, "step": 3611 }, { "epoch": 2.5329593267882187, "grad_norm": 0.2315680831670761, "learning_rate": 2.4811150752547303e-05, "loss": 0.0845, "step": 3612 }, { "epoch": 2.5336605890603083, "grad_norm": 0.2329239845275879, "learning_rate": 2.4800042203598177e-05, "loss": 0.0844, "step": 3613 }, { "epoch": 2.5343618513323984, "grad_norm": 0.3415561318397522, "learning_rate": 2.478893369413105e-05, "loss": 0.085, "step": 3614 }, { "epoch": 2.535063113604488, "grad_norm": 0.25424525141716003, "learning_rate": 2.4777825226339337e-05, "loss": 0.0851, "step": 3615 }, { "epoch": 2.5357643758765778, "grad_norm": 0.23098503053188324, "learning_rate": 2.4766716802416403e-05, "loss": 0.0839, "step": 3616 }, { "epoch": 2.536465638148668, "grad_norm": 0.23089084029197693, "learning_rate": 2.475560842455564e-05, "loss": 0.0837, "step": 3617 }, { "epoch": 2.5371669004207575, "grad_norm": 0.2804200351238251, "learning_rate": 2.474450009495041e-05, "loss": 0.0825, "step": 3618 }, { "epoch": 2.537868162692847, "grad_norm": 0.3225388824939728, "learning_rate": 2.473339181579408e-05, "loss": 0.2628, "step": 3619 }, { "epoch": 2.538569424964937, "grad_norm": 0.2576763927936554, "learning_rate": 2.4722283589279997e-05, "loss": 0.0819, "step": 3620 }, { "epoch": 2.5392706872370265, "grad_norm": 0.2845037281513214, "learning_rate": 2.4711175417601486e-05, "loss": 0.0772, "step": 3621 }, { "epoch": 2.539971949509116, "grad_norm": 0.3164820969104767, "learning_rate": 2.4700067302951888e-05, "loss": 0.2617, "step": 3622 }, { "epoch": 2.5406732117812063, "grad_norm": 0.24516040086746216, "learning_rate": 2.4688959247524516e-05, "loss": 0.08, "step": 3623 }, { "epoch": 2.541374474053296, "grad_norm": 0.28614404797554016, "learning_rate": 2.467785125351267e-05, "loss": 0.0762, "step": 3624 }, { "epoch": 2.5420757363253856, "grad_norm": 0.24311737716197968, "learning_rate": 2.4666743323109655e-05, "loss": 0.0786, "step": 3625 }, { "epoch": 2.5427769985974753, "grad_norm": 0.23610341548919678, "learning_rate": 2.4655635458508737e-05, "loss": 0.0784, "step": 3626 }, { "epoch": 2.5434782608695654, "grad_norm": 0.24331560730934143, "learning_rate": 2.4644527661903195e-05, "loss": 0.0777, "step": 3627 }, { "epoch": 2.544179523141655, "grad_norm": 0.862680971622467, "learning_rate": 2.463341993548627e-05, "loss": 0.4411, "step": 3628 }, { "epoch": 2.5448807854137447, "grad_norm": 0.23663192987442017, "learning_rate": 2.4622312281451206e-05, "loss": 0.0781, "step": 3629 }, { "epoch": 2.5455820476858344, "grad_norm": 0.23667560517787933, "learning_rate": 2.4611204701991227e-05, "loss": 0.0778, "step": 3630 }, { "epoch": 2.546283309957924, "grad_norm": 0.23469951748847961, "learning_rate": 2.460009719929955e-05, "loss": 0.0775, "step": 3631 }, { "epoch": 2.546984572230014, "grad_norm": 0.34396448731422424, "learning_rate": 2.4588989775569368e-05, "loss": 0.2611, "step": 3632 }, { "epoch": 2.547685834502104, "grad_norm": 0.2671811282634735, "learning_rate": 2.457788243299386e-05, "loss": 0.0699, "step": 3633 }, { "epoch": 2.5483870967741935, "grad_norm": 0.357298344373703, "learning_rate": 2.4566775173766195e-05, "loss": 0.2655, "step": 3634 }, { "epoch": 2.549088359046283, "grad_norm": 1.9227181673049927, "learning_rate": 2.455566800007951e-05, "loss": 0.3341, "step": 3635 }, { "epoch": 2.5497896213183733, "grad_norm": 0.3659982681274414, "learning_rate": 2.4544560914126938e-05, "loss": 0.2598, "step": 3636 }, { "epoch": 2.550490883590463, "grad_norm": 0.36803093552589417, "learning_rate": 2.4533453918101596e-05, "loss": 0.2572, "step": 3637 }, { "epoch": 2.5511921458625526, "grad_norm": 0.2451574206352234, "learning_rate": 2.4522347014196576e-05, "loss": 0.0725, "step": 3638 }, { "epoch": 2.5518934081346423, "grad_norm": 0.3434046804904938, "learning_rate": 2.451124020460496e-05, "loss": 0.2648, "step": 3639 }, { "epoch": 2.552594670406732, "grad_norm": 0.2268703579902649, "learning_rate": 2.4500133491519802e-05, "loss": 0.0809, "step": 3640 }, { "epoch": 2.553295932678822, "grad_norm": 0.355915367603302, "learning_rate": 2.4489026877134154e-05, "loss": 0.2689, "step": 3641 }, { "epoch": 2.5539971949509117, "grad_norm": 0.22591254115104675, "learning_rate": 2.4477920363641015e-05, "loss": 0.08, "step": 3642 }, { "epoch": 2.5546984572230014, "grad_norm": 1.4106336832046509, "learning_rate": 2.4466813953233396e-05, "loss": 0.63, "step": 3643 }, { "epoch": 2.555399719495091, "grad_norm": 0.22665391862392426, "learning_rate": 2.4455707648104277e-05, "loss": 0.0806, "step": 3644 }, { "epoch": 2.556100981767181, "grad_norm": 0.2279053032398224, "learning_rate": 2.4444601450446615e-05, "loss": 0.0807, "step": 3645 }, { "epoch": 2.556802244039271, "grad_norm": 0.2278132438659668, "learning_rate": 2.4433495362453352e-05, "loss": 0.0813, "step": 3646 }, { "epoch": 2.5575035063113605, "grad_norm": 0.35527917742729187, "learning_rate": 2.4422389386317397e-05, "loss": 0.2586, "step": 3647 }, { "epoch": 2.55820476858345, "grad_norm": 0.2272210717201233, "learning_rate": 2.4411283524231664e-05, "loss": 0.0815, "step": 3648 }, { "epoch": 2.55890603085554, "grad_norm": 0.22607366740703583, "learning_rate": 2.4400177778389e-05, "loss": 0.0806, "step": 3649 }, { "epoch": 2.5596072931276295, "grad_norm": 0.22864355146884918, "learning_rate": 2.4389072150982254e-05, "loss": 0.0812, "step": 3650 }, { "epoch": 2.5603085553997196, "grad_norm": 0.344063937664032, "learning_rate": 2.4377966644204264e-05, "loss": 0.2598, "step": 3651 }, { "epoch": 2.5610098176718092, "grad_norm": 0.2277609407901764, "learning_rate": 2.436686126024783e-05, "loss": 0.0816, "step": 3652 }, { "epoch": 2.561711079943899, "grad_norm": 0.22784243524074554, "learning_rate": 2.435575600130572e-05, "loss": 0.0809, "step": 3653 }, { "epoch": 2.562412342215989, "grad_norm": 0.34493762254714966, "learning_rate": 2.4344650869570696e-05, "loss": 0.2603, "step": 3654 }, { "epoch": 2.5631136044880787, "grad_norm": 0.2439390867948532, "learning_rate": 2.4333545867235493e-05, "loss": 0.0734, "step": 3655 }, { "epoch": 2.5638148667601683, "grad_norm": 0.22761768102645874, "learning_rate": 2.432244099649279e-05, "loss": 0.0802, "step": 3656 }, { "epoch": 2.564516129032258, "grad_norm": 0.23536397516727448, "learning_rate": 2.4311336259535273e-05, "loss": 0.078, "step": 3657 }, { "epoch": 2.5652173913043477, "grad_norm": 0.3507789075374603, "learning_rate": 2.4300231658555594e-05, "loss": 0.2695, "step": 3658 }, { "epoch": 2.5659186535764373, "grad_norm": 0.3545868992805481, "learning_rate": 2.4289127195746375e-05, "loss": 0.2584, "step": 3659 }, { "epoch": 2.5666199158485274, "grad_norm": 0.3450091481208801, "learning_rate": 2.427802287330021e-05, "loss": 0.2603, "step": 3660 }, { "epoch": 2.567321178120617, "grad_norm": 0.3389855623245239, "learning_rate": 2.426691869340967e-05, "loss": 0.2646, "step": 3661 }, { "epoch": 2.5680224403927068, "grad_norm": 0.2330239713191986, "learning_rate": 2.425581465826729e-05, "loss": 0.0767, "step": 3662 }, { "epoch": 2.568723702664797, "grad_norm": 0.3384527862071991, "learning_rate": 2.4244710770065587e-05, "loss": 0.2586, "step": 3663 }, { "epoch": 2.5694249649368865, "grad_norm": 0.2273813933134079, "learning_rate": 2.423360703099703e-05, "loss": 0.0807, "step": 3664 }, { "epoch": 2.570126227208976, "grad_norm": 0.8712243437767029, "learning_rate": 2.422250344325408e-05, "loss": 0.445, "step": 3665 }, { "epoch": 2.570827489481066, "grad_norm": 0.2342926561832428, "learning_rate": 2.421140000902916e-05, "loss": 0.0765, "step": 3666 }, { "epoch": 2.5715287517531555, "grad_norm": 2.702312707901001, "learning_rate": 2.420029673051466e-05, "loss": 0.735, "step": 3667 }, { "epoch": 2.572230014025245, "grad_norm": 0.2300148606300354, "learning_rate": 2.418919360990295e-05, "loss": 0.0765, "step": 3668 }, { "epoch": 2.5729312762973353, "grad_norm": 0.3371488153934479, "learning_rate": 2.417809064938634e-05, "loss": 0.2633, "step": 3669 }, { "epoch": 2.573632538569425, "grad_norm": 0.24263927340507507, "learning_rate": 2.416698785115715e-05, "loss": 0.0733, "step": 3670 }, { "epoch": 2.5743338008415146, "grad_norm": 0.24343614280223846, "learning_rate": 2.415588521740763e-05, "loss": 0.0736, "step": 3671 }, { "epoch": 2.5750350631136047, "grad_norm": 1.7873142957687378, "learning_rate": 2.414478275033003e-05, "loss": 0.344, "step": 3672 }, { "epoch": 2.5757363253856944, "grad_norm": 1.7564606666564941, "learning_rate": 2.4133680452116535e-05, "loss": 0.3405, "step": 3673 }, { "epoch": 2.576437587657784, "grad_norm": 0.38867616653442383, "learning_rate": 2.4122578324959326e-05, "loss": 0.2621, "step": 3674 }, { "epoch": 2.5771388499298737, "grad_norm": 0.3331247866153717, "learning_rate": 2.4111476371050537e-05, "loss": 0.258, "step": 3675 }, { "epoch": 2.5778401122019634, "grad_norm": 0.8369092345237732, "learning_rate": 2.4100374592582257e-05, "loss": 0.4367, "step": 3676 }, { "epoch": 2.578541374474053, "grad_norm": 0.26253843307495117, "learning_rate": 2.4089272991746554e-05, "loss": 0.0791, "step": 3677 }, { "epoch": 2.579242636746143, "grad_norm": 0.2604319155216217, "learning_rate": 2.407817157073546e-05, "loss": 0.0798, "step": 3678 }, { "epoch": 2.579943899018233, "grad_norm": 0.8255321383476257, "learning_rate": 2.4067070331740972e-05, "loss": 0.4361, "step": 3679 }, { "epoch": 2.5806451612903225, "grad_norm": 0.23150750994682312, "learning_rate": 2.4055969276955047e-05, "loss": 0.0847, "step": 3680 }, { "epoch": 2.5813464235624126, "grad_norm": 3.255382776260376, "learning_rate": 2.4044868408569608e-05, "loss": 0.5472, "step": 3681 }, { "epoch": 2.5820476858345023, "grad_norm": 0.30068203806877136, "learning_rate": 2.403376772877655e-05, "loss": 0.2579, "step": 3682 }, { "epoch": 2.582748948106592, "grad_norm": 0.3087069094181061, "learning_rate": 2.4022667239767698e-05, "loss": 0.2624, "step": 3683 }, { "epoch": 2.5834502103786816, "grad_norm": 0.23513931035995483, "learning_rate": 2.401156694373488e-05, "loss": 0.0864, "step": 3684 }, { "epoch": 2.5841514726507713, "grad_norm": 0.2701314091682434, "learning_rate": 2.4000466842869864e-05, "loss": 0.0844, "step": 3685 }, { "epoch": 2.584852734922861, "grad_norm": 1.8514221906661987, "learning_rate": 2.398936693936438e-05, "loss": 0.617, "step": 3686 }, { "epoch": 2.585553997194951, "grad_norm": 0.29248273372650146, "learning_rate": 2.3978267235410128e-05, "loss": 0.2564, "step": 3687 }, { "epoch": 2.5862552594670407, "grad_norm": 0.23581059277057648, "learning_rate": 2.3967167733198766e-05, "loss": 0.0871, "step": 3688 }, { "epoch": 2.5869565217391304, "grad_norm": 1.0290906429290771, "learning_rate": 2.395606843492192e-05, "loss": 0.2528, "step": 3689 }, { "epoch": 2.58765778401122, "grad_norm": 1.1149137020111084, "learning_rate": 2.3944969342771134e-05, "loss": 0.2582, "step": 3690 }, { "epoch": 2.58835904628331, "grad_norm": 0.2547766864299774, "learning_rate": 2.3933870458937964e-05, "loss": 0.0881, "step": 3691 }, { "epoch": 2.5890603085554, "grad_norm": 0.26197370886802673, "learning_rate": 2.3922771785613902e-05, "loss": 0.0899, "step": 3692 }, { "epoch": 2.5897615708274895, "grad_norm": 0.2623934745788574, "learning_rate": 2.3911673324990397e-05, "loss": 0.0907, "step": 3693 }, { "epoch": 2.590462833099579, "grad_norm": 0.2788720726966858, "learning_rate": 2.3900575079258863e-05, "loss": 0.2569, "step": 3694 }, { "epoch": 2.591164095371669, "grad_norm": 0.8049853444099426, "learning_rate": 2.3889477050610662e-05, "loss": 0.2122, "step": 3695 }, { "epoch": 2.591865357643759, "grad_norm": 0.27850595116615295, "learning_rate": 2.3878379241237136e-05, "loss": 0.2547, "step": 3696 }, { "epoch": 2.5925666199158486, "grad_norm": 0.3547700047492981, "learning_rate": 2.386728165332954e-05, "loss": 0.0981, "step": 3697 }, { "epoch": 2.5932678821879382, "grad_norm": 0.24286659061908722, "learning_rate": 2.385618428907912e-05, "loss": 0.09, "step": 3698 }, { "epoch": 2.593969144460028, "grad_norm": 0.2883000373840332, "learning_rate": 2.3845087150677083e-05, "loss": 0.2621, "step": 3699 }, { "epoch": 2.594670406732118, "grad_norm": 0.27105778455734253, "learning_rate": 2.3833990240314562e-05, "loss": 0.093, "step": 3700 }, { "epoch": 2.5953716690042077, "grad_norm": 0.27446356415748596, "learning_rate": 2.382289356018267e-05, "loss": 0.2541, "step": 3701 }, { "epoch": 2.5960729312762973, "grad_norm": 0.26837536692619324, "learning_rate": 2.3811797112472467e-05, "loss": 0.0927, "step": 3702 }, { "epoch": 2.596774193548387, "grad_norm": 0.27657103538513184, "learning_rate": 2.380070089937497e-05, "loss": 0.2549, "step": 3703 }, { "epoch": 2.5974754558204767, "grad_norm": 0.24316734075546265, "learning_rate": 2.3789604923081128e-05, "loss": 0.0903, "step": 3704 }, { "epoch": 2.598176718092567, "grad_norm": 0.270458459854126, "learning_rate": 2.3778509185781865e-05, "loss": 0.2515, "step": 3705 }, { "epoch": 2.5988779803646564, "grad_norm": 0.28101101517677307, "learning_rate": 2.3767413689668064e-05, "loss": 0.0959, "step": 3706 }, { "epoch": 2.599579242636746, "grad_norm": 0.2725071310997009, "learning_rate": 2.3756318436930542e-05, "loss": 0.2554, "step": 3707 }, { "epoch": 2.6002805049088358, "grad_norm": 0.2704225778579712, "learning_rate": 2.3745223429760076e-05, "loss": 0.252, "step": 3708 }, { "epoch": 2.600981767180926, "grad_norm": 0.24368372559547424, "learning_rate": 2.37341286703474e-05, "loss": 0.0906, "step": 3709 }, { "epoch": 2.6016830294530155, "grad_norm": 0.24414053559303284, "learning_rate": 2.372303416088319e-05, "loss": 0.0909, "step": 3710 }, { "epoch": 2.602384291725105, "grad_norm": 0.2690550982952118, "learning_rate": 2.3711939903558074e-05, "loss": 0.2582, "step": 3711 }, { "epoch": 2.603085553997195, "grad_norm": 0.27950671315193176, "learning_rate": 2.370084590056264e-05, "loss": 0.095, "step": 3712 }, { "epoch": 2.6037868162692845, "grad_norm": 0.28507891297340393, "learning_rate": 2.36897521540874e-05, "loss": 0.0957, "step": 3713 }, { "epoch": 2.604488078541374, "grad_norm": 0.37793204188346863, "learning_rate": 2.3678658666322848e-05, "loss": 0.1022, "step": 3714 }, { "epoch": 2.6051893408134643, "grad_norm": 0.7481350302696228, "learning_rate": 2.3667565439459407e-05, "loss": 0.4134, "step": 3715 }, { "epoch": 2.605890603085554, "grad_norm": 0.2803359031677246, "learning_rate": 2.365647247568746e-05, "loss": 0.2598, "step": 3716 }, { "epoch": 2.6065918653576436, "grad_norm": 0.26916080713272095, "learning_rate": 2.3645379777197325e-05, "loss": 0.093, "step": 3717 }, { "epoch": 2.6072931276297338, "grad_norm": 0.24118925631046295, "learning_rate": 2.3634287346179275e-05, "loss": 0.0899, "step": 3718 }, { "epoch": 2.6079943899018234, "grad_norm": 0.3406561017036438, "learning_rate": 2.3623195184823532e-05, "loss": 0.0991, "step": 3719 }, { "epoch": 2.608695652173913, "grad_norm": 1.2486287355422974, "learning_rate": 2.361210329532026e-05, "loss": 0.5828, "step": 3720 }, { "epoch": 2.6093969144460027, "grad_norm": 0.29481351375579834, "learning_rate": 2.3601011679859574e-05, "loss": 0.0947, "step": 3721 }, { "epoch": 2.6100981767180924, "grad_norm": 0.27445393800735474, "learning_rate": 2.3589920340631536e-05, "loss": 0.2551, "step": 3722 }, { "epoch": 2.610799438990182, "grad_norm": 0.27007123827934265, "learning_rate": 2.357882927982614e-05, "loss": 0.251, "step": 3723 }, { "epoch": 2.611500701262272, "grad_norm": 1.0871350765228271, "learning_rate": 2.3567738499633343e-05, "loss": 0.3761, "step": 3724 }, { "epoch": 2.612201963534362, "grad_norm": 0.24148240685462952, "learning_rate": 2.3556648002243033e-05, "loss": 0.0901, "step": 3725 }, { "epoch": 2.6129032258064515, "grad_norm": 0.2734684646129608, "learning_rate": 2.3545557789845053e-05, "loss": 0.0942, "step": 3726 }, { "epoch": 2.6136044880785416, "grad_norm": 0.2684231400489807, "learning_rate": 2.353446786462918e-05, "loss": 0.2513, "step": 3727 }, { "epoch": 2.6143057503506313, "grad_norm": 0.312842458486557, "learning_rate": 2.3523378228785136e-05, "loss": 0.0974, "step": 3728 }, { "epoch": 2.615007012622721, "grad_norm": 0.26884618401527405, "learning_rate": 2.3512288884502597e-05, "loss": 0.0922, "step": 3729 }, { "epoch": 2.6157082748948106, "grad_norm": 0.24503254890441895, "learning_rate": 2.3501199833971173e-05, "loss": 0.0912, "step": 3730 }, { "epoch": 2.6164095371669003, "grad_norm": 0.2441575527191162, "learning_rate": 2.349011107938042e-05, "loss": 0.091, "step": 3731 }, { "epoch": 2.61711079943899, "grad_norm": 0.312568336725235, "learning_rate": 2.3479022622919814e-05, "loss": 0.0967, "step": 3732 }, { "epoch": 2.61781206171108, "grad_norm": 0.2441946566104889, "learning_rate": 2.34679344667788e-05, "loss": 0.0909, "step": 3733 }, { "epoch": 2.6185133239831697, "grad_norm": 0.7454778552055359, "learning_rate": 2.3456846613146752e-05, "loss": 0.4196, "step": 3734 }, { "epoch": 2.6192145862552594, "grad_norm": 0.26960745453834534, "learning_rate": 2.3445759064212988e-05, "loss": 0.0935, "step": 3735 }, { "epoch": 2.6199158485273495, "grad_norm": 0.31614357233047485, "learning_rate": 2.3434671822166762e-05, "loss": 0.0964, "step": 3736 }, { "epoch": 2.620617110799439, "grad_norm": 0.24205970764160156, "learning_rate": 2.3423584889197268e-05, "loss": 0.0902, "step": 3737 }, { "epoch": 2.621318373071529, "grad_norm": 0.27145883440971375, "learning_rate": 2.3412498267493656e-05, "loss": 0.2534, "step": 3738 }, { "epoch": 2.6220196353436185, "grad_norm": 0.2668789327144623, "learning_rate": 2.340141195924497e-05, "loss": 0.0924, "step": 3739 }, { "epoch": 2.622720897615708, "grad_norm": 0.7542844414710999, "learning_rate": 2.3390325966640237e-05, "loss": 0.4159, "step": 3740 }, { "epoch": 2.623422159887798, "grad_norm": 0.2664961814880371, "learning_rate": 2.33792402918684e-05, "loss": 0.0923, "step": 3741 }, { "epoch": 2.624123422159888, "grad_norm": 0.7608965039253235, "learning_rate": 2.3368154937118352e-05, "loss": 0.4175, "step": 3742 }, { "epoch": 2.6248246844319776, "grad_norm": 0.2648885250091553, "learning_rate": 2.3357069904578908e-05, "loss": 0.0919, "step": 3743 }, { "epoch": 2.6255259467040672, "grad_norm": 0.2418198138475418, "learning_rate": 2.3345985196438828e-05, "loss": 0.0901, "step": 3744 }, { "epoch": 2.6262272089761574, "grad_norm": 0.26937344670295715, "learning_rate": 2.333490081488682e-05, "loss": 0.093, "step": 3745 }, { "epoch": 2.626928471248247, "grad_norm": 0.3046201467514038, "learning_rate": 2.3323816762111493e-05, "loss": 0.2581, "step": 3746 }, { "epoch": 2.6276297335203367, "grad_norm": 0.756880521774292, "learning_rate": 2.3312733040301416e-05, "loss": 0.4181, "step": 3747 }, { "epoch": 2.6283309957924264, "grad_norm": 0.7638749480247498, "learning_rate": 2.33016496516451e-05, "loss": 0.4246, "step": 3748 }, { "epoch": 2.629032258064516, "grad_norm": 0.2994075119495392, "learning_rate": 2.329056659833097e-05, "loss": 0.0941, "step": 3749 }, { "epoch": 2.6297335203366057, "grad_norm": 0.2703512907028198, "learning_rate": 2.32794838825474e-05, "loss": 0.2539, "step": 3750 }, { "epoch": 2.630434782608696, "grad_norm": 0.2661319971084595, "learning_rate": 2.3268401506482685e-05, "loss": 0.0924, "step": 3751 }, { "epoch": 2.6311360448807855, "grad_norm": 0.24177470803260803, "learning_rate": 2.3257319472325074e-05, "loss": 0.0901, "step": 3752 }, { "epoch": 2.631837307152875, "grad_norm": 0.24244892597198486, "learning_rate": 2.3246237782262712e-05, "loss": 0.0901, "step": 3753 }, { "epoch": 2.632538569424965, "grad_norm": 0.2721075713634491, "learning_rate": 2.323515643848371e-05, "loss": 0.255, "step": 3754 }, { "epoch": 2.633239831697055, "grad_norm": 0.24270842969417572, "learning_rate": 2.322407544317609e-05, "loss": 0.0903, "step": 3755 }, { "epoch": 2.6339410939691446, "grad_norm": 0.27275457978248596, "learning_rate": 2.3212994798527826e-05, "loss": 0.2542, "step": 3756 }, { "epoch": 2.634642356241234, "grad_norm": 0.2436903566122055, "learning_rate": 2.3201914506726802e-05, "loss": 0.0909, "step": 3757 }, { "epoch": 2.635343618513324, "grad_norm": 0.2726958096027374, "learning_rate": 2.319083456996084e-05, "loss": 0.2527, "step": 3758 }, { "epoch": 2.6360448807854135, "grad_norm": 0.24105072021484375, "learning_rate": 2.3179754990417707e-05, "loss": 0.09, "step": 3759 }, { "epoch": 2.6367461430575037, "grad_norm": 0.2692817747592926, "learning_rate": 2.3168675770285063e-05, "loss": 0.2527, "step": 3760 }, { "epoch": 2.6374474053295933, "grad_norm": 0.2416500598192215, "learning_rate": 2.3157596911750536e-05, "loss": 0.0901, "step": 3761 }, { "epoch": 2.638148667601683, "grad_norm": 0.2607724964618683, "learning_rate": 2.3146518417001654e-05, "loss": 0.0914, "step": 3762 }, { "epoch": 2.6388499298737726, "grad_norm": 0.2601964771747589, "learning_rate": 2.3135440288225886e-05, "loss": 0.0908, "step": 3763 }, { "epoch": 2.6395511921458628, "grad_norm": 0.24254143238067627, "learning_rate": 2.3124362527610634e-05, "loss": 0.0903, "step": 3764 }, { "epoch": 2.6402524544179524, "grad_norm": 0.24173586070537567, "learning_rate": 2.311328513734322e-05, "loss": 0.0899, "step": 3765 }, { "epoch": 2.640953716690042, "grad_norm": 0.285192608833313, "learning_rate": 2.3102208119610897e-05, "loss": 0.0898, "step": 3766 }, { "epoch": 2.6416549789621318, "grad_norm": 0.2561798393726349, "learning_rate": 2.3091131476600826e-05, "loss": 0.0899, "step": 3767 }, { "epoch": 2.6423562412342214, "grad_norm": 0.239500492811203, "learning_rate": 2.3080055210500125e-05, "loss": 0.0893, "step": 3768 }, { "epoch": 2.6430575035063115, "grad_norm": 0.24165430665016174, "learning_rate": 2.306897932349581e-05, "loss": 0.09, "step": 3769 }, { "epoch": 2.643758765778401, "grad_norm": 0.29783424735069275, "learning_rate": 2.3057903817774843e-05, "loss": 0.2565, "step": 3770 }, { "epoch": 2.644460028050491, "grad_norm": 0.2543928027153015, "learning_rate": 2.3046828695524104e-05, "loss": 0.0897, "step": 3771 }, { "epoch": 2.6451612903225805, "grad_norm": 0.27315181493759155, "learning_rate": 2.3035753958930382e-05, "loss": 0.0883, "step": 3772 }, { "epoch": 2.6458625525946706, "grad_norm": 0.27535468339920044, "learning_rate": 2.3024679610180417e-05, "loss": 0.2532, "step": 3773 }, { "epoch": 2.6465638148667603, "grad_norm": 0.23827339708805084, "learning_rate": 2.3013605651460842e-05, "loss": 0.0886, "step": 3774 }, { "epoch": 2.64726507713885, "grad_norm": 0.3340631425380707, "learning_rate": 2.300253208495824e-05, "loss": 0.2525, "step": 3775 }, { "epoch": 2.6479663394109396, "grad_norm": 0.30205512046813965, "learning_rate": 2.2991458912859103e-05, "loss": 0.0874, "step": 3776 }, { "epoch": 2.6486676016830293, "grad_norm": 0.27945777773857117, "learning_rate": 2.298038613734985e-05, "loss": 0.2546, "step": 3777 }, { "epoch": 2.649368863955119, "grad_norm": 0.23691603541374207, "learning_rate": 2.296931376061682e-05, "loss": 0.0881, "step": 3778 }, { "epoch": 2.650070126227209, "grad_norm": 0.23815809190273285, "learning_rate": 2.295824178484627e-05, "loss": 0.0884, "step": 3779 }, { "epoch": 2.6507713884992987, "grad_norm": 0.23711125552654266, "learning_rate": 2.2947170212224394e-05, "loss": 0.0883, "step": 3780 }, { "epoch": 2.6514726507713884, "grad_norm": 0.23591865599155426, "learning_rate": 2.2936099044937266e-05, "loss": 0.0877, "step": 3781 }, { "epoch": 2.6521739130434785, "grad_norm": 0.24851994216442108, "learning_rate": 2.2925028285170923e-05, "loss": 0.0867, "step": 3782 }, { "epoch": 2.652875175315568, "grad_norm": 0.2880420386791229, "learning_rate": 2.2913957935111306e-05, "loss": 0.2535, "step": 3783 }, { "epoch": 2.653576437587658, "grad_norm": 0.29304376244544983, "learning_rate": 2.290288799694427e-05, "loss": 0.2566, "step": 3784 }, { "epoch": 2.6542776998597475, "grad_norm": 0.2356521487236023, "learning_rate": 2.2891818472855593e-05, "loss": 0.0876, "step": 3785 }, { "epoch": 2.654978962131837, "grad_norm": 1.1615853309631348, "learning_rate": 2.2880749365030993e-05, "loss": 0.2772, "step": 3786 }, { "epoch": 2.655680224403927, "grad_norm": 0.3124368488788605, "learning_rate": 2.2869680675656048e-05, "loss": 0.0823, "step": 3787 }, { "epoch": 2.656381486676017, "grad_norm": 0.23592209815979004, "learning_rate": 2.2858612406916313e-05, "loss": 0.0874, "step": 3788 }, { "epoch": 2.6570827489481066, "grad_norm": 0.2346949279308319, "learning_rate": 2.2847544560997227e-05, "loss": 0.0871, "step": 3789 }, { "epoch": 2.6577840112201963, "grad_norm": 0.2986520230770111, "learning_rate": 2.283647714008416e-05, "loss": 0.2532, "step": 3790 }, { "epoch": 2.6584852734922864, "grad_norm": 0.24658754467964172, "learning_rate": 2.2825410146362395e-05, "loss": 0.0863, "step": 3791 }, { "epoch": 2.659186535764376, "grad_norm": 0.23694393038749695, "learning_rate": 2.2814343582017125e-05, "loss": 0.0875, "step": 3792 }, { "epoch": 2.6598877980364657, "grad_norm": 0.2667106091976166, "learning_rate": 2.2803277449233478e-05, "loss": 0.0847, "step": 3793 }, { "epoch": 2.6605890603085554, "grad_norm": 0.30544722080230713, "learning_rate": 2.279221175019645e-05, "loss": 0.2536, "step": 3794 }, { "epoch": 2.661290322580645, "grad_norm": 0.24210232496261597, "learning_rate": 2.2781146487091e-05, "loss": 0.0854, "step": 3795 }, { "epoch": 2.6619915848527347, "grad_norm": 0.8006841540336609, "learning_rate": 2.2770081662101984e-05, "loss": 0.4312, "step": 3796 }, { "epoch": 2.662692847124825, "grad_norm": 0.24481776356697083, "learning_rate": 2.2759017277414166e-05, "loss": 0.085, "step": 3797 }, { "epoch": 2.6633941093969145, "grad_norm": 1.5950207710266113, "learning_rate": 2.274795333521223e-05, "loss": 0.2703, "step": 3798 }, { "epoch": 2.664095371669004, "grad_norm": 0.25199368596076965, "learning_rate": 2.273688983768077e-05, "loss": 0.0872, "step": 3799 }, { "epoch": 2.6647966339410942, "grad_norm": 0.23617200553417206, "learning_rate": 2.2725826787004308e-05, "loss": 0.0869, "step": 3800 }, { "epoch": 2.665497896213184, "grad_norm": 0.29750490188598633, "learning_rate": 2.2714764185367234e-05, "loss": 0.256, "step": 3801 }, { "epoch": 2.6661991584852736, "grad_norm": 0.23684045672416687, "learning_rate": 2.2703702034953895e-05, "loss": 0.0873, "step": 3802 }, { "epoch": 2.666900420757363, "grad_norm": 0.2459167242050171, "learning_rate": 2.2692640337948524e-05, "loss": 0.0856, "step": 3803 }, { "epoch": 2.667601683029453, "grad_norm": 0.3051711320877075, "learning_rate": 2.2681579096535277e-05, "loss": 0.2534, "step": 3804 }, { "epoch": 2.6683029453015426, "grad_norm": 1.197557806968689, "learning_rate": 2.267051831289821e-05, "loss": 0.2725, "step": 3805 }, { "epoch": 2.6690042075736327, "grad_norm": 0.30372539162635803, "learning_rate": 2.2659457989221306e-05, "loss": 0.2577, "step": 3806 }, { "epoch": 2.6697054698457223, "grad_norm": 0.2645278871059418, "learning_rate": 2.2648398127688437e-05, "loss": 0.0838, "step": 3807 }, { "epoch": 2.670406732117812, "grad_norm": 0.2467213273048401, "learning_rate": 2.2637338730483384e-05, "loss": 0.0858, "step": 3808 }, { "epoch": 2.671107994389902, "grad_norm": 0.24899692833423615, "learning_rate": 2.262627979978986e-05, "loss": 0.0861, "step": 3809 }, { "epoch": 2.6718092566619918, "grad_norm": 0.2903290092945099, "learning_rate": 2.2615221337791455e-05, "loss": 0.087, "step": 3810 }, { "epoch": 2.6725105189340814, "grad_norm": 0.3029889762401581, "learning_rate": 2.2604163346671685e-05, "loss": 0.2587, "step": 3811 }, { "epoch": 2.673211781206171, "grad_norm": 0.32576119899749756, "learning_rate": 2.259310582861397e-05, "loss": 0.2611, "step": 3812 }, { "epoch": 2.6739130434782608, "grad_norm": 0.2504826784133911, "learning_rate": 2.258204878580164e-05, "loss": 0.0866, "step": 3813 }, { "epoch": 2.6746143057503504, "grad_norm": 0.2674291431903839, "learning_rate": 2.257099222041793e-05, "loss": 0.0843, "step": 3814 }, { "epoch": 2.6753155680224405, "grad_norm": 0.2919536828994751, "learning_rate": 2.255993613464597e-05, "loss": 0.2571, "step": 3815 }, { "epoch": 2.67601683029453, "grad_norm": 0.7907817959785461, "learning_rate": 2.25488805306688e-05, "loss": 0.4286, "step": 3816 }, { "epoch": 2.67671809256662, "grad_norm": 0.26756736636161804, "learning_rate": 2.2537825410669374e-05, "loss": 0.0835, "step": 3817 }, { "epoch": 2.6774193548387095, "grad_norm": 0.30573445558547974, "learning_rate": 2.2526770776830545e-05, "loss": 0.2606, "step": 3818 }, { "epoch": 2.6781206171107996, "grad_norm": 0.29613426327705383, "learning_rate": 2.251571663133507e-05, "loss": 0.2579, "step": 3819 }, { "epoch": 2.6788218793828893, "grad_norm": 0.23672990500926971, "learning_rate": 2.2504662976365613e-05, "loss": 0.0874, "step": 3820 }, { "epoch": 2.679523141654979, "grad_norm": 0.23613019287586212, "learning_rate": 2.2493609814104727e-05, "loss": 0.0872, "step": 3821 }, { "epoch": 2.6802244039270686, "grad_norm": 0.23601986467838287, "learning_rate": 2.248255714673488e-05, "loss": 0.0871, "step": 3822 }, { "epoch": 2.6809256661991583, "grad_norm": 0.283535361289978, "learning_rate": 2.247150497643844e-05, "loss": 0.2526, "step": 3823 }, { "epoch": 2.6816269284712484, "grad_norm": 0.2889503538608551, "learning_rate": 2.2460453305397683e-05, "loss": 0.2547, "step": 3824 }, { "epoch": 2.682328190743338, "grad_norm": 0.2497761994600296, "learning_rate": 2.244940213579477e-05, "loss": 0.0859, "step": 3825 }, { "epoch": 2.6830294530154277, "grad_norm": 0.2694022059440613, "learning_rate": 2.243835146981179e-05, "loss": 0.0854, "step": 3826 }, { "epoch": 2.6837307152875174, "grad_norm": 0.29656633734703064, "learning_rate": 2.24273013096307e-05, "loss": 0.258, "step": 3827 }, { "epoch": 2.6844319775596075, "grad_norm": 0.7961236834526062, "learning_rate": 2.2416251657433387e-05, "loss": 0.427, "step": 3828 }, { "epoch": 2.685133239831697, "grad_norm": 0.23561787605285645, "learning_rate": 2.240520251540161e-05, "loss": 0.0868, "step": 3829 }, { "epoch": 2.685834502103787, "grad_norm": 1.360349416732788, "learning_rate": 2.2394153885717045e-05, "loss": 0.2986, "step": 3830 }, { "epoch": 2.6865357643758765, "grad_norm": 0.29723024368286133, "learning_rate": 2.2383105770561264e-05, "loss": 0.2574, "step": 3831 }, { "epoch": 2.687237026647966, "grad_norm": 0.27330920100212097, "learning_rate": 2.237205817211574e-05, "loss": 0.086, "step": 3832 }, { "epoch": 2.6879382889200563, "grad_norm": 0.2709498405456543, "learning_rate": 2.2361011092561834e-05, "loss": 0.0841, "step": 3833 }, { "epoch": 2.688639551192146, "grad_norm": 0.795483410358429, "learning_rate": 2.2349964534080815e-05, "loss": 0.4267, "step": 3834 }, { "epoch": 2.6893408134642356, "grad_norm": 1.3026387691497803, "learning_rate": 2.2338918498853857e-05, "loss": 0.28, "step": 3835 }, { "epoch": 2.6900420757363253, "grad_norm": 0.23885291814804077, "learning_rate": 2.2327872989061992e-05, "loss": 0.0883, "step": 3836 }, { "epoch": 2.6907433380084154, "grad_norm": 0.2873568534851074, "learning_rate": 2.231682800688619e-05, "loss": 0.2574, "step": 3837 }, { "epoch": 2.691444600280505, "grad_norm": 0.28423821926116943, "learning_rate": 2.23057835545073e-05, "loss": 0.257, "step": 3838 }, { "epoch": 2.6921458625525947, "grad_norm": 0.7714678049087524, "learning_rate": 2.2294739634106068e-05, "loss": 0.4226, "step": 3839 }, { "epoch": 2.6928471248246844, "grad_norm": 0.2517342269420624, "learning_rate": 2.2283696247863135e-05, "loss": 0.0878, "step": 3840 }, { "epoch": 2.693548387096774, "grad_norm": 0.2799251675605774, "learning_rate": 2.227265339795904e-05, "loss": 0.2543, "step": 3841 }, { "epoch": 2.6942496493688637, "grad_norm": 0.28812792897224426, "learning_rate": 2.2261611086574215e-05, "loss": 0.2543, "step": 3842 }, { "epoch": 2.694950911640954, "grad_norm": 0.2803044021129608, "learning_rate": 2.2250569315888973e-05, "loss": 0.2563, "step": 3843 }, { "epoch": 2.6956521739130435, "grad_norm": 0.7625031471252441, "learning_rate": 2.223952808808353e-05, "loss": 0.4209, "step": 3844 }, { "epoch": 2.696353436185133, "grad_norm": 0.27391117811203003, "learning_rate": 2.2228487405338e-05, "loss": 0.0919, "step": 3845 }, { "epoch": 2.6970546984572232, "grad_norm": 0.27248650789260864, "learning_rate": 2.221744726983239e-05, "loss": 0.2537, "step": 3846 }, { "epoch": 2.697755960729313, "grad_norm": 0.2874460220336914, "learning_rate": 2.2206407683746588e-05, "loss": 0.2539, "step": 3847 }, { "epoch": 2.6984572230014026, "grad_norm": 0.2611757218837738, "learning_rate": 2.219536864926038e-05, "loss": 0.0904, "step": 3848 }, { "epoch": 2.6991584852734922, "grad_norm": 0.28354233503341675, "learning_rate": 2.2184330168553454e-05, "loss": 0.2578, "step": 3849 }, { "epoch": 2.699859747545582, "grad_norm": 0.26766934990882874, "learning_rate": 2.2173292243805354e-05, "loss": 0.0924, "step": 3850 }, { "epoch": 2.7005610098176716, "grad_norm": 0.2927875816822052, "learning_rate": 2.2162254877195547e-05, "loss": 0.0913, "step": 3851 }, { "epoch": 2.7012622720897617, "grad_norm": 0.24343961477279663, "learning_rate": 2.2151218070903388e-05, "loss": 0.0907, "step": 3852 }, { "epoch": 2.7019635343618513, "grad_norm": 1.8092209100723267, "learning_rate": 2.2140181827108103e-05, "loss": 0.5936, "step": 3853 }, { "epoch": 2.702664796633941, "grad_norm": 0.27061107754707336, "learning_rate": 2.212914614798882e-05, "loss": 0.2531, "step": 3854 }, { "epoch": 2.703366058906031, "grad_norm": 0.2601613998413086, "learning_rate": 2.2118111035724557e-05, "loss": 0.091, "step": 3855 }, { "epoch": 2.7040673211781208, "grad_norm": 0.750676155090332, "learning_rate": 2.2107076492494217e-05, "loss": 0.4101, "step": 3856 }, { "epoch": 2.7047685834502104, "grad_norm": 0.29318514466285706, "learning_rate": 2.2096042520476577e-05, "loss": 0.0914, "step": 3857 }, { "epoch": 2.7054698457223, "grad_norm": 0.2686457633972168, "learning_rate": 2.2085009121850332e-05, "loss": 0.0927, "step": 3858 }, { "epoch": 2.7061711079943898, "grad_norm": 0.29011526703834534, "learning_rate": 2.2073976298794027e-05, "loss": 0.0909, "step": 3859 }, { "epoch": 2.7068723702664794, "grad_norm": 0.29528045654296875, "learning_rate": 2.2062944053486125e-05, "loss": 0.0928, "step": 3860 }, { "epoch": 2.7075736325385695, "grad_norm": 0.26859307289123535, "learning_rate": 2.2051912388104954e-05, "loss": 0.2531, "step": 3861 }, { "epoch": 2.708274894810659, "grad_norm": 0.2699264883995056, "learning_rate": 2.2040881304828735e-05, "loss": 0.093, "step": 3862 }, { "epoch": 2.708976157082749, "grad_norm": 0.24588458240032196, "learning_rate": 2.202985080583559e-05, "loss": 0.0915, "step": 3863 }, { "epoch": 2.709677419354839, "grad_norm": 0.2887514531612396, "learning_rate": 2.2018820893303492e-05, "loss": 0.092, "step": 3864 }, { "epoch": 2.7103786816269286, "grad_norm": 0.24385443329811096, "learning_rate": 2.2007791569410318e-05, "loss": 0.0909, "step": 3865 }, { "epoch": 2.7110799438990183, "grad_norm": 0.2794100046157837, "learning_rate": 2.1996762836333832e-05, "loss": 0.255, "step": 3866 }, { "epoch": 2.711781206171108, "grad_norm": 0.28891631960868835, "learning_rate": 2.1985734696251675e-05, "loss": 0.0907, "step": 3867 }, { "epoch": 2.7124824684431976, "grad_norm": 0.24346667528152466, "learning_rate": 2.1974707151341373e-05, "loss": 0.091, "step": 3868 }, { "epoch": 2.7131837307152873, "grad_norm": 0.7698819041252136, "learning_rate": 2.1963680203780338e-05, "loss": 0.4184, "step": 3869 }, { "epoch": 2.7138849929873774, "grad_norm": 0.24360641837120056, "learning_rate": 2.1952653855745856e-05, "loss": 0.0907, "step": 3870 }, { "epoch": 2.714586255259467, "grad_norm": 0.26708096265792847, "learning_rate": 2.1941628109415088e-05, "loss": 0.2514, "step": 3871 }, { "epoch": 2.7152875175315567, "grad_norm": 0.25630369782447815, "learning_rate": 2.1930602966965096e-05, "loss": 0.0897, "step": 3872 }, { "epoch": 2.715988779803647, "grad_norm": 1.4103879928588867, "learning_rate": 2.191957843057282e-05, "loss": 0.4399, "step": 3873 }, { "epoch": 2.7166900420757365, "grad_norm": 0.27928197383880615, "learning_rate": 2.1908554502415058e-05, "loss": 0.2523, "step": 3874 }, { "epoch": 2.717391304347826, "grad_norm": 0.2585316002368927, "learning_rate": 2.1897531184668517e-05, "loss": 0.0903, "step": 3875 }, { "epoch": 2.718092566619916, "grad_norm": 0.25791674852371216, "learning_rate": 2.188650847950976e-05, "loss": 0.0903, "step": 3876 }, { "epoch": 2.7187938288920055, "grad_norm": 0.25702592730522156, "learning_rate": 2.1875486389115256e-05, "loss": 0.09, "step": 3877 }, { "epoch": 2.719495091164095, "grad_norm": 0.27587398886680603, "learning_rate": 2.1864464915661314e-05, "loss": 0.2531, "step": 3878 }, { "epoch": 2.7201963534361853, "grad_norm": 0.2879043519496918, "learning_rate": 2.1853444061324156e-05, "loss": 0.2551, "step": 3879 }, { "epoch": 2.720897615708275, "grad_norm": 0.2717130780220032, "learning_rate": 2.184242382827986e-05, "loss": 0.2527, "step": 3880 }, { "epoch": 2.7215988779803646, "grad_norm": 0.28530353307724, "learning_rate": 2.1831404218704395e-05, "loss": 0.251, "step": 3881 }, { "epoch": 2.7223001402524543, "grad_norm": 0.24496175348758698, "learning_rate": 2.1820385234773604e-05, "loss": 0.0911, "step": 3882 }, { "epoch": 2.7230014025245444, "grad_norm": 0.26240274310112, "learning_rate": 2.1809366878663203e-05, "loss": 0.0913, "step": 3883 }, { "epoch": 2.723702664796634, "grad_norm": 0.2709829807281494, "learning_rate": 2.1798349152548796e-05, "loss": 0.2507, "step": 3884 }, { "epoch": 2.7244039270687237, "grad_norm": 0.27014636993408203, "learning_rate": 2.1787332058605825e-05, "loss": 0.2525, "step": 3885 }, { "epoch": 2.7251051893408134, "grad_norm": 0.25596389174461365, "learning_rate": 2.177631559900965e-05, "loss": 0.0893, "step": 3886 }, { "epoch": 2.725806451612903, "grad_norm": 0.2823891341686249, "learning_rate": 2.1765299775935492e-05, "loss": 0.0892, "step": 3887 }, { "epoch": 2.726507713884993, "grad_norm": 1.2724095582962036, "learning_rate": 2.175428459155844e-05, "loss": 0.2802, "step": 3888 }, { "epoch": 2.727208976157083, "grad_norm": 0.26418960094451904, "learning_rate": 2.1743270048053463e-05, "loss": 0.2511, "step": 3889 }, { "epoch": 2.7279102384291725, "grad_norm": 0.7537374496459961, "learning_rate": 2.17322561475954e-05, "loss": 0.4149, "step": 3890 }, { "epoch": 2.728611500701262, "grad_norm": 0.2639913260936737, "learning_rate": 2.1721242892358977e-05, "loss": 0.2511, "step": 3891 }, { "epoch": 2.7293127629733522, "grad_norm": 0.2467757910490036, "learning_rate": 2.1710230284518756e-05, "loss": 0.0916, "step": 3892 }, { "epoch": 2.730014025245442, "grad_norm": 0.24498388171195984, "learning_rate": 2.169921832624921e-05, "loss": 0.0912, "step": 3893 }, { "epoch": 2.7307152875175316, "grad_norm": 0.27944621443748474, "learning_rate": 2.168820701972467e-05, "loss": 0.2547, "step": 3894 }, { "epoch": 2.7314165497896212, "grad_norm": 0.2731429636478424, "learning_rate": 2.1677196367119328e-05, "loss": 0.2553, "step": 3895 }, { "epoch": 2.732117812061711, "grad_norm": 0.2691192924976349, "learning_rate": 2.1666186370607266e-05, "loss": 0.2533, "step": 3896 }, { "epoch": 2.732819074333801, "grad_norm": 0.2658100724220276, "learning_rate": 2.165517703236242e-05, "loss": 0.0927, "step": 3897 }, { "epoch": 2.7335203366058907, "grad_norm": 0.24691185355186462, "learning_rate": 2.164416835455862e-05, "loss": 0.0917, "step": 3898 }, { "epoch": 2.7342215988779803, "grad_norm": 0.25932934880256653, "learning_rate": 2.1633160339369522e-05, "loss": 0.0909, "step": 3899 }, { "epoch": 2.73492286115007, "grad_norm": 0.26697060465812683, "learning_rate": 2.1622152988968693e-05, "loss": 0.2515, "step": 3900 }, { "epoch": 2.73562412342216, "grad_norm": 0.283431738615036, "learning_rate": 2.1611146305529547e-05, "loss": 0.0904, "step": 3901 }, { "epoch": 2.7363253856942498, "grad_norm": 0.26792988181114197, "learning_rate": 2.1600140291225377e-05, "loss": 0.2536, "step": 3902 }, { "epoch": 2.7370266479663394, "grad_norm": 0.744951605796814, "learning_rate": 2.1589134948229335e-05, "loss": 0.4102, "step": 3903 }, { "epoch": 2.737727910238429, "grad_norm": 0.32294225692749023, "learning_rate": 2.157813027871445e-05, "loss": 0.0909, "step": 3904 }, { "epoch": 2.7384291725105188, "grad_norm": 0.26304715871810913, "learning_rate": 2.1567126284853616e-05, "loss": 0.0918, "step": 3905 }, { "epoch": 2.7391304347826084, "grad_norm": 0.7392199635505676, "learning_rate": 2.155612296881958e-05, "loss": 0.4089, "step": 3906 }, { "epoch": 2.7398316970546985, "grad_norm": 0.26625025272369385, "learning_rate": 2.1545120332784974e-05, "loss": 0.2529, "step": 3907 }, { "epoch": 2.740532959326788, "grad_norm": 0.26407507061958313, "learning_rate": 2.153411837892228e-05, "loss": 0.0922, "step": 3908 }, { "epoch": 2.741234221598878, "grad_norm": 0.249765545129776, "learning_rate": 2.152311710940386e-05, "loss": 0.0926, "step": 3909 }, { "epoch": 2.741935483870968, "grad_norm": 0.2801600992679596, "learning_rate": 2.1512116526401928e-05, "loss": 0.2504, "step": 3910 }, { "epoch": 2.7426367461430576, "grad_norm": 0.269108384847641, "learning_rate": 2.1501116632088573e-05, "loss": 0.0938, "step": 3911 }, { "epoch": 2.7433380084151473, "grad_norm": 0.24794942140579224, "learning_rate": 2.1490117428635737e-05, "loss": 0.0922, "step": 3912 }, { "epoch": 2.744039270687237, "grad_norm": 0.754015326499939, "learning_rate": 2.1479118918215236e-05, "loss": 0.4116, "step": 3913 }, { "epoch": 2.7447405329593266, "grad_norm": 0.28980371356010437, "learning_rate": 2.1468121102998738e-05, "loss": 0.0913, "step": 3914 }, { "epoch": 2.7454417952314163, "grad_norm": 0.26845303177833557, "learning_rate": 2.145712398515779e-05, "loss": 0.2481, "step": 3915 }, { "epoch": 2.7461430575035064, "grad_norm": 0.24921460449695587, "learning_rate": 2.144612756686379e-05, "loss": 0.0921, "step": 3916 }, { "epoch": 2.746844319775596, "grad_norm": 0.27374589443206787, "learning_rate": 2.1435131850288002e-05, "loss": 0.2509, "step": 3917 }, { "epoch": 2.7475455820476857, "grad_norm": 0.2636300027370453, "learning_rate": 2.1424136837601543e-05, "loss": 0.0915, "step": 3918 }, { "epoch": 2.748246844319776, "grad_norm": 0.26041337847709656, "learning_rate": 2.1413142530975395e-05, "loss": 0.0906, "step": 3919 }, { "epoch": 2.7489481065918655, "grad_norm": 0.2506866455078125, "learning_rate": 2.1402148932580403e-05, "loss": 0.0926, "step": 3920 }, { "epoch": 2.749649368863955, "grad_norm": 1.4262069463729858, "learning_rate": 2.1391156044587278e-05, "loss": 0.4314, "step": 3921 }, { "epoch": 2.750350631136045, "grad_norm": 1.3382848501205444, "learning_rate": 2.138016386916658e-05, "loss": 0.287, "step": 3922 }, { "epoch": 2.7510518934081345, "grad_norm": 0.2645423412322998, "learning_rate": 2.136917240848874e-05, "loss": 0.252, "step": 3923 }, { "epoch": 2.751753155680224, "grad_norm": 0.25243550539016724, "learning_rate": 2.135818166472403e-05, "loss": 0.0933, "step": 3924 }, { "epoch": 2.7524544179523143, "grad_norm": 0.26064932346343994, "learning_rate": 2.1347191640042608e-05, "loss": 0.2483, "step": 3925 }, { "epoch": 2.753155680224404, "grad_norm": 0.2917742431163788, "learning_rate": 2.1336202336614454e-05, "loss": 0.0926, "step": 3926 }, { "epoch": 2.7538569424964936, "grad_norm": 0.272655725479126, "learning_rate": 2.132521375660943e-05, "loss": 0.0942, "step": 3927 }, { "epoch": 2.7545582047685837, "grad_norm": 0.26567935943603516, "learning_rate": 2.1314225902197254e-05, "loss": 0.093, "step": 3928 }, { "epoch": 2.7552594670406734, "grad_norm": 0.2653738260269165, "learning_rate": 2.1303238775547497e-05, "loss": 0.092, "step": 3929 }, { "epoch": 2.755960729312763, "grad_norm": 0.2637764811515808, "learning_rate": 2.129225237882958e-05, "loss": 0.2527, "step": 3930 }, { "epoch": 2.7566619915848527, "grad_norm": 0.3614460825920105, "learning_rate": 2.1281266714212793e-05, "loss": 0.0906, "step": 3931 }, { "epoch": 2.7573632538569424, "grad_norm": 0.2638106942176819, "learning_rate": 2.1270281783866282e-05, "loss": 0.2519, "step": 3932 }, { "epoch": 2.758064516129032, "grad_norm": 0.2561452090740204, "learning_rate": 2.125929758995902e-05, "loss": 0.2503, "step": 3933 }, { "epoch": 2.758765778401122, "grad_norm": 0.26872360706329346, "learning_rate": 2.1248314134659864e-05, "loss": 0.0928, "step": 3934 }, { "epoch": 2.759467040673212, "grad_norm": 0.2516775131225586, "learning_rate": 2.123733142013752e-05, "loss": 0.0933, "step": 3935 }, { "epoch": 2.7601683029453015, "grad_norm": 0.267879456281662, "learning_rate": 2.1226349448560535e-05, "loss": 0.0928, "step": 3936 }, { "epoch": 2.7608695652173916, "grad_norm": 0.2658371329307556, "learning_rate": 2.121536822209733e-05, "loss": 0.0926, "step": 3937 }, { "epoch": 2.7615708274894812, "grad_norm": 0.24792402982711792, "learning_rate": 2.1204387742916156e-05, "loss": 0.0924, "step": 3938 }, { "epoch": 2.762272089761571, "grad_norm": 0.2686751186847687, "learning_rate": 2.1193408013185147e-05, "loss": 0.2521, "step": 3939 }, { "epoch": 2.7629733520336606, "grad_norm": 0.2644544839859009, "learning_rate": 2.118242903507224e-05, "loss": 0.0915, "step": 3940 }, { "epoch": 2.7636746143057502, "grad_norm": 0.283626914024353, "learning_rate": 2.117145081074527e-05, "loss": 0.09, "step": 3941 }, { "epoch": 2.76437587657784, "grad_norm": 0.25877952575683594, "learning_rate": 2.1160473342371904e-05, "loss": 0.0908, "step": 3942 }, { "epoch": 2.76507713884993, "grad_norm": 0.26373806595802307, "learning_rate": 2.1149496632119666e-05, "loss": 0.2493, "step": 3943 }, { "epoch": 2.7657784011220197, "grad_norm": 1.0421513319015503, "learning_rate": 2.1138520682155916e-05, "loss": 0.2635, "step": 3944 }, { "epoch": 2.7664796633941093, "grad_norm": 0.26271820068359375, "learning_rate": 2.1127545494647882e-05, "loss": 0.0909, "step": 3945 }, { "epoch": 2.767180925666199, "grad_norm": 0.2680881917476654, "learning_rate": 2.1116571071762645e-05, "loss": 0.2521, "step": 3946 }, { "epoch": 2.767882187938289, "grad_norm": 0.26499679684638977, "learning_rate": 2.11055974156671e-05, "loss": 0.2521, "step": 3947 }, { "epoch": 2.7685834502103788, "grad_norm": 0.7507076263427734, "learning_rate": 2.1094624528528022e-05, "loss": 0.4137, "step": 3948 }, { "epoch": 2.7692847124824684, "grad_norm": 0.28920456767082214, "learning_rate": 2.108365241251203e-05, "loss": 0.091, "step": 3949 }, { "epoch": 2.769985974754558, "grad_norm": 0.7526519894599915, "learning_rate": 2.1072681069785588e-05, "loss": 0.4127, "step": 3950 }, { "epoch": 2.7706872370266478, "grad_norm": 0.24853579699993134, "learning_rate": 2.1061710502515004e-05, "loss": 0.0921, "step": 3951 }, { "epoch": 2.771388499298738, "grad_norm": 0.28469526767730713, "learning_rate": 2.1050740712866436e-05, "loss": 0.0909, "step": 3952 }, { "epoch": 2.7720897615708275, "grad_norm": 0.2744910717010498, "learning_rate": 2.1039771703005896e-05, "loss": 0.2489, "step": 3953 }, { "epoch": 2.772791023842917, "grad_norm": 0.24904930591583252, "learning_rate": 2.1028803475099217e-05, "loss": 0.0926, "step": 3954 }, { "epoch": 2.773492286115007, "grad_norm": 0.26635855436325073, "learning_rate": 2.1017836031312103e-05, "loss": 0.0925, "step": 3955 }, { "epoch": 2.774193548387097, "grad_norm": 0.2714531719684601, "learning_rate": 2.1006869373810106e-05, "loss": 0.2485, "step": 3956 }, { "epoch": 2.7748948106591866, "grad_norm": 0.2649597227573395, "learning_rate": 2.099590350475859e-05, "loss": 0.252, "step": 3957 }, { "epoch": 2.7755960729312763, "grad_norm": 0.2644709050655365, "learning_rate": 2.0984938426322796e-05, "loss": 0.2522, "step": 3958 }, { "epoch": 2.776297335203366, "grad_norm": 0.26180949807167053, "learning_rate": 2.0973974140667793e-05, "loss": 0.0913, "step": 3959 }, { "epoch": 2.7769985974754556, "grad_norm": 1.2826757431030273, "learning_rate": 2.0963010649958507e-05, "loss": 0.2461, "step": 3960 }, { "epoch": 2.7776998597475457, "grad_norm": 0.25081780552864075, "learning_rate": 2.0952047956359694e-05, "loss": 0.093, "step": 3961 }, { "epoch": 2.7784011220196354, "grad_norm": 0.29662850499153137, "learning_rate": 2.094108606203595e-05, "loss": 0.0925, "step": 3962 }, { "epoch": 2.779102384291725, "grad_norm": 0.3151998817920685, "learning_rate": 2.0930124969151724e-05, "loss": 0.0895, "step": 3963 }, { "epoch": 2.7798036465638147, "grad_norm": 0.7351003885269165, "learning_rate": 2.0919164679871306e-05, "loss": 0.4074, "step": 3964 }, { "epoch": 2.780504908835905, "grad_norm": 0.26873868703842163, "learning_rate": 2.090820519635882e-05, "loss": 0.2524, "step": 3965 }, { "epoch": 2.7812061711079945, "grad_norm": 0.24905185401439667, "learning_rate": 2.089724652077824e-05, "loss": 0.0925, "step": 3966 }, { "epoch": 2.781907433380084, "grad_norm": 0.26042047142982483, "learning_rate": 2.0886288655293373e-05, "loss": 0.2491, "step": 3967 }, { "epoch": 2.782608695652174, "grad_norm": 0.24978308379650116, "learning_rate": 2.0875331602067863e-05, "loss": 0.0927, "step": 3968 }, { "epoch": 2.7833099579242635, "grad_norm": 0.31144994497299194, "learning_rate": 2.0864375363265202e-05, "loss": 0.2518, "step": 3969 }, { "epoch": 2.784011220196353, "grad_norm": 1.3678661584854126, "learning_rate": 2.0853419941048717e-05, "loss": 0.4238, "step": 3970 }, { "epoch": 2.7847124824684433, "grad_norm": 0.2935931980609894, "learning_rate": 2.084246533758158e-05, "loss": 0.0915, "step": 3971 }, { "epoch": 2.785413744740533, "grad_norm": 0.25991740822792053, "learning_rate": 2.083151155502679e-05, "loss": 0.2471, "step": 3972 }, { "epoch": 2.7861150070126226, "grad_norm": 0.25186359882354736, "learning_rate": 2.0820558595547192e-05, "loss": 0.0933, "step": 3973 }, { "epoch": 2.7868162692847127, "grad_norm": 0.26792627573013306, "learning_rate": 2.080960646130548e-05, "loss": 0.0929, "step": 3974 }, { "epoch": 2.7875175315568024, "grad_norm": 0.2818504571914673, "learning_rate": 2.0798655154464154e-05, "loss": 0.2515, "step": 3975 }, { "epoch": 2.788218793828892, "grad_norm": 0.26695504784584045, "learning_rate": 2.0787704677185567e-05, "loss": 0.0929, "step": 3976 }, { "epoch": 2.7889200561009817, "grad_norm": 0.26276275515556335, "learning_rate": 2.077675503163192e-05, "loss": 0.2497, "step": 3977 }, { "epoch": 2.7896213183730714, "grad_norm": 1.013921856880188, "learning_rate": 2.0765806219965234e-05, "loss": 0.2518, "step": 3978 }, { "epoch": 2.790322580645161, "grad_norm": 1.3236503601074219, "learning_rate": 2.075485824434737e-05, "loss": 0.4096, "step": 3979 }, { "epoch": 2.791023842917251, "grad_norm": 0.2590242028236389, "learning_rate": 2.0743911106940034e-05, "loss": 0.2478, "step": 3980 }, { "epoch": 2.791725105189341, "grad_norm": 0.2696842849254608, "learning_rate": 2.0732964809904757e-05, "loss": 0.094, "step": 3981 }, { "epoch": 2.7924263674614305, "grad_norm": 0.2699885368347168, "learning_rate": 2.0722019355402883e-05, "loss": 0.0942, "step": 3982 }, { "epoch": 2.7931276297335206, "grad_norm": 0.2811874449253082, "learning_rate": 2.071107474559563e-05, "loss": 0.0957, "step": 3983 }, { "epoch": 2.7938288920056102, "grad_norm": 0.25452920794487, "learning_rate": 2.0700130982644024e-05, "loss": 0.0943, "step": 3984 }, { "epoch": 2.7945301542777, "grad_norm": 1.2991944551467896, "learning_rate": 2.0689188068708926e-05, "loss": 0.4117, "step": 3985 }, { "epoch": 2.7952314165497896, "grad_norm": 0.2560673952102661, "learning_rate": 2.067824600595104e-05, "loss": 0.2521, "step": 3986 }, { "epoch": 2.7959326788218792, "grad_norm": 0.2764321267604828, "learning_rate": 2.0667304796530897e-05, "loss": 0.0968, "step": 3987 }, { "epoch": 2.796633941093969, "grad_norm": 0.27811720967292786, "learning_rate": 2.065636444260886e-05, "loss": 0.0958, "step": 3988 }, { "epoch": 2.797335203366059, "grad_norm": 1.1112098693847656, "learning_rate": 2.06454249463451e-05, "loss": 0.3959, "step": 3989 }, { "epoch": 2.7980364656381487, "grad_norm": 0.2549790143966675, "learning_rate": 2.0634486309899657e-05, "loss": 0.0948, "step": 3990 }, { "epoch": 2.7987377279102383, "grad_norm": 0.27838921546936035, "learning_rate": 2.0623548535432384e-05, "loss": 0.0965, "step": 3991 }, { "epoch": 2.7994389901823284, "grad_norm": 0.31273922324180603, "learning_rate": 2.0612611625102953e-05, "loss": 0.0989, "step": 3992 }, { "epoch": 2.800140252454418, "grad_norm": 0.2844776511192322, "learning_rate": 2.0601675581070885e-05, "loss": 0.0979, "step": 3993 }, { "epoch": 2.8008415147265078, "grad_norm": 0.2841992974281311, "learning_rate": 2.0590740405495514e-05, "loss": 0.0978, "step": 3994 }, { "epoch": 2.8015427769985974, "grad_norm": 0.256233811378479, "learning_rate": 2.0579806100536027e-05, "loss": 0.0951, "step": 3995 }, { "epoch": 2.802244039270687, "grad_norm": 0.28123629093170166, "learning_rate": 2.0568872668351397e-05, "loss": 0.0977, "step": 3996 }, { "epoch": 2.8029453015427768, "grad_norm": 0.7075570225715637, "learning_rate": 2.0557940111100454e-05, "loss": 0.4132, "step": 3997 }, { "epoch": 2.803646563814867, "grad_norm": 0.25557997822761536, "learning_rate": 2.054700843094186e-05, "loss": 0.2476, "step": 3998 }, { "epoch": 2.8043478260869565, "grad_norm": 0.29049235582351685, "learning_rate": 2.0536077630034086e-05, "loss": 0.0979, "step": 3999 }, { "epoch": 2.805049088359046, "grad_norm": 0.2501545548439026, "learning_rate": 2.052514771053544e-05, "loss": 0.2491, "step": 4000 }, { "epoch": 2.8057503506311363, "grad_norm": 0.707608699798584, "learning_rate": 2.0514218674604056e-05, "loss": 0.4028, "step": 4001 }, { "epoch": 2.806451612903226, "grad_norm": 0.32168760895729065, "learning_rate": 2.0503290524397894e-05, "loss": 0.1006, "step": 4002 }, { "epoch": 2.8071528751753156, "grad_norm": 0.2497447282075882, "learning_rate": 2.0492363262074727e-05, "loss": 0.2497, "step": 4003 }, { "epoch": 2.8078541374474053, "grad_norm": 0.27736249566078186, "learning_rate": 2.0481436889792165e-05, "loss": 0.0964, "step": 4004 }, { "epoch": 2.808555399719495, "grad_norm": 0.30938899517059326, "learning_rate": 2.0470511409707638e-05, "loss": 0.0988, "step": 4005 }, { "epoch": 2.8092566619915846, "grad_norm": 0.2524777352809906, "learning_rate": 2.0459586823978403e-05, "loss": 0.2524, "step": 4006 }, { "epoch": 2.8099579242636747, "grad_norm": 0.282277375459671, "learning_rate": 2.0448663134761534e-05, "loss": 0.0981, "step": 4007 }, { "epoch": 2.8106591865357644, "grad_norm": 0.2737281620502472, "learning_rate": 2.0437740344213937e-05, "loss": 0.0954, "step": 4008 }, { "epoch": 2.811360448807854, "grad_norm": 0.26293250918388367, "learning_rate": 2.042681845449234e-05, "loss": 0.2519, "step": 4009 }, { "epoch": 2.8120617110799437, "grad_norm": 0.3121945858001709, "learning_rate": 2.0415897467753275e-05, "loss": 0.0994, "step": 4010 }, { "epoch": 2.812762973352034, "grad_norm": 1.1567397117614746, "learning_rate": 2.0404977386153124e-05, "loss": 0.3865, "step": 4011 }, { "epoch": 2.8134642356241235, "grad_norm": 0.2535926103591919, "learning_rate": 2.0394058211848063e-05, "loss": 0.2479, "step": 4012 }, { "epoch": 2.814165497896213, "grad_norm": 0.25513744354248047, "learning_rate": 2.0383139946994112e-05, "loss": 0.0949, "step": 4013 }, { "epoch": 2.814866760168303, "grad_norm": 0.2533455193042755, "learning_rate": 2.0372222593747095e-05, "loss": 0.2496, "step": 4014 }, { "epoch": 2.8155680224403925, "grad_norm": 0.31855857372283936, "learning_rate": 2.0361306154262668e-05, "loss": 0.0987, "step": 4015 }, { "epoch": 2.8162692847124826, "grad_norm": 0.3142879009246826, "learning_rate": 2.03503906306963e-05, "loss": 0.0987, "step": 4016 }, { "epoch": 2.8169705469845723, "grad_norm": 0.3048308789730072, "learning_rate": 2.033947602520327e-05, "loss": 0.0985, "step": 4017 }, { "epoch": 2.817671809256662, "grad_norm": 0.3153282105922699, "learning_rate": 2.0328562339938696e-05, "loss": 0.0989, "step": 4018 }, { "epoch": 2.8183730715287516, "grad_norm": 0.25462615489959717, "learning_rate": 2.03176495770575e-05, "loss": 0.2483, "step": 4019 }, { "epoch": 2.8190743338008417, "grad_norm": 0.25325971841812134, "learning_rate": 2.0306737738714423e-05, "loss": 0.2503, "step": 4020 }, { "epoch": 2.8197755960729314, "grad_norm": 0.32417863607406616, "learning_rate": 2.0295826827064028e-05, "loss": 0.0995, "step": 4021 }, { "epoch": 2.820476858345021, "grad_norm": 0.25349563360214233, "learning_rate": 2.02849168442607e-05, "loss": 0.0944, "step": 4022 }, { "epoch": 2.8211781206171107, "grad_norm": 0.2528145909309387, "learning_rate": 2.0274007792458637e-05, "loss": 0.0944, "step": 4023 }, { "epoch": 2.8218793828892004, "grad_norm": 0.2609696686267853, "learning_rate": 2.026309967381183e-05, "loss": 0.2509, "step": 4024 }, { "epoch": 2.8225806451612905, "grad_norm": 0.2563035786151886, "learning_rate": 2.0252192490474113e-05, "loss": 0.2521, "step": 4025 }, { "epoch": 2.82328190743338, "grad_norm": 0.25315600633621216, "learning_rate": 2.0241286244599138e-05, "loss": 0.0942, "step": 4026 }, { "epoch": 2.82398316970547, "grad_norm": 0.2570129930973053, "learning_rate": 2.0230380938340355e-05, "loss": 0.2472, "step": 4027 }, { "epoch": 2.8246844319775595, "grad_norm": 0.26317986845970154, "learning_rate": 2.0219476573851035e-05, "loss": 0.2543, "step": 4028 }, { "epoch": 2.8253856942496496, "grad_norm": 0.860576868057251, "learning_rate": 2.0208573153284267e-05, "loss": 0.2417, "step": 4029 }, { "epoch": 2.8260869565217392, "grad_norm": 0.28323429822921753, "learning_rate": 2.019767067879296e-05, "loss": 0.0962, "step": 4030 }, { "epoch": 2.826788218793829, "grad_norm": 0.2568010091781616, "learning_rate": 2.0186769152529797e-05, "loss": 0.2499, "step": 4031 }, { "epoch": 2.8274894810659186, "grad_norm": 0.2543623447418213, "learning_rate": 2.0175868576647325e-05, "loss": 0.0945, "step": 4032 }, { "epoch": 2.8281907433380082, "grad_norm": 0.26196935772895813, "learning_rate": 2.016496895329788e-05, "loss": 0.25, "step": 4033 }, { "epoch": 2.828892005610098, "grad_norm": 0.27165189385414124, "learning_rate": 2.0154070284633603e-05, "loss": 0.095, "step": 4034 }, { "epoch": 2.829593267882188, "grad_norm": 0.9724327325820923, "learning_rate": 2.0143172572806458e-05, "loss": 0.2535, "step": 4035 }, { "epoch": 2.8302945301542777, "grad_norm": 0.25307318568229675, "learning_rate": 2.0132275819968224e-05, "loss": 0.0942, "step": 4036 }, { "epoch": 2.8309957924263673, "grad_norm": 0.2684568762779236, "learning_rate": 2.012138002827049e-05, "loss": 0.2549, "step": 4037 }, { "epoch": 2.8316970546984574, "grad_norm": 0.2741326093673706, "learning_rate": 2.0110485199864623e-05, "loss": 0.0958, "step": 4038 }, { "epoch": 2.832398316970547, "grad_norm": 0.24667352437973022, "learning_rate": 2.009959133690184e-05, "loss": 0.2488, "step": 4039 }, { "epoch": 2.833099579242637, "grad_norm": 0.27513906359672546, "learning_rate": 2.0088698441533153e-05, "loss": 0.0964, "step": 4040 }, { "epoch": 2.8338008415147264, "grad_norm": 0.2874162793159485, "learning_rate": 2.0077806515909383e-05, "loss": 0.0969, "step": 4041 }, { "epoch": 2.834502103786816, "grad_norm": 0.28782203793525696, "learning_rate": 2.006691556218116e-05, "loss": 0.0983, "step": 4042 }, { "epoch": 2.8352033660589058, "grad_norm": 0.30778709053993225, "learning_rate": 2.005602558249893e-05, "loss": 0.098, "step": 4043 }, { "epoch": 2.835904628330996, "grad_norm": 0.2553997337818146, "learning_rate": 2.004513657901292e-05, "loss": 0.2502, "step": 4044 }, { "epoch": 2.8366058906030855, "grad_norm": 0.2576960623264313, "learning_rate": 2.0034248553873187e-05, "loss": 0.2494, "step": 4045 }, { "epoch": 2.837307152875175, "grad_norm": 0.27509650588035583, "learning_rate": 2.00233615092296e-05, "loss": 0.0963, "step": 4046 }, { "epoch": 2.8380084151472653, "grad_norm": 0.27605172991752625, "learning_rate": 2.0012475447231814e-05, "loss": 0.0967, "step": 4047 }, { "epoch": 2.838709677419355, "grad_norm": 0.2630045413970947, "learning_rate": 2.0001590370029305e-05, "loss": 0.2494, "step": 4048 }, { "epoch": 2.8394109396914446, "grad_norm": 0.2543158233165741, "learning_rate": 1.9990706279771355e-05, "loss": 0.0946, "step": 4049 }, { "epoch": 2.8401122019635343, "grad_norm": 0.27050071954727173, "learning_rate": 1.9979823178607042e-05, "loss": 0.0952, "step": 4050 }, { "epoch": 2.840813464235624, "grad_norm": 0.2509218156337738, "learning_rate": 1.996894106868525e-05, "loss": 0.0937, "step": 4051 }, { "epoch": 2.8415147265077136, "grad_norm": 0.25570639967918396, "learning_rate": 1.9958059952154675e-05, "loss": 0.251, "step": 4052 }, { "epoch": 2.8422159887798037, "grad_norm": 0.2504650354385376, "learning_rate": 1.9947179831163816e-05, "loss": 0.0936, "step": 4053 }, { "epoch": 2.8429172510518934, "grad_norm": 0.26521486043930054, "learning_rate": 1.9936300707860956e-05, "loss": 0.2545, "step": 4054 }, { "epoch": 2.843618513323983, "grad_norm": 0.25074851512908936, "learning_rate": 1.992542258439421e-05, "loss": 0.0936, "step": 4055 }, { "epoch": 2.844319775596073, "grad_norm": 0.25549569725990295, "learning_rate": 1.9914545462911478e-05, "loss": 0.2492, "step": 4056 }, { "epoch": 2.845021037868163, "grad_norm": 0.2687194049358368, "learning_rate": 1.9903669345560473e-05, "loss": 0.0944, "step": 4057 }, { "epoch": 2.8457223001402525, "grad_norm": 0.26031574606895447, "learning_rate": 1.9892794234488692e-05, "loss": 0.2549, "step": 4058 }, { "epoch": 2.846423562412342, "grad_norm": 0.2590359151363373, "learning_rate": 1.9881920131843445e-05, "loss": 0.2521, "step": 4059 }, { "epoch": 2.847124824684432, "grad_norm": 0.2486588954925537, "learning_rate": 1.987104703977185e-05, "loss": 0.0929, "step": 4060 }, { "epoch": 2.8478260869565215, "grad_norm": 0.252719521522522, "learning_rate": 1.9860174960420812e-05, "loss": 0.0939, "step": 4061 }, { "epoch": 2.8485273492286116, "grad_norm": 0.2511748969554901, "learning_rate": 1.9849303895937046e-05, "loss": 0.0936, "step": 4062 }, { "epoch": 2.8492286115007013, "grad_norm": 0.24993152916431427, "learning_rate": 1.9838433848467065e-05, "loss": 0.0931, "step": 4063 }, { "epoch": 2.849929873772791, "grad_norm": 0.30324357748031616, "learning_rate": 1.9827564820157172e-05, "loss": 0.0955, "step": 4064 }, { "epoch": 2.850631136044881, "grad_norm": 0.24758261442184448, "learning_rate": 1.9816696813153478e-05, "loss": 0.0923, "step": 4065 }, { "epoch": 2.8513323983169707, "grad_norm": 0.2679571211338043, "learning_rate": 1.980582982960188e-05, "loss": 0.0926, "step": 4066 }, { "epoch": 2.8520336605890604, "grad_norm": 0.24846217036247253, "learning_rate": 1.97949638716481e-05, "loss": 0.0928, "step": 4067 }, { "epoch": 2.85273492286115, "grad_norm": 0.2880379259586334, "learning_rate": 1.9784098941437622e-05, "loss": 0.0932, "step": 4068 }, { "epoch": 2.8534361851332397, "grad_norm": 0.263205349445343, "learning_rate": 1.977323504111576e-05, "loss": 0.0925, "step": 4069 }, { "epoch": 2.8541374474053294, "grad_norm": 0.32018378376960754, "learning_rate": 1.9762372172827602e-05, "loss": 0.0934, "step": 4070 }, { "epoch": 2.8548387096774195, "grad_norm": 0.2612738609313965, "learning_rate": 1.9751510338718053e-05, "loss": 0.0919, "step": 4071 }, { "epoch": 2.855539971949509, "grad_norm": 0.2629019320011139, "learning_rate": 1.9740649540931777e-05, "loss": 0.252, "step": 4072 }, { "epoch": 2.856241234221599, "grad_norm": 0.25616684556007385, "learning_rate": 1.972978978161327e-05, "loss": 0.0906, "step": 4073 }, { "epoch": 2.8569424964936885, "grad_norm": 0.2765290439128876, "learning_rate": 1.9718931062906816e-05, "loss": 0.2542, "step": 4074 }, { "epoch": 2.8576437587657786, "grad_norm": 0.24237757921218872, "learning_rate": 1.9708073386956476e-05, "loss": 0.0907, "step": 4075 }, { "epoch": 2.8583450210378682, "grad_norm": 0.24244120717048645, "learning_rate": 1.9697216755906126e-05, "loss": 0.0905, "step": 4076 }, { "epoch": 2.859046283309958, "grad_norm": 0.7572843432426453, "learning_rate": 1.968636117189942e-05, "loss": 0.42, "step": 4077 }, { "epoch": 2.8597475455820476, "grad_norm": 0.2683912217617035, "learning_rate": 1.9675506637079827e-05, "loss": 0.2502, "step": 4078 }, { "epoch": 2.8604488078541372, "grad_norm": 0.25265035033226013, "learning_rate": 1.966465315359057e-05, "loss": 0.0896, "step": 4079 }, { "epoch": 2.8611500701262274, "grad_norm": 0.24221491813659668, "learning_rate": 1.96538007235747e-05, "loss": 0.0905, "step": 4080 }, { "epoch": 2.861851332398317, "grad_norm": 0.2421107292175293, "learning_rate": 1.9642949349175043e-05, "loss": 0.0902, "step": 4081 }, { "epoch": 2.8625525946704067, "grad_norm": 0.24168796837329865, "learning_rate": 1.963209903253423e-05, "loss": 0.0905, "step": 4082 }, { "epoch": 2.8632538569424963, "grad_norm": 0.25296011567115784, "learning_rate": 1.9621249775794666e-05, "loss": 0.0892, "step": 4083 }, { "epoch": 2.8639551192145865, "grad_norm": 0.2704314589500427, "learning_rate": 1.9610401581098565e-05, "loss": 0.0884, "step": 4084 }, { "epoch": 2.864656381486676, "grad_norm": 1.0489152669906616, "learning_rate": 1.959955445058792e-05, "loss": 0.2733, "step": 4085 }, { "epoch": 2.865357643758766, "grad_norm": 0.2725112736225128, "learning_rate": 1.9588708386404504e-05, "loss": 0.0889, "step": 4086 }, { "epoch": 2.8660589060308554, "grad_norm": 1.2007662057876587, "learning_rate": 1.9577863390689898e-05, "loss": 0.2815, "step": 4087 }, { "epoch": 2.866760168302945, "grad_norm": 0.2793641686439514, "learning_rate": 1.956701946558547e-05, "loss": 0.252, "step": 4088 }, { "epoch": 2.867461430575035, "grad_norm": 1.0485461950302124, "learning_rate": 1.955617661323236e-05, "loss": 0.2689, "step": 4089 }, { "epoch": 2.868162692847125, "grad_norm": 0.28266459703445435, "learning_rate": 1.9545334835771516e-05, "loss": 0.2562, "step": 4090 }, { "epoch": 2.8688639551192145, "grad_norm": 0.2549438774585724, "learning_rate": 1.953449413534367e-05, "loss": 0.0906, "step": 4091 }, { "epoch": 2.869565217391304, "grad_norm": 0.25563308596611023, "learning_rate": 1.9523654514089336e-05, "loss": 0.0906, "step": 4092 }, { "epoch": 2.8702664796633943, "grad_norm": 0.2527416944503784, "learning_rate": 1.9512815974148803e-05, "loss": 0.0898, "step": 4093 }, { "epoch": 2.870967741935484, "grad_norm": 0.2736493647098541, "learning_rate": 1.9501978517662167e-05, "loss": 0.2538, "step": 4094 }, { "epoch": 2.8716690042075736, "grad_norm": 0.7476885318756104, "learning_rate": 1.9491142146769304e-05, "loss": 0.4199, "step": 4095 }, { "epoch": 2.8723702664796633, "grad_norm": 0.2665017545223236, "learning_rate": 1.948030686360987e-05, "loss": 0.2547, "step": 4096 }, { "epoch": 2.873071528751753, "grad_norm": 0.24118973314762115, "learning_rate": 1.9469472670323315e-05, "loss": 0.0904, "step": 4097 }, { "epoch": 2.8737727910238426, "grad_norm": 0.7477956414222717, "learning_rate": 1.945863956904887e-05, "loss": 0.4125, "step": 4098 }, { "epoch": 2.8744740532959328, "grad_norm": 0.24453730881214142, "learning_rate": 1.944780756192555e-05, "loss": 0.0914, "step": 4099 }, { "epoch": 2.8751753155680224, "grad_norm": 0.25974947214126587, "learning_rate": 1.9436976651092144e-05, "loss": 0.0921, "step": 4100 }, { "epoch": 2.875876577840112, "grad_norm": 0.2621353268623352, "learning_rate": 1.9426146838687236e-05, "loss": 0.2505, "step": 4101 }, { "epoch": 2.876577840112202, "grad_norm": 0.24264951050281525, "learning_rate": 1.9415318126849207e-05, "loss": 0.091, "step": 4102 }, { "epoch": 2.877279102384292, "grad_norm": 0.2431272566318512, "learning_rate": 1.9404490517716183e-05, "loss": 0.0912, "step": 4103 }, { "epoch": 2.8779803646563815, "grad_norm": 0.2656257450580597, "learning_rate": 1.9393664013426104e-05, "loss": 0.0922, "step": 4104 }, { "epoch": 2.878681626928471, "grad_norm": 0.26552581787109375, "learning_rate": 1.9382838616116683e-05, "loss": 0.2532, "step": 4105 }, { "epoch": 2.879382889200561, "grad_norm": 0.26935338973999023, "learning_rate": 1.937201432792542e-05, "loss": 0.253, "step": 4106 }, { "epoch": 2.8800841514726505, "grad_norm": 0.8319031000137329, "learning_rate": 1.936119115098957e-05, "loss": 0.2384, "step": 4107 }, { "epoch": 2.8807854137447406, "grad_norm": 0.7398943901062012, "learning_rate": 1.9350369087446203e-05, "loss": 0.4164, "step": 4108 }, { "epoch": 2.8814866760168303, "grad_norm": 0.2444543093442917, "learning_rate": 1.9339548139432145e-05, "loss": 0.0915, "step": 4109 }, { "epoch": 2.88218793828892, "grad_norm": 0.26387178897857666, "learning_rate": 1.932872830908402e-05, "loss": 0.0934, "step": 4110 }, { "epoch": 2.88288920056101, "grad_norm": 0.27118101716041565, "learning_rate": 1.9317909598538215e-05, "loss": 0.2558, "step": 4111 }, { "epoch": 2.8835904628330997, "grad_norm": 0.2721029818058014, "learning_rate": 1.9307092009930914e-05, "loss": 0.2558, "step": 4112 }, { "epoch": 2.8842917251051894, "grad_norm": 0.8375665545463562, "learning_rate": 1.9296275545398055e-05, "loss": 0.226, "step": 4113 }, { "epoch": 2.884992987377279, "grad_norm": 0.2449479103088379, "learning_rate": 1.9285460207075372e-05, "loss": 0.0919, "step": 4114 }, { "epoch": 2.8856942496493687, "grad_norm": 0.2468106597661972, "learning_rate": 1.9274645997098373e-05, "loss": 0.0924, "step": 4115 }, { "epoch": 2.8863955119214584, "grad_norm": 0.2614741027355194, "learning_rate": 1.9263832917602343e-05, "loss": 0.2523, "step": 4116 }, { "epoch": 2.8870967741935485, "grad_norm": 0.30592912435531616, "learning_rate": 1.9253020970722338e-05, "loss": 0.0967, "step": 4117 }, { "epoch": 2.887798036465638, "grad_norm": 0.2747158110141754, "learning_rate": 1.92422101585932e-05, "loss": 0.2541, "step": 4118 }, { "epoch": 2.888499298737728, "grad_norm": 0.7149889469146729, "learning_rate": 1.9231400483349543e-05, "loss": 0.2287, "step": 4119 }, { "epoch": 2.889200561009818, "grad_norm": 0.24598439037799835, "learning_rate": 1.9220591947125766e-05, "loss": 0.092, "step": 4120 }, { "epoch": 2.8899018232819076, "grad_norm": 0.24649357795715332, "learning_rate": 1.920978455205601e-05, "loss": 0.0922, "step": 4121 }, { "epoch": 2.8906030855539973, "grad_norm": 0.27074649930000305, "learning_rate": 1.9198978300274227e-05, "loss": 0.2549, "step": 4122 }, { "epoch": 2.891304347826087, "grad_norm": 0.7325629591941833, "learning_rate": 1.9188173193914126e-05, "loss": 0.4139, "step": 4123 }, { "epoch": 2.8920056100981766, "grad_norm": 0.3061792850494385, "learning_rate": 1.9177369235109196e-05, "loss": 0.099, "step": 4124 }, { "epoch": 2.8927068723702662, "grad_norm": 0.25907787680625916, "learning_rate": 1.91665664259927e-05, "loss": 0.2558, "step": 4125 }, { "epoch": 2.8934081346423564, "grad_norm": 0.24973133206367493, "learning_rate": 1.915576476869767e-05, "loss": 0.0932, "step": 4126 }, { "epoch": 2.894109396914446, "grad_norm": 0.24864839017391205, "learning_rate": 1.914496426535692e-05, "loss": 0.0929, "step": 4127 }, { "epoch": 2.8948106591865357, "grad_norm": 0.2701677083969116, "learning_rate": 1.9134164918103005e-05, "loss": 0.0952, "step": 4128 }, { "epoch": 2.895511921458626, "grad_norm": 0.2475201040506363, "learning_rate": 1.9123366729068286e-05, "loss": 0.0926, "step": 4129 }, { "epoch": 2.8962131837307155, "grad_norm": 0.25669994950294495, "learning_rate": 1.9112569700384892e-05, "loss": 0.2506, "step": 4130 }, { "epoch": 2.896914446002805, "grad_norm": 0.24747207760810852, "learning_rate": 1.9101773834184704e-05, "loss": 0.0928, "step": 4131 }, { "epoch": 2.897615708274895, "grad_norm": 0.2699098587036133, "learning_rate": 1.9090979132599393e-05, "loss": 0.0958, "step": 4132 }, { "epoch": 2.8983169705469845, "grad_norm": 0.24764864146709442, "learning_rate": 1.9080185597760387e-05, "loss": 0.0928, "step": 4133 }, { "epoch": 2.899018232819074, "grad_norm": 0.2753525376319885, "learning_rate": 1.9069393231798896e-05, "loss": 0.0949, "step": 4134 }, { "epoch": 2.8997194950911642, "grad_norm": 0.7448775768280029, "learning_rate": 1.9058602036845875e-05, "loss": 0.4115, "step": 4135 }, { "epoch": 2.900420757363254, "grad_norm": 0.848667323589325, "learning_rate": 1.904781201503207e-05, "loss": 0.2307, "step": 4136 }, { "epoch": 2.9011220196353436, "grad_norm": 0.24685904383659363, "learning_rate": 1.9037023168487993e-05, "loss": 0.0924, "step": 4137 }, { "epoch": 2.901823281907433, "grad_norm": 0.30867496132850647, "learning_rate": 1.902623549934392e-05, "loss": 0.0969, "step": 4138 }, { "epoch": 2.9025245441795233, "grad_norm": 0.2479078471660614, "learning_rate": 1.9015449009729893e-05, "loss": 0.0925, "step": 4139 }, { "epoch": 2.903225806451613, "grad_norm": 0.30979040265083313, "learning_rate": 1.9004663701775724e-05, "loss": 0.0998, "step": 4140 }, { "epoch": 2.9039270687237027, "grad_norm": 0.3499979078769684, "learning_rate": 1.8993879577611e-05, "loss": 0.1002, "step": 4141 }, { "epoch": 2.9046283309957923, "grad_norm": 0.24805963039398193, "learning_rate": 1.8983096639365046e-05, "loss": 0.0928, "step": 4142 }, { "epoch": 2.905329593267882, "grad_norm": 0.2617359459400177, "learning_rate": 1.897231488916698e-05, "loss": 0.2519, "step": 4143 }, { "epoch": 2.906030855539972, "grad_norm": 0.2761134207248688, "learning_rate": 1.8961534329145678e-05, "loss": 0.2556, "step": 4144 }, { "epoch": 2.9067321178120618, "grad_norm": 0.3769069314002991, "learning_rate": 1.8950754961429777e-05, "loss": 0.1013, "step": 4145 }, { "epoch": 2.9074333800841514, "grad_norm": 0.24517245590686798, "learning_rate": 1.893997678814769e-05, "loss": 0.0919, "step": 4146 }, { "epoch": 2.908134642356241, "grad_norm": 0.2983296811580658, "learning_rate": 1.8929199811427577e-05, "loss": 0.0973, "step": 4147 }, { "epoch": 2.908835904628331, "grad_norm": 0.26340538263320923, "learning_rate": 1.891842403339738e-05, "loss": 0.0936, "step": 4148 }, { "epoch": 2.909537166900421, "grad_norm": 0.2436319887638092, "learning_rate": 1.8907649456184785e-05, "loss": 0.0913, "step": 4149 }, { "epoch": 2.9102384291725105, "grad_norm": 0.26817673444747925, "learning_rate": 1.8896876081917257e-05, "loss": 0.2548, "step": 4150 }, { "epoch": 2.9109396914446, "grad_norm": 0.2601660192012787, "learning_rate": 1.888610391272201e-05, "loss": 0.2538, "step": 4151 }, { "epoch": 2.91164095371669, "grad_norm": 0.2670082449913025, "learning_rate": 1.8875332950726032e-05, "loss": 0.0942, "step": 4152 }, { "epoch": 2.91234221598878, "grad_norm": 0.284135639667511, "learning_rate": 1.8864563198056067e-05, "loss": 0.0934, "step": 4153 }, { "epoch": 2.9130434782608696, "grad_norm": 0.7487836480140686, "learning_rate": 1.8853794656838624e-05, "loss": 0.4135, "step": 4154 }, { "epoch": 2.9137447405329593, "grad_norm": 0.26287952065467834, "learning_rate": 1.8843027329199968e-05, "loss": 0.0928, "step": 4155 }, { "epoch": 2.914446002805049, "grad_norm": 0.26214399933815, "learning_rate": 1.883226121726613e-05, "loss": 0.0933, "step": 4156 }, { "epoch": 2.915147265077139, "grad_norm": 0.24236154556274414, "learning_rate": 1.8821496323162885e-05, "loss": 0.0908, "step": 4157 }, { "epoch": 2.9158485273492287, "grad_norm": 0.26493752002716064, "learning_rate": 1.881073264901579e-05, "loss": 0.0933, "step": 4158 }, { "epoch": 2.9165497896213184, "grad_norm": 0.7491867542266846, "learning_rate": 1.8799970196950154e-05, "loss": 0.4134, "step": 4159 }, { "epoch": 2.917251051893408, "grad_norm": 0.24199692904949188, "learning_rate": 1.8789208969091033e-05, "loss": 0.0907, "step": 4160 }, { "epoch": 2.9179523141654977, "grad_norm": 0.2639814615249634, "learning_rate": 1.877844896756326e-05, "loss": 0.2557, "step": 4161 }, { "epoch": 2.9186535764375874, "grad_norm": 0.841547966003418, "learning_rate": 1.8767690194491412e-05, "loss": 0.2344, "step": 4162 }, { "epoch": 2.9193548387096775, "grad_norm": 0.2434033453464508, "learning_rate": 1.8756932651999814e-05, "loss": 0.0911, "step": 4163 }, { "epoch": 2.920056100981767, "grad_norm": 0.2564419209957123, "learning_rate": 1.8746176342212574e-05, "loss": 0.0914, "step": 4164 }, { "epoch": 2.920757363253857, "grad_norm": 0.2695825695991516, "learning_rate": 1.8735421267253546e-05, "loss": 0.257, "step": 4165 }, { "epoch": 2.921458625525947, "grad_norm": 1.133716106414795, "learning_rate": 1.872466742924633e-05, "loss": 0.397, "step": 4166 }, { "epoch": 2.9221598877980366, "grad_norm": 0.24250568449497223, "learning_rate": 1.871391483031429e-05, "loss": 0.0907, "step": 4167 }, { "epoch": 2.9228611500701263, "grad_norm": 0.2661532461643219, "learning_rate": 1.8703163472580565e-05, "loss": 0.0941, "step": 4168 }, { "epoch": 2.923562412342216, "grad_norm": 0.29005134105682373, "learning_rate": 1.8692413358168e-05, "loss": 0.0947, "step": 4169 }, { "epoch": 2.9242636746143056, "grad_norm": 0.8526434302330017, "learning_rate": 1.8681664489199236e-05, "loss": 0.2338, "step": 4170 }, { "epoch": 2.9249649368863953, "grad_norm": 0.7462711334228516, "learning_rate": 1.8670916867796657e-05, "loss": 0.4149, "step": 4171 }, { "epoch": 2.9256661991584854, "grad_norm": 0.2443755865097046, "learning_rate": 1.8660170496082395e-05, "loss": 0.0908, "step": 4172 }, { "epoch": 2.926367461430575, "grad_norm": 0.24339276552200317, "learning_rate": 1.8649425376178346e-05, "loss": 0.0912, "step": 4173 }, { "epoch": 2.9270687237026647, "grad_norm": 0.2442454844713211, "learning_rate": 1.8638681510206145e-05, "loss": 0.0914, "step": 4174 }, { "epoch": 2.927769985974755, "grad_norm": 0.26278427243232727, "learning_rate": 1.8627938900287205e-05, "loss": 0.2528, "step": 4175 }, { "epoch": 2.9284712482468445, "grad_norm": 0.26563653349876404, "learning_rate": 1.8617197548542647e-05, "loss": 0.2521, "step": 4176 }, { "epoch": 2.929172510518934, "grad_norm": 0.2450980693101883, "learning_rate": 1.860645745709338e-05, "loss": 0.0917, "step": 4177 }, { "epoch": 2.929873772791024, "grad_norm": 0.2630435526371002, "learning_rate": 1.8595718628060055e-05, "loss": 0.2574, "step": 4178 }, { "epoch": 2.9305750350631135, "grad_norm": 0.2601494789123535, "learning_rate": 1.858498106356307e-05, "loss": 0.2517, "step": 4179 }, { "epoch": 2.931276297335203, "grad_norm": 0.2455204725265503, "learning_rate": 1.8574244765722575e-05, "loss": 0.0919, "step": 4180 }, { "epoch": 2.9319775596072932, "grad_norm": 0.259905070066452, "learning_rate": 1.856350973665848e-05, "loss": 0.2541, "step": 4181 }, { "epoch": 2.932678821879383, "grad_norm": 0.2641850709915161, "learning_rate": 1.8552775978490432e-05, "loss": 0.2561, "step": 4182 }, { "epoch": 2.9333800841514726, "grad_norm": 0.24583673477172852, "learning_rate": 1.8542043493337817e-05, "loss": 0.092, "step": 4183 }, { "epoch": 2.9340813464235627, "grad_norm": 0.2710319757461548, "learning_rate": 1.8531312283319788e-05, "loss": 0.0955, "step": 4184 }, { "epoch": 2.9347826086956523, "grad_norm": 0.26639166474342346, "learning_rate": 1.8520582350555245e-05, "loss": 0.0943, "step": 4185 }, { "epoch": 2.935483870967742, "grad_norm": 0.28046348690986633, "learning_rate": 1.8509853697162832e-05, "loss": 0.0952, "step": 4186 }, { "epoch": 2.9361851332398317, "grad_norm": 0.7250474095344543, "learning_rate": 1.8499126325260935e-05, "loss": 0.4141, "step": 4187 }, { "epoch": 2.9368863955119213, "grad_norm": 0.24525931477546692, "learning_rate": 1.84884002369677e-05, "loss": 0.0919, "step": 4188 }, { "epoch": 2.937587657784011, "grad_norm": 0.2578708827495575, "learning_rate": 1.8477675434401016e-05, "loss": 0.2504, "step": 4189 }, { "epoch": 2.938288920056101, "grad_norm": 0.25868991017341614, "learning_rate": 1.846695191967849e-05, "loss": 0.2541, "step": 4190 }, { "epoch": 2.9389901823281908, "grad_norm": 0.25635918974876404, "learning_rate": 1.8456229694917516e-05, "loss": 0.2551, "step": 4191 }, { "epoch": 2.9396914446002804, "grad_norm": 0.24645699560642242, "learning_rate": 1.8445508762235215e-05, "loss": 0.0919, "step": 4192 }, { "epoch": 2.9403927068723705, "grad_norm": 0.3012922406196594, "learning_rate": 1.8434789123748446e-05, "loss": 0.0979, "step": 4193 }, { "epoch": 2.94109396914446, "grad_norm": 0.2449636310338974, "learning_rate": 1.8424070781573828e-05, "loss": 0.0917, "step": 4194 }, { "epoch": 2.94179523141655, "grad_norm": 0.24453572928905487, "learning_rate": 1.8413353737827714e-05, "loss": 0.0918, "step": 4195 }, { "epoch": 2.9424964936886395, "grad_norm": 0.2744143605232239, "learning_rate": 1.8402637994626202e-05, "loss": 0.0934, "step": 4196 }, { "epoch": 2.943197755960729, "grad_norm": 0.7480837106704712, "learning_rate": 1.8391923554085138e-05, "loss": 0.2182, "step": 4197 }, { "epoch": 2.943899018232819, "grad_norm": 0.2643606960773468, "learning_rate": 1.8381210418320093e-05, "loss": 0.2532, "step": 4198 }, { "epoch": 2.944600280504909, "grad_norm": 0.265742689371109, "learning_rate": 1.8370498589446414e-05, "loss": 0.0944, "step": 4199 }, { "epoch": 2.9453015427769986, "grad_norm": 0.24411337077617645, "learning_rate": 1.8359788069579153e-05, "loss": 0.0917, "step": 4200 }, { "epoch": 2.9460028050490883, "grad_norm": 0.24374617636203766, "learning_rate": 1.8349078860833123e-05, "loss": 0.0911, "step": 4201 }, { "epoch": 2.946704067321178, "grad_norm": 0.26214462518692017, "learning_rate": 1.8338370965322886e-05, "loss": 0.0929, "step": 4202 }, { "epoch": 2.947405329593268, "grad_norm": 0.2582798898220062, "learning_rate": 1.832766438516273e-05, "loss": 0.2526, "step": 4203 }, { "epoch": 2.9481065918653577, "grad_norm": 0.24509383738040924, "learning_rate": 1.831695912246668e-05, "loss": 0.0917, "step": 4204 }, { "epoch": 2.9488078541374474, "grad_norm": 0.26802247762680054, "learning_rate": 1.8306255179348515e-05, "loss": 0.2572, "step": 4205 }, { "epoch": 2.949509116409537, "grad_norm": 0.2680884301662445, "learning_rate": 1.829555255792174e-05, "loss": 0.0941, "step": 4206 }, { "epoch": 2.9502103786816267, "grad_norm": 0.7447811961174011, "learning_rate": 1.828485126029962e-05, "loss": 0.4137, "step": 4207 }, { "epoch": 2.950911640953717, "grad_norm": 0.24267706274986267, "learning_rate": 1.827415128859513e-05, "loss": 0.0911, "step": 4208 }, { "epoch": 2.9516129032258065, "grad_norm": 0.26651638746261597, "learning_rate": 1.826345264492101e-05, "loss": 0.2516, "step": 4209 }, { "epoch": 2.952314165497896, "grad_norm": 0.29380810260772705, "learning_rate": 1.8252755331389727e-05, "loss": 0.0966, "step": 4210 }, { "epoch": 2.953015427769986, "grad_norm": 0.2623786926269531, "learning_rate": 1.824205935011347e-05, "loss": 0.0929, "step": 4211 }, { "epoch": 2.953716690042076, "grad_norm": 0.7553876638412476, "learning_rate": 1.823136470320418e-05, "loss": 0.207, "step": 4212 }, { "epoch": 2.9544179523141656, "grad_norm": 0.7174602150917053, "learning_rate": 1.8220671392773542e-05, "loss": 0.2117, "step": 4213 }, { "epoch": 2.9551192145862553, "grad_norm": 0.2864985167980194, "learning_rate": 1.8209979420932965e-05, "loss": 0.255, "step": 4214 }, { "epoch": 2.955820476858345, "grad_norm": 0.24412888288497925, "learning_rate": 1.8199288789793596e-05, "loss": 0.0913, "step": 4215 }, { "epoch": 2.9565217391304346, "grad_norm": 0.24390865862369537, "learning_rate": 1.8188599501466322e-05, "loss": 0.0913, "step": 4216 }, { "epoch": 2.9572230014025247, "grad_norm": 0.8003849983215332, "learning_rate": 1.817791155806177e-05, "loss": 0.2135, "step": 4217 }, { "epoch": 2.9579242636746144, "grad_norm": 0.24376751482486725, "learning_rate": 1.816722496169027e-05, "loss": 0.0916, "step": 4218 }, { "epoch": 2.958625525946704, "grad_norm": 0.27344244718551636, "learning_rate": 1.815653971446192e-05, "loss": 0.0959, "step": 4219 }, { "epoch": 2.9593267882187937, "grad_norm": 0.3075959086418152, "learning_rate": 1.8145855818486543e-05, "loss": 0.0984, "step": 4220 }, { "epoch": 2.960028050490884, "grad_norm": 0.269689679145813, "learning_rate": 1.813517327587369e-05, "loss": 0.261, "step": 4221 }, { "epoch": 2.9607293127629735, "grad_norm": 0.2806653380393982, "learning_rate": 1.8124492088732648e-05, "loss": 0.0975, "step": 4222 }, { "epoch": 2.961430575035063, "grad_norm": 0.3099628984928131, "learning_rate": 1.8113812259172436e-05, "loss": 0.0992, "step": 4223 }, { "epoch": 2.962131837307153, "grad_norm": 0.31926703453063965, "learning_rate": 1.8103133789301817e-05, "loss": 0.1006, "step": 4224 }, { "epoch": 2.9628330995792425, "grad_norm": 0.31351611018180847, "learning_rate": 1.8092456681229246e-05, "loss": 0.0996, "step": 4225 }, { "epoch": 2.963534361851332, "grad_norm": 1.8295462131500244, "learning_rate": 1.8081780937062957e-05, "loss": 0.677, "step": 4226 }, { "epoch": 2.9642356241234222, "grad_norm": 0.27614539861679077, "learning_rate": 1.8071106558910882e-05, "loss": 0.0964, "step": 4227 }, { "epoch": 2.964936886395512, "grad_norm": 0.27710476517677307, "learning_rate": 1.8060433548880705e-05, "loss": 0.0933, "step": 4228 }, { "epoch": 2.9656381486676016, "grad_norm": 0.2448335587978363, "learning_rate": 1.8049761909079826e-05, "loss": 0.0915, "step": 4229 }, { "epoch": 2.9663394109396917, "grad_norm": 0.26054978370666504, "learning_rate": 1.803909164161538e-05, "loss": 0.2516, "step": 4230 }, { "epoch": 2.9670406732117813, "grad_norm": 0.2443636655807495, "learning_rate": 1.8028422748594236e-05, "loss": 0.0908, "step": 4231 }, { "epoch": 2.967741935483871, "grad_norm": 0.92258220911026, "learning_rate": 1.8017755232122968e-05, "loss": 0.3744, "step": 4232 }, { "epoch": 2.9684431977559607, "grad_norm": 0.26759639382362366, "learning_rate": 1.800708909430791e-05, "loss": 0.2513, "step": 4233 }, { "epoch": 2.9691444600280503, "grad_norm": 1.4419395923614502, "learning_rate": 1.79964243372551e-05, "loss": 0.5257, "step": 4234 }, { "epoch": 2.96984572230014, "grad_norm": 0.24442869424819946, "learning_rate": 1.798576096307032e-05, "loss": 0.0913, "step": 4235 }, { "epoch": 2.97054698457223, "grad_norm": 0.2751627564430237, "learning_rate": 1.7975098973859063e-05, "loss": 0.2613, "step": 4236 }, { "epoch": 2.9712482468443198, "grad_norm": 0.2950488030910492, "learning_rate": 1.7964438371726566e-05, "loss": 0.0968, "step": 4237 }, { "epoch": 2.9719495091164094, "grad_norm": 0.3147618770599365, "learning_rate": 1.7953779158777793e-05, "loss": 0.0998, "step": 4238 }, { "epoch": 2.9726507713884995, "grad_norm": 0.33371660113334656, "learning_rate": 1.7943121337117397e-05, "loss": 0.1035, "step": 4239 }, { "epoch": 2.973352033660589, "grad_norm": 0.263618528842926, "learning_rate": 1.79324649088498e-05, "loss": 0.2578, "step": 4240 }, { "epoch": 2.974053295932679, "grad_norm": 0.24583126604557037, "learning_rate": 1.7921809876079126e-05, "loss": 0.0914, "step": 4241 }, { "epoch": 2.9747545582047685, "grad_norm": 0.24513454735279083, "learning_rate": 1.7911156240909233e-05, "loss": 0.0915, "step": 4242 }, { "epoch": 2.975455820476858, "grad_norm": 0.31392237544059753, "learning_rate": 1.7900504005443698e-05, "loss": 0.1002, "step": 4243 }, { "epoch": 2.976157082748948, "grad_norm": 0.2679433226585388, "learning_rate": 1.7889853171785825e-05, "loss": 0.2536, "step": 4244 }, { "epoch": 2.976858345021038, "grad_norm": 0.26925453543663025, "learning_rate": 1.7879203742038644e-05, "loss": 0.2579, "step": 4245 }, { "epoch": 2.9775596072931276, "grad_norm": 1.024696707725525, "learning_rate": 1.7868555718304892e-05, "loss": 0.3707, "step": 4246 }, { "epoch": 2.9782608695652173, "grad_norm": 0.26852333545684814, "learning_rate": 1.7857909102687046e-05, "loss": 0.2533, "step": 4247 }, { "epoch": 2.9789621318373074, "grad_norm": 0.2824082374572754, "learning_rate": 1.7847263897287303e-05, "loss": 0.0971, "step": 4248 }, { "epoch": 2.979663394109397, "grad_norm": 0.24602961540222168, "learning_rate": 1.7836620104207566e-05, "loss": 0.0919, "step": 4249 }, { "epoch": 2.9803646563814867, "grad_norm": 0.24288609623908997, "learning_rate": 1.7825977725549475e-05, "loss": 0.0903, "step": 4250 }, { "epoch": 2.9810659186535764, "grad_norm": 0.27873337268829346, "learning_rate": 1.781533676341439e-05, "loss": 0.0971, "step": 4251 }, { "epoch": 2.981767180925666, "grad_norm": 0.2455255687236786, "learning_rate": 1.7804697219903388e-05, "loss": 0.0919, "step": 4252 }, { "epoch": 2.9824684431977557, "grad_norm": 0.2647164463996887, "learning_rate": 1.7794059097117255e-05, "loss": 0.2538, "step": 4253 }, { "epoch": 2.983169705469846, "grad_norm": 0.280757874250412, "learning_rate": 1.7783422397156514e-05, "loss": 0.0969, "step": 4254 }, { "epoch": 2.9838709677419355, "grad_norm": 0.2636529207229614, "learning_rate": 1.7772787122121397e-05, "loss": 0.2586, "step": 4255 }, { "epoch": 2.984572230014025, "grad_norm": 0.24468250572681427, "learning_rate": 1.7762153274111858e-05, "loss": 0.0912, "step": 4256 }, { "epoch": 2.9852734922861153, "grad_norm": 0.2649300992488861, "learning_rate": 1.775152085522757e-05, "loss": 0.2525, "step": 4257 }, { "epoch": 2.985974754558205, "grad_norm": 0.6906221508979797, "learning_rate": 1.7740889867567928e-05, "loss": 0.1935, "step": 4258 }, { "epoch": 2.9866760168302946, "grad_norm": 0.28029584884643555, "learning_rate": 1.7730260313232034e-05, "loss": 0.0967, "step": 4259 }, { "epoch": 2.9873772791023843, "grad_norm": 0.29545900225639343, "learning_rate": 1.7719632194318702e-05, "loss": 0.2641, "step": 4260 }, { "epoch": 2.988078541374474, "grad_norm": 0.2779310643672943, "learning_rate": 1.770900551292648e-05, "loss": 0.0961, "step": 4261 }, { "epoch": 2.9887798036465636, "grad_norm": 0.28124725818634033, "learning_rate": 1.769838027115363e-05, "loss": 0.0976, "step": 4262 }, { "epoch": 2.9894810659186537, "grad_norm": 0.26159846782684326, "learning_rate": 1.7687756471098112e-05, "loss": 0.257, "step": 4263 }, { "epoch": 2.9901823281907434, "grad_norm": 0.28134509921073914, "learning_rate": 1.7677134114857624e-05, "loss": 0.0966, "step": 4264 }, { "epoch": 2.990883590462833, "grad_norm": 0.33729273080825806, "learning_rate": 1.7666513204529566e-05, "loss": 0.2664, "step": 4265 }, { "epoch": 2.9915848527349227, "grad_norm": 0.33129626512527466, "learning_rate": 1.7655893742211067e-05, "loss": 0.1029, "step": 4266 }, { "epoch": 2.992286115007013, "grad_norm": 0.2717721164226532, "learning_rate": 1.764527572999893e-05, "loss": 0.0954, "step": 4267 }, { "epoch": 2.9929873772791025, "grad_norm": 0.28518155217170715, "learning_rate": 1.7634659169989718e-05, "loss": 0.0978, "step": 4268 }, { "epoch": 2.993688639551192, "grad_norm": 0.28190889954566956, "learning_rate": 1.7624044064279683e-05, "loss": 0.0968, "step": 4269 }, { "epoch": 2.994389901823282, "grad_norm": 0.30127039551734924, "learning_rate": 1.76134304149648e-05, "loss": 0.2651, "step": 4270 }, { "epoch": 2.9950911640953715, "grad_norm": 0.3303273916244507, "learning_rate": 1.7602818224140753e-05, "loss": 0.0978, "step": 4271 }, { "epoch": 2.9957924263674616, "grad_norm": 0.31855764985084534, "learning_rate": 1.7592207493902933e-05, "loss": 0.099, "step": 4272 }, { "epoch": 2.9964936886395512, "grad_norm": 0.24201305210590363, "learning_rate": 1.7581598226346457e-05, "loss": 0.0903, "step": 4273 }, { "epoch": 2.997194950911641, "grad_norm": 0.30573570728302, "learning_rate": 1.757099042356613e-05, "loss": 0.097, "step": 4274 }, { "epoch": 2.9978962131837306, "grad_norm": 0.2856876254081726, "learning_rate": 1.7560384087656484e-05, "loss": 0.2609, "step": 4275 }, { "epoch": 2.9985974754558207, "grad_norm": 0.26795005798339844, "learning_rate": 1.754977922071176e-05, "loss": 0.2521, "step": 4276 }, { "epoch": 2.9992987377279103, "grad_norm": 0.24111761152744293, "learning_rate": 1.7539175824825903e-05, "loss": 0.0898, "step": 4277 }, { "epoch": 3.0, "grad_norm": 0.422321617603302, "learning_rate": 1.7528573902092582e-05, "loss": 0.2692, "step": 4278 }, { "epoch": 3.0, "eval_f1 (minor class)": 0.0, "eval_loss": 0.17381267249584198, "eval_roc_auc": 0.5290960716403008, "eval_runtime": 233.3499, "eval_samples_per_second": 5.434, "eval_steps_per_second": 1.358, "step": 4278 }, { "epoch": 3.0007012622720897, "grad_norm": 0.23914459347724915, "learning_rate": 1.7517973454605158e-05, "loss": 0.0894, "step": 4279 }, { "epoch": 3.0014025245441793, "grad_norm": 0.23880091309547424, "learning_rate": 1.7507374484456716e-05, "loss": 0.0891, "step": 4280 }, { "epoch": 3.0021037868162694, "grad_norm": 0.2624296545982361, "learning_rate": 1.749677699374003e-05, "loss": 0.0919, "step": 4281 }, { "epoch": 3.002805049088359, "grad_norm": 0.27512845396995544, "learning_rate": 1.748618098454759e-05, "loss": 0.2566, "step": 4282 }, { "epoch": 3.0035063113604488, "grad_norm": 0.8576096892356873, "learning_rate": 1.7475586458971603e-05, "loss": 0.2208, "step": 4283 }, { "epoch": 3.0042075736325384, "grad_norm": 0.2837718427181244, "learning_rate": 1.7464993419103975e-05, "loss": 0.2595, "step": 4284 }, { "epoch": 3.0049088359046285, "grad_norm": 0.27547144889831543, "learning_rate": 1.7454401867036324e-05, "loss": 0.2583, "step": 4285 }, { "epoch": 3.005610098176718, "grad_norm": 0.2625574767589569, "learning_rate": 1.7443811804859964e-05, "loss": 0.0916, "step": 4286 }, { "epoch": 3.006311360448808, "grad_norm": 0.23954923450946808, "learning_rate": 1.7433223234665932e-05, "loss": 0.0889, "step": 4287 }, { "epoch": 3.0070126227208975, "grad_norm": 0.2751530706882477, "learning_rate": 1.742263615854494e-05, "loss": 0.254, "step": 4288 }, { "epoch": 3.007713884992987, "grad_norm": 0.27900025248527527, "learning_rate": 1.7412050578587443e-05, "loss": 0.258, "step": 4289 }, { "epoch": 3.0084151472650773, "grad_norm": 0.27121204137802124, "learning_rate": 1.7401466496883567e-05, "loss": 0.2571, "step": 4290 }, { "epoch": 3.009116409537167, "grad_norm": 0.2889100909233093, "learning_rate": 1.7390883915523164e-05, "loss": 0.2631, "step": 4291 }, { "epoch": 3.0098176718092566, "grad_norm": 0.2791300117969513, "learning_rate": 1.7380302836595786e-05, "loss": 0.2556, "step": 4292 }, { "epoch": 3.0105189340813463, "grad_norm": 0.29163527488708496, "learning_rate": 1.7369723262190684e-05, "loss": 0.0903, "step": 4293 }, { "epoch": 3.0112201963534364, "grad_norm": 0.28354665637016296, "learning_rate": 1.735914519439681e-05, "loss": 0.0924, "step": 4294 }, { "epoch": 3.011921458625526, "grad_norm": 0.23889987170696259, "learning_rate": 1.734856863530282e-05, "loss": 0.0892, "step": 4295 }, { "epoch": 3.0126227208976157, "grad_norm": 0.2379365861415863, "learning_rate": 1.7337993586997075e-05, "loss": 0.0885, "step": 4296 }, { "epoch": 3.0133239831697054, "grad_norm": 0.9479599595069885, "learning_rate": 1.7327420051567648e-05, "loss": 0.2401, "step": 4297 }, { "epoch": 3.014025245441795, "grad_norm": 0.28198087215423584, "learning_rate": 1.7316848031102285e-05, "loss": 0.2553, "step": 4298 }, { "epoch": 3.014726507713885, "grad_norm": 0.25796782970428467, "learning_rate": 1.7306277527688452e-05, "loss": 0.0908, "step": 4299 }, { "epoch": 3.015427769985975, "grad_norm": 0.27595293521881104, "learning_rate": 1.7295708543413326e-05, "loss": 0.2547, "step": 4300 }, { "epoch": 3.0161290322580645, "grad_norm": 0.2721201777458191, "learning_rate": 1.728514108036376e-05, "loss": 0.2533, "step": 4301 }, { "epoch": 3.016830294530154, "grad_norm": 0.2545287013053894, "learning_rate": 1.7274575140626318e-05, "loss": 0.0899, "step": 4302 }, { "epoch": 3.017531556802244, "grad_norm": 0.2575669586658478, "learning_rate": 1.7264010726287262e-05, "loss": 0.0906, "step": 4303 }, { "epoch": 3.018232819074334, "grad_norm": 0.2817234396934509, "learning_rate": 1.7253447839432563e-05, "loss": 0.2543, "step": 4304 }, { "epoch": 3.0189340813464236, "grad_norm": 0.2929086685180664, "learning_rate": 1.724288648214787e-05, "loss": 0.2611, "step": 4305 }, { "epoch": 3.0196353436185133, "grad_norm": 0.27503150701522827, "learning_rate": 1.7232326656518546e-05, "loss": 0.2524, "step": 4306 }, { "epoch": 3.020336605890603, "grad_norm": 1.2625408172607422, "learning_rate": 1.722176836462966e-05, "loss": 0.586, "step": 4307 }, { "epoch": 3.021037868162693, "grad_norm": 0.26014867424964905, "learning_rate": 1.7211211608565938e-05, "loss": 0.0906, "step": 4308 }, { "epoch": 3.0217391304347827, "grad_norm": 0.23789924383163452, "learning_rate": 1.7200656390411847e-05, "loss": 0.0887, "step": 4309 }, { "epoch": 3.0224403927068724, "grad_norm": 0.23972424864768982, "learning_rate": 1.7190102712251525e-05, "loss": 0.0892, "step": 4310 }, { "epoch": 3.023141654978962, "grad_norm": 0.2389615774154663, "learning_rate": 1.717955057616882e-05, "loss": 0.0889, "step": 4311 }, { "epoch": 3.0238429172510517, "grad_norm": 0.2757178544998169, "learning_rate": 1.716899998424727e-05, "loss": 0.2551, "step": 4312 }, { "epoch": 3.024544179523142, "grad_norm": 0.28315550088882446, "learning_rate": 1.71584509385701e-05, "loss": 0.0874, "step": 4313 }, { "epoch": 3.0252454417952315, "grad_norm": 0.27098870277404785, "learning_rate": 1.7147903441220252e-05, "loss": 0.2557, "step": 4314 }, { "epoch": 3.025946704067321, "grad_norm": 0.24050237238407135, "learning_rate": 1.7137357494280332e-05, "loss": 0.0895, "step": 4315 }, { "epoch": 3.026647966339411, "grad_norm": 0.28392091393470764, "learning_rate": 1.7126813099832657e-05, "loss": 0.258, "step": 4316 }, { "epoch": 3.027349228611501, "grad_norm": 0.2755090594291687, "learning_rate": 1.7116270259959238e-05, "loss": 0.2543, "step": 4317 }, { "epoch": 3.0280504908835906, "grad_norm": 0.29241639375686646, "learning_rate": 1.7105728976741784e-05, "loss": 0.0927, "step": 4318 }, { "epoch": 3.0287517531556802, "grad_norm": 0.7682758569717407, "learning_rate": 1.7095189252261677e-05, "loss": 0.4222, "step": 4319 }, { "epoch": 3.02945301542777, "grad_norm": 0.25976675748825073, "learning_rate": 1.7084651088600018e-05, "loss": 0.0908, "step": 4320 }, { "epoch": 3.0301542776998596, "grad_norm": 0.29635271430015564, "learning_rate": 1.7074114487837582e-05, "loss": 0.0896, "step": 4321 }, { "epoch": 3.0308555399719497, "grad_norm": 0.2798285186290741, "learning_rate": 1.7063579452054826e-05, "loss": 0.2553, "step": 4322 }, { "epoch": 3.0315568022440393, "grad_norm": 0.31165239214897156, "learning_rate": 1.7053045983331923e-05, "loss": 0.2577, "step": 4323 }, { "epoch": 3.032258064516129, "grad_norm": 0.23884986340999603, "learning_rate": 1.704251408374872e-05, "loss": 0.0888, "step": 4324 }, { "epoch": 3.0329593267882187, "grad_norm": 0.27557942271232605, "learning_rate": 1.7031983755384757e-05, "loss": 0.2526, "step": 4325 }, { "epoch": 3.0336605890603083, "grad_norm": 0.26697760820388794, "learning_rate": 1.7021455000319267e-05, "loss": 0.0869, "step": 4326 }, { "epoch": 3.0343618513323984, "grad_norm": 1.0497931241989136, "learning_rate": 1.701092782063117e-05, "loss": 0.2482, "step": 4327 }, { "epoch": 3.035063113604488, "grad_norm": 0.2887676954269409, "learning_rate": 1.7000402218399097e-05, "loss": 0.0924, "step": 4328 }, { "epoch": 3.0357643758765778, "grad_norm": 0.7679063081741333, "learning_rate": 1.698987819570131e-05, "loss": 0.421, "step": 4329 }, { "epoch": 3.0364656381486674, "grad_norm": 0.27798938751220703, "learning_rate": 1.6979355754615814e-05, "loss": 0.2555, "step": 4330 }, { "epoch": 3.0371669004207575, "grad_norm": 0.3113824725151062, "learning_rate": 1.6968834897220282e-05, "loss": 0.2611, "step": 4331 }, { "epoch": 3.037868162692847, "grad_norm": 0.2747408449649811, "learning_rate": 1.695831562559207e-05, "loss": 0.2558, "step": 4332 }, { "epoch": 3.038569424964937, "grad_norm": 0.28886276483535767, "learning_rate": 1.6947797941808235e-05, "loss": 0.0883, "step": 4333 }, { "epoch": 3.0392706872370265, "grad_norm": 0.286038339138031, "learning_rate": 1.6937281847945506e-05, "loss": 0.0919, "step": 4334 }, { "epoch": 3.039971949509116, "grad_norm": 0.24004772305488586, "learning_rate": 1.692676734608032e-05, "loss": 0.0892, "step": 4335 }, { "epoch": 3.0406732117812063, "grad_norm": 0.24290619790554047, "learning_rate": 1.6916254438288752e-05, "loss": 0.0906, "step": 4336 }, { "epoch": 3.041374474053296, "grad_norm": 0.2771557867527008, "learning_rate": 1.6905743126646615e-05, "loss": 0.2562, "step": 4337 }, { "epoch": 3.0420757363253856, "grad_norm": 0.27544352412223816, "learning_rate": 1.6895233413229378e-05, "loss": 0.2554, "step": 4338 }, { "epoch": 3.0427769985974753, "grad_norm": 0.31523793935775757, "learning_rate": 1.6884725300112206e-05, "loss": 0.2629, "step": 4339 }, { "epoch": 3.0434782608695654, "grad_norm": 0.29017505049705505, "learning_rate": 1.687421878936994e-05, "loss": 0.0894, "step": 4340 }, { "epoch": 3.044179523141655, "grad_norm": 0.2855393588542938, "learning_rate": 1.6863713883077114e-05, "loss": 0.0923, "step": 4341 }, { "epoch": 3.0448807854137447, "grad_norm": 0.2411910593509674, "learning_rate": 1.6853210583307942e-05, "loss": 0.0894, "step": 4342 }, { "epoch": 3.0455820476858344, "grad_norm": 0.24178442358970642, "learning_rate": 1.6842708892136312e-05, "loss": 0.0901, "step": 4343 }, { "epoch": 3.046283309957924, "grad_norm": 0.2405317723751068, "learning_rate": 1.6832208811635796e-05, "loss": 0.0892, "step": 4344 }, { "epoch": 3.046984572230014, "grad_norm": 0.2777000367641449, "learning_rate": 1.682171034387967e-05, "loss": 0.256, "step": 4345 }, { "epoch": 3.047685834502104, "grad_norm": 0.2866397500038147, "learning_rate": 1.6811213490940857e-05, "loss": 0.0881, "step": 4346 }, { "epoch": 3.0483870967741935, "grad_norm": 0.24094749987125397, "learning_rate": 1.6800718254891983e-05, "loss": 0.0891, "step": 4347 }, { "epoch": 3.049088359046283, "grad_norm": 0.2797839343547821, "learning_rate": 1.6790224637805353e-05, "loss": 0.0903, "step": 4348 }, { "epoch": 3.0497896213183733, "grad_norm": 0.27547189593315125, "learning_rate": 1.677973264175296e-05, "loss": 0.0897, "step": 4349 }, { "epoch": 3.050490883590463, "grad_norm": 1.272271990776062, "learning_rate": 1.6769242268806445e-05, "loss": 0.4149, "step": 4350 }, { "epoch": 3.0511921458625526, "grad_norm": 0.2863743305206299, "learning_rate": 1.6758753521037163e-05, "loss": 0.0906, "step": 4351 }, { "epoch": 3.0518934081346423, "grad_norm": 0.27911582589149475, "learning_rate": 1.6748266400516135e-05, "loss": 0.0869, "step": 4352 }, { "epoch": 3.052594670406732, "grad_norm": 1.29931640625, "learning_rate": 1.673778090931406e-05, "loss": 0.4201, "step": 4353 }, { "epoch": 3.053295932678822, "grad_norm": 0.27949556708335876, "learning_rate": 1.6727297049501317e-05, "loss": 0.0857, "step": 4354 }, { "epoch": 3.0539971949509117, "grad_norm": 0.27371031045913696, "learning_rate": 1.6716814823147957e-05, "loss": 0.2532, "step": 4355 }, { "epoch": 3.0546984572230014, "grad_norm": 0.24050918221473694, "learning_rate": 1.670633423232373e-05, "loss": 0.0886, "step": 4356 }, { "epoch": 3.055399719495091, "grad_norm": 0.2516195774078369, "learning_rate": 1.669585527909802e-05, "loss": 0.0843, "step": 4357 }, { "epoch": 3.056100981767181, "grad_norm": 0.7693092823028564, "learning_rate": 1.6685377965539933e-05, "loss": 0.4233, "step": 4358 }, { "epoch": 3.056802244039271, "grad_norm": 0.31135931611061096, "learning_rate": 1.6674902293718226e-05, "loss": 0.0899, "step": 4359 }, { "epoch": 3.0575035063113605, "grad_norm": 0.9338065385818481, "learning_rate": 1.6664428265701343e-05, "loss": 0.2349, "step": 4360 }, { "epoch": 3.05820476858345, "grad_norm": 0.9275439381599426, "learning_rate": 1.665395588355739e-05, "loss": 0.2461, "step": 4361 }, { "epoch": 3.05890603085554, "grad_norm": 0.28368300199508667, "learning_rate": 1.664348514935417e-05, "loss": 0.0916, "step": 4362 }, { "epoch": 3.05960729312763, "grad_norm": 0.2882652282714844, "learning_rate": 1.6633016065159146e-05, "loss": 0.2489, "step": 4363 }, { "epoch": 3.0603085553997196, "grad_norm": 0.7755069136619568, "learning_rate": 1.662254863303944e-05, "loss": 0.4247, "step": 4364 }, { "epoch": 3.0610098176718092, "grad_norm": 0.26010122895240784, "learning_rate": 1.6612082855061878e-05, "loss": 0.0914, "step": 4365 }, { "epoch": 3.061711079943899, "grad_norm": 0.24289897084236145, "learning_rate": 1.6601618733292946e-05, "loss": 0.0906, "step": 4366 }, { "epoch": 3.0624123422159886, "grad_norm": 0.7456004619598389, "learning_rate": 1.6591156269798795e-05, "loss": 0.4172, "step": 4367 }, { "epoch": 3.0631136044880787, "grad_norm": 0.2677205801010132, "learning_rate": 1.6580695466645262e-05, "loss": 0.2521, "step": 4368 }, { "epoch": 3.0638148667601683, "grad_norm": 0.294971764087677, "learning_rate": 1.657023632589785e-05, "loss": 0.0946, "step": 4369 }, { "epoch": 3.064516129032258, "grad_norm": 0.2436314970254898, "learning_rate": 1.6559778849621744e-05, "loss": 0.0907, "step": 4370 }, { "epoch": 3.0652173913043477, "grad_norm": 0.2444945126771927, "learning_rate": 1.654932303988177e-05, "loss": 0.0912, "step": 4371 }, { "epoch": 3.065918653576438, "grad_norm": 0.24503250420093536, "learning_rate": 1.653886889874246e-05, "loss": 0.0913, "step": 4372 }, { "epoch": 3.0666199158485274, "grad_norm": 0.2432895451784134, "learning_rate": 1.6528416428267996e-05, "loss": 0.0904, "step": 4373 }, { "epoch": 3.067321178120617, "grad_norm": 0.24458830058574677, "learning_rate": 1.6517965630522237e-05, "loss": 0.091, "step": 4374 }, { "epoch": 3.0680224403927068, "grad_norm": 0.2687150835990906, "learning_rate": 1.6507516507568716e-05, "loss": 0.094, "step": 4375 }, { "epoch": 3.0687237026647964, "grad_norm": 0.820685863494873, "learning_rate": 1.649706906147062e-05, "loss": 0.2213, "step": 4376 }, { "epoch": 3.0694249649368865, "grad_norm": 0.31585970520973206, "learning_rate": 1.648662329429084e-05, "loss": 0.2547, "step": 4377 }, { "epoch": 3.070126227208976, "grad_norm": 0.26428836584091187, "learning_rate": 1.647617920809188e-05, "loss": 0.2507, "step": 4378 }, { "epoch": 3.070827489481066, "grad_norm": 0.2644016146659851, "learning_rate": 1.6465736804935954e-05, "loss": 0.2484, "step": 4379 }, { "epoch": 3.0715287517531555, "grad_norm": 0.2967150807380676, "learning_rate": 1.6455296086884937e-05, "loss": 0.0965, "step": 4380 }, { "epoch": 3.0722300140252456, "grad_norm": 0.26522523164749146, "learning_rate": 1.6444857056000358e-05, "loss": 0.2552, "step": 4381 }, { "epoch": 3.0729312762973353, "grad_norm": 0.2758888602256775, "learning_rate": 1.643441971434343e-05, "loss": 0.0961, "step": 4382 }, { "epoch": 3.073632538569425, "grad_norm": 0.2684580981731415, "learning_rate": 1.6423984063975023e-05, "loss": 0.2541, "step": 4383 }, { "epoch": 3.0743338008415146, "grad_norm": 0.2447829395532608, "learning_rate": 1.6413550106955682e-05, "loss": 0.0907, "step": 4384 }, { "epoch": 3.0750350631136043, "grad_norm": 0.6988956928253174, "learning_rate": 1.6403117845345585e-05, "loss": 0.212, "step": 4385 }, { "epoch": 3.0757363253856944, "grad_norm": 0.2587706446647644, "learning_rate": 1.6392687281204616e-05, "loss": 0.2543, "step": 4386 }, { "epoch": 3.076437587657784, "grad_norm": 0.26627498865127563, "learning_rate": 1.638225841659231e-05, "loss": 0.2511, "step": 4387 }, { "epoch": 3.0771388499298737, "grad_norm": 0.24384865164756775, "learning_rate": 1.637183125356786e-05, "loss": 0.0908, "step": 4388 }, { "epoch": 3.0778401122019634, "grad_norm": 0.24523070454597473, "learning_rate": 1.6361405794190125e-05, "loss": 0.0908, "step": 4389 }, { "epoch": 3.078541374474053, "grad_norm": 0.2611936032772064, "learning_rate": 1.635098204051764e-05, "loss": 0.2548, "step": 4390 }, { "epoch": 3.079242636746143, "grad_norm": 0.2711770534515381, "learning_rate": 1.6340559994608582e-05, "loss": 0.0952, "step": 4391 }, { "epoch": 3.079943899018233, "grad_norm": 0.24604694545269012, "learning_rate": 1.6330139658520806e-05, "loss": 0.0917, "step": 4392 }, { "epoch": 3.0806451612903225, "grad_norm": 0.27222293615341187, "learning_rate": 1.631972103431182e-05, "loss": 0.0946, "step": 4393 }, { "epoch": 3.081346423562412, "grad_norm": 0.7142115831375122, "learning_rate": 1.6309304124038814e-05, "loss": 0.2114, "step": 4394 }, { "epoch": 3.0820476858345023, "grad_norm": 0.2611478269100189, "learning_rate": 1.6298888929758608e-05, "loss": 0.2511, "step": 4395 }, { "epoch": 3.082748948106592, "grad_norm": 0.2899724245071411, "learning_rate": 1.6288475453527708e-05, "loss": 0.2523, "step": 4396 }, { "epoch": 3.0834502103786816, "grad_norm": 0.2587776184082031, "learning_rate": 1.6278063697402268e-05, "loss": 0.2551, "step": 4397 }, { "epoch": 3.0841514726507713, "grad_norm": 0.7434572577476501, "learning_rate": 1.626765366343812e-05, "loss": 0.4086, "step": 4398 }, { "epoch": 3.084852734922861, "grad_norm": 0.2760993540287018, "learning_rate": 1.6257245353690727e-05, "loss": 0.0968, "step": 4399 }, { "epoch": 3.085553997194951, "grad_norm": 0.2604403495788574, "learning_rate": 1.6246838770215233e-05, "loss": 0.2529, "step": 4400 }, { "epoch": 3.0862552594670407, "grad_norm": 0.24604393541812897, "learning_rate": 1.623643391506644e-05, "loss": 0.0913, "step": 4401 }, { "epoch": 3.0869565217391304, "grad_norm": 0.27116215229034424, "learning_rate": 1.62260307902988e-05, "loss": 0.0948, "step": 4402 }, { "epoch": 3.08765778401122, "grad_norm": 0.24915042519569397, "learning_rate": 1.621562939796643e-05, "loss": 0.0928, "step": 4403 }, { "epoch": 3.08835904628331, "grad_norm": 0.3117724359035492, "learning_rate": 1.6205229740123106e-05, "loss": 0.0986, "step": 4404 }, { "epoch": 3.0890603085554, "grad_norm": 0.2494102418422699, "learning_rate": 1.6194831818822255e-05, "loss": 0.0924, "step": 4405 }, { "epoch": 3.0897615708274895, "grad_norm": 0.26316049695014954, "learning_rate": 1.6184435636116962e-05, "loss": 0.2494, "step": 4406 }, { "epoch": 3.090462833099579, "grad_norm": 0.24823257327079773, "learning_rate": 1.6174041194059968e-05, "loss": 0.0923, "step": 4407 }, { "epoch": 3.091164095371669, "grad_norm": 0.27693623304367065, "learning_rate": 1.616364849470368e-05, "loss": 0.0968, "step": 4408 }, { "epoch": 3.091865357643759, "grad_norm": 0.27403807640075684, "learning_rate": 1.615325754010016e-05, "loss": 0.0957, "step": 4409 }, { "epoch": 3.0925666199158486, "grad_norm": 0.2765270471572876, "learning_rate": 1.61428683323011e-05, "loss": 0.096, "step": 4410 }, { "epoch": 3.0932678821879382, "grad_norm": 0.2470749318599701, "learning_rate": 1.613248087335789e-05, "loss": 0.0921, "step": 4411 }, { "epoch": 3.093969144460028, "grad_norm": 0.2455456256866455, "learning_rate": 1.6122095165321545e-05, "loss": 0.0912, "step": 4412 }, { "epoch": 3.094670406732118, "grad_norm": 0.24714820086956024, "learning_rate": 1.611171121024273e-05, "loss": 0.0921, "step": 4413 }, { "epoch": 3.0953716690042077, "grad_norm": 0.24567605555057526, "learning_rate": 1.6101329010171782e-05, "loss": 0.0914, "step": 4414 }, { "epoch": 3.0960729312762973, "grad_norm": 0.26742374897003174, "learning_rate": 1.6090948567158682e-05, "loss": 0.2517, "step": 4415 }, { "epoch": 3.096774193548387, "grad_norm": 0.27430418133735657, "learning_rate": 1.6080569883253067e-05, "loss": 0.0953, "step": 4416 }, { "epoch": 3.0974754558204767, "grad_norm": 0.7486504912376404, "learning_rate": 1.607019296050423e-05, "loss": 0.4189, "step": 4417 }, { "epoch": 3.098176718092567, "grad_norm": 0.30831634998321533, "learning_rate": 1.605981780096112e-05, "loss": 0.255, "step": 4418 }, { "epoch": 3.0988779803646564, "grad_norm": 0.26285335421562195, "learning_rate": 1.6049444406672305e-05, "loss": 0.0869, "step": 4419 }, { "epoch": 3.099579242636746, "grad_norm": 0.2679397165775299, "learning_rate": 1.6039072779686046e-05, "loss": 0.2517, "step": 4420 }, { "epoch": 3.1002805049088358, "grad_norm": 0.2452889084815979, "learning_rate": 1.6028702922050237e-05, "loss": 0.0907, "step": 4421 }, { "epoch": 3.100981767180926, "grad_norm": 0.26968899369239807, "learning_rate": 1.6018334835812422e-05, "loss": 0.2534, "step": 4422 }, { "epoch": 3.1016830294530155, "grad_norm": 0.2741638422012329, "learning_rate": 1.60079685230198e-05, "loss": 0.0945, "step": 4423 }, { "epoch": 3.102384291725105, "grad_norm": 0.269707053899765, "learning_rate": 1.5997603985719216e-05, "loss": 0.0938, "step": 4424 }, { "epoch": 3.103085553997195, "grad_norm": 0.2659779191017151, "learning_rate": 1.598724122595718e-05, "loss": 0.0925, "step": 4425 }, { "epoch": 3.1037868162692845, "grad_norm": 0.24325861036777496, "learning_rate": 1.5976880245779804e-05, "loss": 0.0898, "step": 4426 }, { "epoch": 3.1044880785413747, "grad_norm": 0.3044300079345703, "learning_rate": 1.5966521047232906e-05, "loss": 0.0953, "step": 4427 }, { "epoch": 3.1051893408134643, "grad_norm": 0.2692506015300751, "learning_rate": 1.595616363236192e-05, "loss": 0.0922, "step": 4428 }, { "epoch": 3.105890603085554, "grad_norm": 0.2778695225715637, "learning_rate": 1.5945808003211936e-05, "loss": 0.255, "step": 4429 }, { "epoch": 3.1065918653576436, "grad_norm": 0.26333943009376526, "learning_rate": 1.593545416182769e-05, "loss": 0.0919, "step": 4430 }, { "epoch": 3.1072931276297333, "grad_norm": 0.24131877720355988, "learning_rate": 1.592510211025357e-05, "loss": 0.0897, "step": 4431 }, { "epoch": 3.1079943899018234, "grad_norm": 0.2642788887023926, "learning_rate": 1.5914751850533614e-05, "loss": 0.092, "step": 4432 }, { "epoch": 3.108695652173913, "grad_norm": 0.4163639545440674, "learning_rate": 1.5904403384711476e-05, "loss": 0.2697, "step": 4433 }, { "epoch": 3.1093969144460027, "grad_norm": 0.2779884934425354, "learning_rate": 1.589405671483049e-05, "loss": 0.2538, "step": 4434 }, { "epoch": 3.1100981767180924, "grad_norm": 0.2401886135339737, "learning_rate": 1.5883711842933625e-05, "loss": 0.0892, "step": 4435 }, { "epoch": 3.1107994389901825, "grad_norm": 0.26087486743927, "learning_rate": 1.5873368771063493e-05, "loss": 0.0898, "step": 4436 }, { "epoch": 3.111500701262272, "grad_norm": 0.2611025273799896, "learning_rate": 1.5863027501262352e-05, "loss": 0.0901, "step": 4437 }, { "epoch": 3.112201963534362, "grad_norm": 0.3772776424884796, "learning_rate": 1.58526880355721e-05, "loss": 0.2701, "step": 4438 }, { "epoch": 3.1129032258064515, "grad_norm": 0.24070443212985992, "learning_rate": 1.5842350376034293e-05, "loss": 0.0891, "step": 4439 }, { "epoch": 3.113604488078541, "grad_norm": 0.23796126246452332, "learning_rate": 1.5832014524690102e-05, "loss": 0.0873, "step": 4440 }, { "epoch": 3.1143057503506313, "grad_norm": 0.2840888500213623, "learning_rate": 1.5821680483580373e-05, "loss": 0.2565, "step": 4441 }, { "epoch": 3.115007012622721, "grad_norm": 0.23859891295433044, "learning_rate": 1.5811348254745572e-05, "loss": 0.0873, "step": 4442 }, { "epoch": 3.1157082748948106, "grad_norm": 0.28145989775657654, "learning_rate": 1.5801017840225824e-05, "loss": 0.2546, "step": 4443 }, { "epoch": 3.1164095371669003, "grad_norm": 0.23940810561180115, "learning_rate": 1.5790689242060875e-05, "loss": 0.0873, "step": 4444 }, { "epoch": 3.1171107994389904, "grad_norm": 0.25710225105285645, "learning_rate": 1.5780362462290137e-05, "loss": 0.0891, "step": 4445 }, { "epoch": 3.11781206171108, "grad_norm": 0.23711104691028595, "learning_rate": 1.577003750295264e-05, "loss": 0.0877, "step": 4446 }, { "epoch": 3.1185133239831697, "grad_norm": 0.2799714505672455, "learning_rate": 1.575971436608707e-05, "loss": 0.2548, "step": 4447 }, { "epoch": 3.1192145862552594, "grad_norm": 0.23845793306827545, "learning_rate": 1.5749393053731748e-05, "loss": 0.0874, "step": 4448 }, { "epoch": 3.119915848527349, "grad_norm": 0.36396098136901855, "learning_rate": 1.5739073567924633e-05, "loss": 0.0899, "step": 4449 }, { "epoch": 3.120617110799439, "grad_norm": 0.23718324303627014, "learning_rate": 1.5728755910703324e-05, "loss": 0.0874, "step": 4450 }, { "epoch": 3.121318373071529, "grad_norm": 0.28330934047698975, "learning_rate": 1.5718440084105064e-05, "loss": 0.0863, "step": 4451 }, { "epoch": 3.1220196353436185, "grad_norm": 0.2563825845718384, "learning_rate": 1.570812609016673e-05, "loss": 0.0872, "step": 4452 }, { "epoch": 3.122720897615708, "grad_norm": 0.7883300185203552, "learning_rate": 1.5697813930924845e-05, "loss": 0.4255, "step": 4453 }, { "epoch": 3.123422159887798, "grad_norm": 0.23653192818164825, "learning_rate": 1.5687503608415546e-05, "loss": 0.0871, "step": 4454 }, { "epoch": 3.124123422159888, "grad_norm": 0.23618483543395996, "learning_rate": 1.5677195124674633e-05, "loss": 0.087, "step": 4455 }, { "epoch": 3.1248246844319776, "grad_norm": 0.23670464754104614, "learning_rate": 1.566688848173753e-05, "loss": 0.0873, "step": 4456 }, { "epoch": 3.1255259467040672, "grad_norm": 0.25523069500923157, "learning_rate": 1.5656583681639307e-05, "loss": 0.0759, "step": 4457 }, { "epoch": 3.126227208976157, "grad_norm": 0.2794172465801239, "learning_rate": 1.5646280726414658e-05, "loss": 0.0837, "step": 4458 }, { "epoch": 3.126928471248247, "grad_norm": 0.2766595780849457, "learning_rate": 1.563597961809793e-05, "loss": 0.0841, "step": 4459 }, { "epoch": 3.1276297335203367, "grad_norm": 0.29163554310798645, "learning_rate": 1.5625680358723092e-05, "loss": 0.2562, "step": 4460 }, { "epoch": 3.1283309957924264, "grad_norm": 0.23720690608024597, "learning_rate": 1.5615382950323737e-05, "loss": 0.0855, "step": 4461 }, { "epoch": 3.129032258064516, "grad_norm": 0.23523631691932678, "learning_rate": 1.560508739493311e-05, "loss": 0.0839, "step": 4462 }, { "epoch": 3.1297335203366057, "grad_norm": 0.2501157820224762, "learning_rate": 1.55947936945841e-05, "loss": 0.0834, "step": 4463 }, { "epoch": 3.130434782608696, "grad_norm": 0.4490799307823181, "learning_rate": 1.5584501851309202e-05, "loss": 0.273, "step": 4464 }, { "epoch": 3.1311360448807855, "grad_norm": 0.27598267793655396, "learning_rate": 1.5574211867140565e-05, "loss": 0.0812, "step": 4465 }, { "epoch": 3.131837307152875, "grad_norm": 0.24922612309455872, "learning_rate": 1.556392374410996e-05, "loss": 0.0839, "step": 4466 }, { "epoch": 3.132538569424965, "grad_norm": 0.3152356445789337, "learning_rate": 1.5553637484248808e-05, "loss": 0.2565, "step": 4467 }, { "epoch": 3.133239831697055, "grad_norm": 0.32387006282806396, "learning_rate": 1.5543353089588126e-05, "loss": 0.2508, "step": 4468 }, { "epoch": 3.1339410939691446, "grad_norm": 0.24724183976650238, "learning_rate": 1.5533070562158593e-05, "loss": 0.0814, "step": 4469 }, { "epoch": 3.134642356241234, "grad_norm": 0.24283884465694427, "learning_rate": 1.552278990399052e-05, "loss": 0.0798, "step": 4470 }, { "epoch": 3.135343618513324, "grad_norm": 0.25244107842445374, "learning_rate": 1.5512511117113832e-05, "loss": 0.0824, "step": 4471 }, { "epoch": 3.1360448807854135, "grad_norm": 0.33106091618537903, "learning_rate": 1.55022342035581e-05, "loss": 0.2528, "step": 4472 }, { "epoch": 3.1367461430575037, "grad_norm": 2.3991007804870605, "learning_rate": 1.5491959165352515e-05, "loss": 0.5257, "step": 4473 }, { "epoch": 3.1374474053295933, "grad_norm": 0.3286961615085602, "learning_rate": 1.5481686004525907e-05, "loss": 0.2561, "step": 4474 }, { "epoch": 3.138148667601683, "grad_norm": 0.24695514142513275, "learning_rate": 1.547141472310672e-05, "loss": 0.0801, "step": 4475 }, { "epoch": 3.1388499298737726, "grad_norm": 0.24638475477695465, "learning_rate": 1.5461145323123032e-05, "loss": 0.0725, "step": 4476 }, { "epoch": 3.1395511921458628, "grad_norm": 0.24479149281978607, "learning_rate": 1.5450877806602566e-05, "loss": 0.0802, "step": 4477 }, { "epoch": 3.1402524544179524, "grad_norm": 0.23425762355327606, "learning_rate": 1.5440612175572653e-05, "loss": 0.0838, "step": 4478 }, { "epoch": 3.140953716690042, "grad_norm": 0.31101542711257935, "learning_rate": 1.5430348432060263e-05, "loss": 0.2579, "step": 4479 }, { "epoch": 3.1416549789621318, "grad_norm": 0.2362750917673111, "learning_rate": 1.5420086578091987e-05, "loss": 0.0821, "step": 4480 }, { "epoch": 3.1423562412342214, "grad_norm": 0.31684616208076477, "learning_rate": 1.540982661569406e-05, "loss": 0.2518, "step": 4481 }, { "epoch": 3.1430575035063115, "grad_norm": 0.24184004962444305, "learning_rate": 1.5399568546892308e-05, "loss": 0.0811, "step": 4482 }, { "epoch": 3.143758765778401, "grad_norm": 0.3328947424888611, "learning_rate": 1.538931237371221e-05, "loss": 0.252, "step": 4483 }, { "epoch": 3.144460028050491, "grad_norm": 0.2346736639738083, "learning_rate": 1.537905809817887e-05, "loss": 0.0846, "step": 4484 }, { "epoch": 3.1451612903225805, "grad_norm": 0.2335439771413803, "learning_rate": 1.536880572231701e-05, "loss": 0.0847, "step": 4485 }, { "epoch": 3.1458625525946706, "grad_norm": 0.23355843126773834, "learning_rate": 1.5358555248150976e-05, "loss": 0.0835, "step": 4486 }, { "epoch": 3.1465638148667603, "grad_norm": 2.0841197967529297, "learning_rate": 1.534830667770475e-05, "loss": 0.3405, "step": 4487 }, { "epoch": 3.14726507713885, "grad_norm": 0.3066689968109131, "learning_rate": 1.5338060013001927e-05, "loss": 0.2567, "step": 4488 }, { "epoch": 3.1479663394109396, "grad_norm": 0.3159591555595398, "learning_rate": 1.532781525606572e-05, "loss": 0.261, "step": 4489 }, { "epoch": 3.1486676016830293, "grad_norm": 0.2327156960964203, "learning_rate": 1.5317572408918983e-05, "loss": 0.0842, "step": 4490 }, { "epoch": 3.1493688639551194, "grad_norm": 1.7778096199035645, "learning_rate": 1.5307331473584188e-05, "loss": 0.3246, "step": 4491 }, { "epoch": 3.150070126227209, "grad_norm": 0.23352976143360138, "learning_rate": 1.5297092452083413e-05, "loss": 0.0825, "step": 4492 }, { "epoch": 3.1507713884992987, "grad_norm": 0.30999037623405457, "learning_rate": 1.5286855346438374e-05, "loss": 0.2576, "step": 4493 }, { "epoch": 3.1514726507713884, "grad_norm": 0.247878760099411, "learning_rate": 1.5276620158670403e-05, "loss": 0.0828, "step": 4494 }, { "epoch": 3.1521739130434785, "grad_norm": 0.27301478385925293, "learning_rate": 1.5266386890800473e-05, "loss": 0.0794, "step": 4495 }, { "epoch": 3.152875175315568, "grad_norm": 0.823840856552124, "learning_rate": 1.5256155544849132e-05, "loss": 0.4364, "step": 4496 }, { "epoch": 3.153576437587658, "grad_norm": 0.24731294810771942, "learning_rate": 1.5245926122836596e-05, "loss": 0.0822, "step": 4497 }, { "epoch": 3.1542776998597475, "grad_norm": 0.2494320422410965, "learning_rate": 1.5235698626782677e-05, "loss": 0.0832, "step": 4498 }, { "epoch": 3.154978962131837, "grad_norm": 5.01049280166626, "learning_rate": 1.5225473058706813e-05, "loss": 0.7125, "step": 4499 }, { "epoch": 3.1556802244039273, "grad_norm": 0.3045954704284668, "learning_rate": 1.5215249420628056e-05, "loss": 0.2591, "step": 4500 }, { "epoch": 3.156381486676017, "grad_norm": 0.247663676738739, "learning_rate": 1.5205027714565085e-05, "loss": 0.0842, "step": 4501 }, { "epoch": 3.1570827489481066, "grad_norm": 1.4798288345336914, "learning_rate": 1.5194807942536198e-05, "loss": 0.2962, "step": 4502 }, { "epoch": 3.1577840112201963, "grad_norm": 0.24966338276863098, "learning_rate": 1.5184590106559299e-05, "loss": 0.0849, "step": 4503 }, { "epoch": 3.158485273492286, "grad_norm": 0.8690905570983887, "learning_rate": 1.5174374208651912e-05, "loss": 0.4427, "step": 4504 }, { "epoch": 3.159186535764376, "grad_norm": 0.8459251523017883, "learning_rate": 1.5164160250831194e-05, "loss": 0.4379, "step": 4505 }, { "epoch": 3.1598877980364657, "grad_norm": 0.2343387007713318, "learning_rate": 1.5153948235113902e-05, "loss": 0.0858, "step": 4506 }, { "epoch": 3.1605890603085554, "grad_norm": 0.31353843212127686, "learning_rate": 1.5143738163516418e-05, "loss": 0.2513, "step": 4507 }, { "epoch": 3.161290322580645, "grad_norm": 0.2916162610054016, "learning_rate": 1.5133530038054745e-05, "loss": 0.2584, "step": 4508 }, { "epoch": 3.161991584852735, "grad_norm": 0.2927287817001343, "learning_rate": 1.5123323860744492e-05, "loss": 0.2583, "step": 4509 }, { "epoch": 3.162692847124825, "grad_norm": 0.3088410198688507, "learning_rate": 1.5113119633600875e-05, "loss": 0.2586, "step": 4510 }, { "epoch": 3.1633941093969145, "grad_norm": 0.28167909383773804, "learning_rate": 1.5102917358638746e-05, "loss": 0.0874, "step": 4511 }, { "epoch": 3.164095371669004, "grad_norm": 0.3479258120059967, "learning_rate": 1.5092717037872561e-05, "loss": 0.2616, "step": 4512 }, { "epoch": 3.164796633941094, "grad_norm": 0.23786313831806183, "learning_rate": 1.5082518673316387e-05, "loss": 0.0881, "step": 4513 }, { "epoch": 3.165497896213184, "grad_norm": 0.2567940354347229, "learning_rate": 1.5072322266983912e-05, "loss": 0.0898, "step": 4514 }, { "epoch": 3.1661991584852736, "grad_norm": 0.23772981762886047, "learning_rate": 1.5062127820888433e-05, "loss": 0.0882, "step": 4515 }, { "epoch": 3.166900420757363, "grad_norm": 1.0113317966461182, "learning_rate": 1.5051935337042871e-05, "loss": 0.2349, "step": 4516 }, { "epoch": 3.167601683029453, "grad_norm": 0.2926931083202362, "learning_rate": 1.5041744817459733e-05, "loss": 0.2602, "step": 4517 }, { "epoch": 3.1683029453015426, "grad_norm": 0.25469622015953064, "learning_rate": 1.5031556264151158e-05, "loss": 0.0889, "step": 4518 }, { "epoch": 3.1690042075736327, "grad_norm": 0.23858322203159332, "learning_rate": 1.5021369679128894e-05, "loss": 0.0886, "step": 4519 }, { "epoch": 3.1697054698457223, "grad_norm": 0.2613413333892822, "learning_rate": 1.5011185064404306e-05, "loss": 0.0913, "step": 4520 }, { "epoch": 3.170406732117812, "grad_norm": 0.23797930777072906, "learning_rate": 1.5001002421988358e-05, "loss": 0.0886, "step": 4521 }, { "epoch": 3.1711079943899017, "grad_norm": 0.23769131302833557, "learning_rate": 1.4990821753891635e-05, "loss": 0.0881, "step": 4522 }, { "epoch": 3.1718092566619918, "grad_norm": 0.23834337294101715, "learning_rate": 1.498064306212433e-05, "loss": 0.0886, "step": 4523 }, { "epoch": 3.1725105189340814, "grad_norm": 0.2721993625164032, "learning_rate": 1.497046634869623e-05, "loss": 0.2578, "step": 4524 }, { "epoch": 3.173211781206171, "grad_norm": 0.25838202238082886, "learning_rate": 1.4960291615616751e-05, "loss": 0.0901, "step": 4525 }, { "epoch": 3.1739130434782608, "grad_norm": 0.7647160291671753, "learning_rate": 1.4950118864894912e-05, "loss": 0.4252, "step": 4526 }, { "epoch": 3.1746143057503504, "grad_norm": 0.2825545072555542, "learning_rate": 1.4939948098539345e-05, "loss": 0.2558, "step": 4527 }, { "epoch": 3.1753155680224405, "grad_norm": 0.27564841508865356, "learning_rate": 1.4929779318558276e-05, "loss": 0.2542, "step": 4528 }, { "epoch": 3.17601683029453, "grad_norm": 0.23992976546287537, "learning_rate": 1.4919612526959554e-05, "loss": 0.0886, "step": 4529 }, { "epoch": 3.17671809256662, "grad_norm": 0.2796703577041626, "learning_rate": 1.4909447725750638e-05, "loss": 0.2549, "step": 4530 }, { "epoch": 3.1774193548387095, "grad_norm": 1.2545453310012817, "learning_rate": 1.4899284916938566e-05, "loss": 0.5894, "step": 4531 }, { "epoch": 3.1781206171107996, "grad_norm": 0.2591164708137512, "learning_rate": 1.4889124102530013e-05, "loss": 0.0902, "step": 4532 }, { "epoch": 3.1788218793828893, "grad_norm": 0.28128665685653687, "learning_rate": 1.4878965284531249e-05, "loss": 0.0919, "step": 4533 }, { "epoch": 3.179523141654979, "grad_norm": 0.7809267640113831, "learning_rate": 1.4868808464948148e-05, "loss": 0.4241, "step": 4534 }, { "epoch": 3.1802244039270686, "grad_norm": 0.237825408577919, "learning_rate": 1.4858653645786191e-05, "loss": 0.0883, "step": 4535 }, { "epoch": 3.1809256661991583, "grad_norm": 0.2379598617553711, "learning_rate": 1.4848500829050465e-05, "loss": 0.0888, "step": 4536 }, { "epoch": 3.1816269284712484, "grad_norm": 0.27331870794296265, "learning_rate": 1.4838350016745672e-05, "loss": 0.2548, "step": 4537 }, { "epoch": 3.182328190743338, "grad_norm": 0.2393309772014618, "learning_rate": 1.4828201210876088e-05, "loss": 0.0892, "step": 4538 }, { "epoch": 3.1830294530154277, "grad_norm": 0.2568231225013733, "learning_rate": 1.4818054413445623e-05, "loss": 0.0903, "step": 4539 }, { "epoch": 3.1837307152875174, "grad_norm": 0.7674357295036316, "learning_rate": 1.4807909626457782e-05, "loss": 0.4206, "step": 4540 }, { "epoch": 3.1844319775596075, "grad_norm": 0.2397373467683792, "learning_rate": 1.4797766851915665e-05, "loss": 0.0892, "step": 4541 }, { "epoch": 3.185133239831697, "grad_norm": 0.2600989043712616, "learning_rate": 1.4787626091821976e-05, "loss": 0.0905, "step": 4542 }, { "epoch": 3.185834502103787, "grad_norm": 0.2714305520057678, "learning_rate": 1.4777487348179042e-05, "loss": 0.2554, "step": 4543 }, { "epoch": 3.1865357643758765, "grad_norm": 0.2587897479534149, "learning_rate": 1.4767350622988757e-05, "loss": 0.0903, "step": 4544 }, { "epoch": 3.187237026647966, "grad_norm": 0.2777040898799896, "learning_rate": 1.4757215918252642e-05, "loss": 0.2553, "step": 4545 }, { "epoch": 3.1879382889200563, "grad_norm": 0.2391783595085144, "learning_rate": 1.4747083235971815e-05, "loss": 0.0894, "step": 4546 }, { "epoch": 3.188639551192146, "grad_norm": 0.25656557083129883, "learning_rate": 1.4736952578146987e-05, "loss": 0.0899, "step": 4547 }, { "epoch": 3.1893408134642356, "grad_norm": 0.2840229570865631, "learning_rate": 1.4726823946778475e-05, "loss": 0.0917, "step": 4548 }, { "epoch": 3.1900420757363253, "grad_norm": 0.31533798575401306, "learning_rate": 1.4716697343866197e-05, "loss": 0.0895, "step": 4549 }, { "epoch": 3.1907433380084154, "grad_norm": 0.7681803107261658, "learning_rate": 1.4706572771409677e-05, "loss": 0.4207, "step": 4550 }, { "epoch": 3.191444600280505, "grad_norm": 0.311379998922348, "learning_rate": 1.4696450231408004e-05, "loss": 0.2589, "step": 4551 }, { "epoch": 3.1921458625525947, "grad_norm": 0.23957331478595734, "learning_rate": 1.468632972585991e-05, "loss": 0.0888, "step": 4552 }, { "epoch": 3.1928471248246844, "grad_norm": 0.2866700291633606, "learning_rate": 1.4676211256763698e-05, "loss": 0.0914, "step": 4553 }, { "epoch": 3.193548387096774, "grad_norm": 0.23807771503925323, "learning_rate": 1.4666094826117283e-05, "loss": 0.0887, "step": 4554 }, { "epoch": 3.194249649368864, "grad_norm": 0.2928329408168793, "learning_rate": 1.4655980435918164e-05, "loss": 0.258, "step": 4555 }, { "epoch": 3.194950911640954, "grad_norm": 0.28542789816856384, "learning_rate": 1.4645868088163455e-05, "loss": 0.0913, "step": 4556 }, { "epoch": 3.1956521739130435, "grad_norm": 0.28029000759124756, "learning_rate": 1.4635757784849864e-05, "loss": 0.2548, "step": 4557 }, { "epoch": 3.196353436185133, "grad_norm": 0.23997937142848969, "learning_rate": 1.4625649527973661e-05, "loss": 0.0893, "step": 4558 }, { "epoch": 3.1970546984572232, "grad_norm": 0.23918074369430542, "learning_rate": 1.4615543319530754e-05, "loss": 0.0892, "step": 4559 }, { "epoch": 3.197755960729313, "grad_norm": 0.2571398615837097, "learning_rate": 1.4605439161516626e-05, "loss": 0.0901, "step": 4560 }, { "epoch": 3.1984572230014026, "grad_norm": 0.28248754143714905, "learning_rate": 1.459533705592637e-05, "loss": 0.0903, "step": 4561 }, { "epoch": 3.1991584852734922, "grad_norm": 0.2719326317310333, "learning_rate": 1.4585237004754659e-05, "loss": 0.2522, "step": 4562 }, { "epoch": 3.199859747545582, "grad_norm": 0.2394731640815735, "learning_rate": 1.4575139009995758e-05, "loss": 0.089, "step": 4563 }, { "epoch": 3.200561009817672, "grad_norm": 0.2772178053855896, "learning_rate": 1.4565043073643548e-05, "loss": 0.0864, "step": 4564 }, { "epoch": 3.2012622720897617, "grad_norm": 0.2385057955980301, "learning_rate": 1.455494919769148e-05, "loss": 0.088, "step": 4565 }, { "epoch": 3.2019635343618513, "grad_norm": 0.283381849527359, "learning_rate": 1.4544857384132602e-05, "loss": 0.2548, "step": 4566 }, { "epoch": 3.202664796633941, "grad_norm": 0.23787076771259308, "learning_rate": 1.4534767634959573e-05, "loss": 0.0884, "step": 4567 }, { "epoch": 3.2033660589060307, "grad_norm": 0.25292670726776123, "learning_rate": 1.4524679952164616e-05, "loss": 0.0882, "step": 4568 }, { "epoch": 3.2040673211781208, "grad_norm": 0.32687991857528687, "learning_rate": 1.4514594337739578e-05, "loss": 0.249, "step": 4569 }, { "epoch": 3.2047685834502104, "grad_norm": 0.2819111943244934, "learning_rate": 1.4504510793675874e-05, "loss": 0.2548, "step": 4570 }, { "epoch": 3.2054698457223, "grad_norm": 0.2848909795284271, "learning_rate": 1.4494429321964515e-05, "loss": 0.2566, "step": 4571 }, { "epoch": 3.2061711079943898, "grad_norm": 0.24956397712230682, "learning_rate": 1.4484349924596097e-05, "loss": 0.0868, "step": 4572 }, { "epoch": 3.20687237026648, "grad_norm": 0.24838925898075104, "learning_rate": 1.447427260356083e-05, "loss": 0.0865, "step": 4573 }, { "epoch": 3.2075736325385695, "grad_norm": 0.24973459541797638, "learning_rate": 1.446419736084848e-05, "loss": 0.0876, "step": 4574 }, { "epoch": 3.208274894810659, "grad_norm": 2.0097904205322266, "learning_rate": 1.4454124198448446e-05, "loss": 0.3005, "step": 4575 }, { "epoch": 3.208976157082749, "grad_norm": 0.23637655377388, "learning_rate": 1.4444053118349665e-05, "loss": 0.0875, "step": 4576 }, { "epoch": 3.2096774193548385, "grad_norm": 0.28183186054229736, "learning_rate": 1.4433984122540711e-05, "loss": 0.2543, "step": 4577 }, { "epoch": 3.2103786816269286, "grad_norm": 0.2876405119895935, "learning_rate": 1.4423917213009717e-05, "loss": 0.257, "step": 4578 }, { "epoch": 3.2110799438990183, "grad_norm": 0.28492698073387146, "learning_rate": 1.4413852391744406e-05, "loss": 0.2565, "step": 4579 }, { "epoch": 3.211781206171108, "grad_norm": 0.2500091791152954, "learning_rate": 1.440378966073209e-05, "loss": 0.0873, "step": 4580 }, { "epoch": 3.2124824684431976, "grad_norm": 0.2496383637189865, "learning_rate": 1.439372902195969e-05, "loss": 0.0869, "step": 4581 }, { "epoch": 3.2131837307152873, "grad_norm": 0.291496217250824, "learning_rate": 1.4383670477413674e-05, "loss": 0.2584, "step": 4582 }, { "epoch": 3.2138849929873774, "grad_norm": 0.24704909324645996, "learning_rate": 1.4373614029080146e-05, "loss": 0.0862, "step": 4583 }, { "epoch": 3.214586255259467, "grad_norm": 0.31008002161979675, "learning_rate": 1.4363559678944738e-05, "loss": 0.0882, "step": 4584 }, { "epoch": 3.2152875175315567, "grad_norm": 0.24945266544818878, "learning_rate": 1.4353507428992741e-05, "loss": 0.0864, "step": 4585 }, { "epoch": 3.2159887798036464, "grad_norm": 0.2427823692560196, "learning_rate": 1.4343457281208938e-05, "loss": 0.0829, "step": 4586 }, { "epoch": 3.2166900420757365, "grad_norm": 1.069264531135559, "learning_rate": 1.4333409237577785e-05, "loss": 0.2649, "step": 4587 }, { "epoch": 3.217391304347826, "grad_norm": 0.23567716777324677, "learning_rate": 1.4323363300083264e-05, "loss": 0.0875, "step": 4588 }, { "epoch": 3.218092566619916, "grad_norm": 0.291107714176178, "learning_rate": 1.431331947070898e-05, "loss": 0.2577, "step": 4589 }, { "epoch": 3.2187938288920055, "grad_norm": 0.2354957014322281, "learning_rate": 1.4303277751438087e-05, "loss": 0.0875, "step": 4590 }, { "epoch": 3.219495091164095, "grad_norm": 0.2929995059967041, "learning_rate": 1.429323814425336e-05, "loss": 0.2563, "step": 4591 }, { "epoch": 3.2201963534361853, "grad_norm": 0.26843565702438354, "learning_rate": 1.4283200651137124e-05, "loss": 0.0848, "step": 4592 }, { "epoch": 3.220897615708275, "grad_norm": 0.29073867201805115, "learning_rate": 1.4273165274071303e-05, "loss": 0.2563, "step": 4593 }, { "epoch": 3.2215988779803646, "grad_norm": 1.267252802848816, "learning_rate": 1.4263132015037384e-05, "loss": 0.416, "step": 4594 }, { "epoch": 3.2223001402524543, "grad_norm": 0.2864839434623718, "learning_rate": 1.425310087601648e-05, "loss": 0.2577, "step": 4595 }, { "epoch": 3.2230014025245444, "grad_norm": 0.3438251316547394, "learning_rate": 1.4243071858989224e-05, "loss": 0.0874, "step": 4596 }, { "epoch": 3.223702664796634, "grad_norm": 0.7738631963729858, "learning_rate": 1.4233044965935893e-05, "loss": 0.4228, "step": 4597 }, { "epoch": 3.2244039270687237, "grad_norm": 0.23658612370491028, "learning_rate": 1.422302019883629e-05, "loss": 0.0874, "step": 4598 }, { "epoch": 3.2251051893408134, "grad_norm": 0.2375783771276474, "learning_rate": 1.4212997559669855e-05, "loss": 0.088, "step": 4599 }, { "epoch": 3.225806451612903, "grad_norm": 0.29004669189453125, "learning_rate": 1.4202977050415533e-05, "loss": 0.2574, "step": 4600 }, { "epoch": 3.226507713884993, "grad_norm": 0.23809018731117249, "learning_rate": 1.419295867305192e-05, "loss": 0.0884, "step": 4601 }, { "epoch": 3.227208976157083, "grad_norm": 0.2506674826145172, "learning_rate": 1.4182942429557144e-05, "loss": 0.0879, "step": 4602 }, { "epoch": 3.2279102384291725, "grad_norm": 0.2521611452102661, "learning_rate": 1.4172928321908949e-05, "loss": 0.0879, "step": 4603 }, { "epoch": 3.228611500701262, "grad_norm": 0.9524524211883545, "learning_rate": 1.4162916352084615e-05, "loss": 0.2454, "step": 4604 }, { "epoch": 3.2293127629733522, "grad_norm": 0.23672796785831451, "learning_rate": 1.4152906522061048e-05, "loss": 0.0881, "step": 4605 }, { "epoch": 3.230014025245442, "grad_norm": 0.2381451427936554, "learning_rate": 1.4142898833814694e-05, "loss": 0.0881, "step": 4606 }, { "epoch": 3.2307152875175316, "grad_norm": 0.25242364406585693, "learning_rate": 1.4132893289321575e-05, "loss": 0.0878, "step": 4607 }, { "epoch": 3.2314165497896212, "grad_norm": 0.23689498007297516, "learning_rate": 1.4122889890557325e-05, "loss": 0.088, "step": 4608 }, { "epoch": 3.232117812061711, "grad_norm": 0.31718242168426514, "learning_rate": 1.4112888639497124e-05, "loss": 0.2591, "step": 4609 }, { "epoch": 3.232819074333801, "grad_norm": 0.29720863699913025, "learning_rate": 1.4102889538115723e-05, "loss": 0.2564, "step": 4610 }, { "epoch": 3.2335203366058907, "grad_norm": 0.3363461196422577, "learning_rate": 1.4092892588387485e-05, "loss": 0.0867, "step": 4611 }, { "epoch": 3.2342215988779803, "grad_norm": 0.2572129964828491, "learning_rate": 1.4082897792286301e-05, "loss": 0.089, "step": 4612 }, { "epoch": 3.23492286115007, "grad_norm": 0.2529100775718689, "learning_rate": 1.4072905151785684e-05, "loss": 0.0876, "step": 4613 }, { "epoch": 3.23562412342216, "grad_norm": 1.0128635168075562, "learning_rate": 1.4062914668858686e-05, "loss": 0.2499, "step": 4614 }, { "epoch": 3.2363253856942498, "grad_norm": 0.23754681646823883, "learning_rate": 1.405292634547794e-05, "loss": 0.0882, "step": 4615 }, { "epoch": 3.2370266479663394, "grad_norm": 0.25348034501075745, "learning_rate": 1.4042940183615669e-05, "loss": 0.089, "step": 4616 }, { "epoch": 3.237727910238429, "grad_norm": 0.25339752435684204, "learning_rate": 1.4032956185243643e-05, "loss": 0.0884, "step": 4617 }, { "epoch": 3.2384291725105188, "grad_norm": 0.2573435306549072, "learning_rate": 1.4022974352333241e-05, "loss": 0.0854, "step": 4618 }, { "epoch": 3.239130434782609, "grad_norm": 0.23773525655269623, "learning_rate": 1.4012994686855376e-05, "loss": 0.0883, "step": 4619 }, { "epoch": 3.2398316970546985, "grad_norm": 0.2873857021331787, "learning_rate": 1.4003017190780559e-05, "loss": 0.2579, "step": 4620 }, { "epoch": 3.240532959326788, "grad_norm": 0.31477436423301697, "learning_rate": 1.399304186607885e-05, "loss": 0.2572, "step": 4621 }, { "epoch": 3.241234221598878, "grad_norm": 0.25218233466148376, "learning_rate": 1.398306871471991e-05, "loss": 0.0878, "step": 4622 }, { "epoch": 3.241935483870968, "grad_norm": 0.23714959621429443, "learning_rate": 1.397309773867294e-05, "loss": 0.0882, "step": 4623 }, { "epoch": 3.2426367461430576, "grad_norm": 0.23669707775115967, "learning_rate": 1.3963128939906744e-05, "loss": 0.0876, "step": 4624 }, { "epoch": 3.2433380084151473, "grad_norm": 0.28735727071762085, "learning_rate": 1.3953162320389662e-05, "loss": 0.2561, "step": 4625 }, { "epoch": 3.244039270687237, "grad_norm": 0.2527099847793579, "learning_rate": 1.3943197882089638e-05, "loss": 0.0878, "step": 4626 }, { "epoch": 3.2447405329593266, "grad_norm": 1.0475152730941772, "learning_rate": 1.3933235626974153e-05, "loss": 0.2528, "step": 4627 }, { "epoch": 3.2454417952314167, "grad_norm": 0.2851855456829071, "learning_rate": 1.3923275557010277e-05, "loss": 0.2564, "step": 4628 }, { "epoch": 3.2461430575035064, "grad_norm": 0.2539239227771759, "learning_rate": 1.391331767416463e-05, "loss": 0.0887, "step": 4629 }, { "epoch": 3.246844319775596, "grad_norm": 0.23668056726455688, "learning_rate": 1.3903361980403434e-05, "loss": 0.0875, "step": 4630 }, { "epoch": 3.2475455820476857, "grad_norm": 0.28795573115348816, "learning_rate": 1.3893408477692438e-05, "loss": 0.2568, "step": 4631 }, { "epoch": 3.2482468443197754, "grad_norm": 0.25227847695350647, "learning_rate": 1.3883457167996999e-05, "loss": 0.087, "step": 4632 }, { "epoch": 3.2489481065918655, "grad_norm": 0.29798921942710876, "learning_rate": 1.3873508053281996e-05, "loss": 0.2542, "step": 4633 }, { "epoch": 3.249649368863955, "grad_norm": 0.40605443716049194, "learning_rate": 1.3863561135511938e-05, "loss": 0.2646, "step": 4634 }, { "epoch": 3.250350631136045, "grad_norm": 0.2538469135761261, "learning_rate": 1.385361641665081e-05, "loss": 0.0856, "step": 4635 }, { "epoch": 3.2510518934081345, "grad_norm": 0.23711992800235748, "learning_rate": 1.3843673898662252e-05, "loss": 0.0878, "step": 4636 }, { "epoch": 3.251753155680224, "grad_norm": 0.25236499309539795, "learning_rate": 1.3833733583509406e-05, "loss": 0.0881, "step": 4637 }, { "epoch": 3.2524544179523143, "grad_norm": 0.2809286415576935, "learning_rate": 1.3823795473155026e-05, "loss": 0.0893, "step": 4638 }, { "epoch": 3.253155680224404, "grad_norm": 0.28217613697052, "learning_rate": 1.3813859569561387e-05, "loss": 0.2551, "step": 4639 }, { "epoch": 3.2538569424964936, "grad_norm": 0.2892591953277588, "learning_rate": 1.3803925874690377e-05, "loss": 0.261, "step": 4640 }, { "epoch": 3.2545582047685833, "grad_norm": 0.9750303626060486, "learning_rate": 1.3793994390503404e-05, "loss": 0.2466, "step": 4641 }, { "epoch": 3.2552594670406734, "grad_norm": 0.2878339886665344, "learning_rate": 1.378406511896146e-05, "loss": 0.0899, "step": 4642 }, { "epoch": 3.255960729312763, "grad_norm": 0.2770034074783325, "learning_rate": 1.3774138062025083e-05, "loss": 0.2536, "step": 4643 }, { "epoch": 3.2566619915848527, "grad_norm": 0.2516760528087616, "learning_rate": 1.3764213221654416e-05, "loss": 0.0878, "step": 4644 }, { "epoch": 3.2573632538569424, "grad_norm": 0.2895430028438568, "learning_rate": 1.3754290599809105e-05, "loss": 0.2561, "step": 4645 }, { "epoch": 3.258064516129032, "grad_norm": 0.25753262639045715, "learning_rate": 1.3744370198448415e-05, "loss": 0.0894, "step": 4646 }, { "epoch": 3.258765778401122, "grad_norm": 0.25230520963668823, "learning_rate": 1.3734452019531124e-05, "loss": 0.088, "step": 4647 }, { "epoch": 3.259467040673212, "grad_norm": 0.25682926177978516, "learning_rate": 1.3724536065015626e-05, "loss": 0.088, "step": 4648 }, { "epoch": 3.2601683029453015, "grad_norm": 0.2833740711212158, "learning_rate": 1.37146223368598e-05, "loss": 0.255, "step": 4649 }, { "epoch": 3.260869565217391, "grad_norm": 0.28588923811912537, "learning_rate": 1.3704710837021159e-05, "loss": 0.2574, "step": 4650 }, { "epoch": 3.2615708274894812, "grad_norm": 0.23460465669631958, "learning_rate": 1.3694801567456727e-05, "loss": 0.0866, "step": 4651 }, { "epoch": 3.262272089761571, "grad_norm": 0.8197197318077087, "learning_rate": 1.368489453012313e-05, "loss": 0.4318, "step": 4652 }, { "epoch": 3.2629733520336606, "grad_norm": 0.2852354943752289, "learning_rate": 1.3674989726976503e-05, "loss": 0.0902, "step": 4653 }, { "epoch": 3.2636746143057502, "grad_norm": 0.8151727318763733, "learning_rate": 1.3665087159972589e-05, "loss": 0.435, "step": 4654 }, { "epoch": 3.26437587657784, "grad_norm": 0.2812046408653259, "learning_rate": 1.3655186831066657e-05, "loss": 0.2568, "step": 4655 }, { "epoch": 3.26507713884993, "grad_norm": 0.2577629089355469, "learning_rate": 1.3645288742213533e-05, "loss": 0.086, "step": 4656 }, { "epoch": 3.2657784011220197, "grad_norm": 0.7821254134178162, "learning_rate": 1.3635392895367632e-05, "loss": 0.4235, "step": 4657 }, { "epoch": 3.2664796633941093, "grad_norm": 0.28111907839775085, "learning_rate": 1.3625499292482898e-05, "loss": 0.2542, "step": 4658 }, { "epoch": 3.267180925666199, "grad_norm": 0.236892968416214, "learning_rate": 1.3615607935512828e-05, "loss": 0.0875, "step": 4659 }, { "epoch": 3.267882187938289, "grad_norm": 0.25966402888298035, "learning_rate": 1.3605718826410507e-05, "loss": 0.0889, "step": 4660 }, { "epoch": 3.2685834502103788, "grad_norm": 0.293364554643631, "learning_rate": 1.3595831967128535e-05, "loss": 0.2599, "step": 4661 }, { "epoch": 3.2692847124824684, "grad_norm": 0.2369164228439331, "learning_rate": 1.3585947359619117e-05, "loss": 0.0879, "step": 4662 }, { "epoch": 3.269985974754558, "grad_norm": 0.2518913149833679, "learning_rate": 1.3576065005833966e-05, "loss": 0.0881, "step": 4663 }, { "epoch": 3.2706872370266478, "grad_norm": 0.2379426509141922, "learning_rate": 1.3566184907724366e-05, "loss": 0.0882, "step": 4664 }, { "epoch": 3.271388499298738, "grad_norm": 0.259644478559494, "learning_rate": 1.3556307067241175e-05, "loss": 0.0888, "step": 4665 }, { "epoch": 3.2720897615708275, "grad_norm": 0.28311964869499207, "learning_rate": 1.354643148633477e-05, "loss": 0.2556, "step": 4666 }, { "epoch": 3.272791023842917, "grad_norm": 0.23715434968471527, "learning_rate": 1.3536558166955127e-05, "loss": 0.0881, "step": 4667 }, { "epoch": 3.273492286115007, "grad_norm": 1.018147349357605, "learning_rate": 1.3526687111051733e-05, "loss": 0.2453, "step": 4668 }, { "epoch": 3.274193548387097, "grad_norm": 0.2826903164386749, "learning_rate": 1.351681832057365e-05, "loss": 0.2568, "step": 4669 }, { "epoch": 3.2748948106591866, "grad_norm": 0.28056925535202026, "learning_rate": 1.3506951797469476e-05, "loss": 0.2582, "step": 4670 }, { "epoch": 3.2755960729312763, "grad_norm": 0.2577386200428009, "learning_rate": 1.3497087543687392e-05, "loss": 0.0898, "step": 4671 }, { "epoch": 3.276297335203366, "grad_norm": 0.23889605700969696, "learning_rate": 1.3487225561175091e-05, "loss": 0.0882, "step": 4672 }, { "epoch": 3.2769985974754556, "grad_norm": 0.238262340426445, "learning_rate": 1.3477365851879858e-05, "loss": 0.0883, "step": 4673 }, { "epoch": 3.2776998597475457, "grad_norm": 0.2881773114204407, "learning_rate": 1.3467508417748492e-05, "loss": 0.2557, "step": 4674 }, { "epoch": 3.2784011220196354, "grad_norm": 0.28451791405677795, "learning_rate": 1.3457653260727393e-05, "loss": 0.2577, "step": 4675 }, { "epoch": 3.279102384291725, "grad_norm": 0.23990227282047272, "learning_rate": 1.3447800382762432e-05, "loss": 0.0887, "step": 4676 }, { "epoch": 3.2798036465638147, "grad_norm": 0.29260802268981934, "learning_rate": 1.3437949785799109e-05, "loss": 0.092, "step": 4677 }, { "epoch": 3.280504908835905, "grad_norm": 0.2901014983654022, "learning_rate": 1.3428101471782422e-05, "loss": 0.2574, "step": 4678 }, { "epoch": 3.2812061711079945, "grad_norm": 0.3096025884151459, "learning_rate": 1.341825544265696e-05, "loss": 0.2613, "step": 4679 }, { "epoch": 3.281907433380084, "grad_norm": 0.28266212344169617, "learning_rate": 1.3408411700366813e-05, "loss": 0.254, "step": 4680 }, { "epoch": 3.282608695652174, "grad_norm": 0.2553822696208954, "learning_rate": 1.3398570246855669e-05, "loss": 0.0891, "step": 4681 }, { "epoch": 3.2833099579242635, "grad_norm": 0.2838016748428345, "learning_rate": 1.338873108406673e-05, "loss": 0.2566, "step": 4682 }, { "epoch": 3.2840112201963536, "grad_norm": 0.2926941514015198, "learning_rate": 1.337889421394275e-05, "loss": 0.2558, "step": 4683 }, { "epoch": 3.2847124824684433, "grad_norm": 0.2600253224372864, "learning_rate": 1.3369059638426035e-05, "loss": 0.0905, "step": 4684 }, { "epoch": 3.285413744740533, "grad_norm": 0.23934175074100494, "learning_rate": 1.3359227359458448e-05, "loss": 0.0891, "step": 4685 }, { "epoch": 3.2861150070126226, "grad_norm": 0.23824624717235565, "learning_rate": 1.334939737898138e-05, "loss": 0.0882, "step": 4686 }, { "epoch": 3.2868162692847127, "grad_norm": 0.23723676800727844, "learning_rate": 1.333956969893579e-05, "loss": 0.0875, "step": 4687 }, { "epoch": 3.2875175315568024, "grad_norm": 0.2867675721645355, "learning_rate": 1.3329744321262152e-05, "loss": 0.0915, "step": 4688 }, { "epoch": 3.288218793828892, "grad_norm": 0.2571101486682892, "learning_rate": 1.3319921247900535e-05, "loss": 0.0897, "step": 4689 }, { "epoch": 3.2889200561009817, "grad_norm": 0.2799453139305115, "learning_rate": 1.3310100480790483e-05, "loss": 0.0862, "step": 4690 }, { "epoch": 3.2896213183730714, "grad_norm": 0.25608938932418823, "learning_rate": 1.3300282021871149e-05, "loss": 0.0894, "step": 4691 }, { "epoch": 3.2903225806451615, "grad_norm": 0.32125934958457947, "learning_rate": 1.3290465873081188e-05, "loss": 0.0904, "step": 4692 }, { "epoch": 3.291023842917251, "grad_norm": 0.2540777027606964, "learning_rate": 1.3280652036358837e-05, "loss": 0.0859, "step": 4693 }, { "epoch": 3.291725105189341, "grad_norm": 0.2912999987602234, "learning_rate": 1.327084051364183e-05, "loss": 0.2567, "step": 4694 }, { "epoch": 3.2924263674614305, "grad_norm": 0.24808450043201447, "learning_rate": 1.326103130686749e-05, "loss": 0.0872, "step": 4695 }, { "epoch": 3.2931276297335206, "grad_norm": 0.314471960067749, "learning_rate": 1.3251224417972655e-05, "loss": 0.2585, "step": 4696 }, { "epoch": 3.2938288920056102, "grad_norm": 0.7714337110519409, "learning_rate": 1.324141984889371e-05, "loss": 0.425, "step": 4697 }, { "epoch": 3.2945301542777, "grad_norm": 0.23535270988941193, "learning_rate": 1.3231617601566574e-05, "loss": 0.0874, "step": 4698 }, { "epoch": 3.2952314165497896, "grad_norm": 0.2808654308319092, "learning_rate": 1.3221817677926735e-05, "loss": 0.2549, "step": 4699 }, { "epoch": 3.2959326788218792, "grad_norm": 0.23604312539100647, "learning_rate": 1.3212020079909191e-05, "loss": 0.0873, "step": 4700 }, { "epoch": 3.296633941093969, "grad_norm": 0.23660075664520264, "learning_rate": 1.320222480944851e-05, "loss": 0.0872, "step": 4701 }, { "epoch": 3.297335203366059, "grad_norm": 0.2815960645675659, "learning_rate": 1.3192431868478767e-05, "loss": 0.2532, "step": 4702 }, { "epoch": 3.2980364656381487, "grad_norm": 0.23646995425224304, "learning_rate": 1.3182641258933608e-05, "loss": 0.0874, "step": 4703 }, { "epoch": 3.2987377279102383, "grad_norm": 0.253982812166214, "learning_rate": 1.3172852982746207e-05, "loss": 0.0867, "step": 4704 }, { "epoch": 3.299438990182328, "grad_norm": 0.2350856065750122, "learning_rate": 1.316306704184927e-05, "loss": 0.0867, "step": 4705 }, { "epoch": 3.300140252454418, "grad_norm": 0.3602187931537628, "learning_rate": 1.3153283438175034e-05, "loss": 0.2654, "step": 4706 }, { "epoch": 3.3008415147265078, "grad_norm": 0.2729870080947876, "learning_rate": 1.3143502173655318e-05, "loss": 0.0835, "step": 4707 }, { "epoch": 3.3015427769985974, "grad_norm": 1.1710419654846191, "learning_rate": 1.3133723250221419e-05, "loss": 0.2649, "step": 4708 }, { "epoch": 3.302244039270687, "grad_norm": 0.2357354760169983, "learning_rate": 1.3123946669804233e-05, "loss": 0.087, "step": 4709 }, { "epoch": 3.3029453015427768, "grad_norm": 0.2868809103965759, "learning_rate": 1.3114172434334144e-05, "loss": 0.2567, "step": 4710 }, { "epoch": 3.303646563814867, "grad_norm": 0.31517326831817627, "learning_rate": 1.310440054574108e-05, "loss": 0.2543, "step": 4711 }, { "epoch": 3.3043478260869565, "grad_norm": 0.2358502894639969, "learning_rate": 1.3094631005954547e-05, "loss": 0.0864, "step": 4712 }, { "epoch": 3.305049088359046, "grad_norm": 0.24903500080108643, "learning_rate": 1.308486381690353e-05, "loss": 0.0865, "step": 4713 }, { "epoch": 3.305750350631136, "grad_norm": 1.2860177755355835, "learning_rate": 1.3075098980516597e-05, "loss": 0.5962, "step": 4714 }, { "epoch": 3.306451612903226, "grad_norm": 0.23535338044166565, "learning_rate": 1.306533649872182e-05, "loss": 0.0866, "step": 4715 }, { "epoch": 3.3071528751753156, "grad_norm": 0.2744186222553253, "learning_rate": 1.3055576373446827e-05, "loss": 0.0871, "step": 4716 }, { "epoch": 3.3078541374474053, "grad_norm": 0.23608872294425964, "learning_rate": 1.3045818606618765e-05, "loss": 0.087, "step": 4717 }, { "epoch": 3.308555399719495, "grad_norm": 0.28914931416511536, "learning_rate": 1.3036063200164325e-05, "loss": 0.2558, "step": 4718 }, { "epoch": 3.3092566619915846, "grad_norm": 0.24729374051094055, "learning_rate": 1.3026310156009718e-05, "loss": 0.0858, "step": 4719 }, { "epoch": 3.3099579242636747, "grad_norm": 0.30024152994155884, "learning_rate": 1.3016559476080714e-05, "loss": 0.2553, "step": 4720 }, { "epoch": 3.3106591865357644, "grad_norm": 0.30151185393333435, "learning_rate": 1.300681116230259e-05, "loss": 0.2526, "step": 4721 }, { "epoch": 3.311360448807854, "grad_norm": 0.2357397824525833, "learning_rate": 1.299706521660018e-05, "loss": 0.0867, "step": 4722 }, { "epoch": 3.3120617110799437, "grad_norm": 0.2850799262523651, "learning_rate": 1.2987321640897821e-05, "loss": 0.2551, "step": 4723 }, { "epoch": 3.312762973352034, "grad_norm": 0.24684706330299377, "learning_rate": 1.2977580437119424e-05, "loss": 0.0849, "step": 4724 }, { "epoch": 3.3134642356241235, "grad_norm": 0.23583219945430756, "learning_rate": 1.2967841607188374e-05, "loss": 0.0868, "step": 4725 }, { "epoch": 3.314165497896213, "grad_norm": 0.24998371303081512, "learning_rate": 1.2958105153027645e-05, "loss": 0.0849, "step": 4726 }, { "epoch": 3.314866760168303, "grad_norm": 0.2910756766796112, "learning_rate": 1.2948371076559702e-05, "loss": 0.2564, "step": 4727 }, { "epoch": 3.3155680224403925, "grad_norm": 0.2500705420970917, "learning_rate": 1.2938639379706566e-05, "loss": 0.0865, "step": 4728 }, { "epoch": 3.3162692847124826, "grad_norm": 0.2949768006801605, "learning_rate": 1.2928910064389762e-05, "loss": 0.2548, "step": 4729 }, { "epoch": 3.3169705469845723, "grad_norm": 0.2352568507194519, "learning_rate": 1.2919183132530382e-05, "loss": 0.0867, "step": 4730 }, { "epoch": 3.317671809256662, "grad_norm": 0.7864445447921753, "learning_rate": 1.2909458586049012e-05, "loss": 0.4262, "step": 4731 }, { "epoch": 3.3183730715287516, "grad_norm": 0.2507055401802063, "learning_rate": 1.2899736426865782e-05, "loss": 0.0855, "step": 4732 }, { "epoch": 3.3190743338008417, "grad_norm": 0.7985792756080627, "learning_rate": 1.2890016656900338e-05, "loss": 0.4261, "step": 4733 }, { "epoch": 3.3197755960729314, "grad_norm": 0.2507522702217102, "learning_rate": 1.2880299278071889e-05, "loss": 0.0858, "step": 4734 }, { "epoch": 3.320476858345021, "grad_norm": 0.2902856469154358, "learning_rate": 1.2870584292299127e-05, "loss": 0.2555, "step": 4735 }, { "epoch": 3.3211781206171107, "grad_norm": 0.23603853583335876, "learning_rate": 1.2860871701500305e-05, "loss": 0.087, "step": 4736 }, { "epoch": 3.3218793828892004, "grad_norm": 0.2363576591014862, "learning_rate": 1.2851161507593179e-05, "loss": 0.0875, "step": 4737 }, { "epoch": 3.3225806451612905, "grad_norm": 0.24903596937656403, "learning_rate": 1.2841453712495073e-05, "loss": 0.0856, "step": 4738 }, { "epoch": 3.32328190743338, "grad_norm": 0.33716511726379395, "learning_rate": 1.283174831812276e-05, "loss": 0.2594, "step": 4739 }, { "epoch": 3.32398316970547, "grad_norm": 0.2362283170223236, "learning_rate": 1.2822045326392627e-05, "loss": 0.0867, "step": 4740 }, { "epoch": 3.3246844319775595, "grad_norm": 0.24779778718948364, "learning_rate": 1.2812344739220522e-05, "loss": 0.086, "step": 4741 }, { "epoch": 3.3253856942496496, "grad_norm": 0.25050488114356995, "learning_rate": 1.2802646558521858e-05, "loss": 0.0866, "step": 4742 }, { "epoch": 3.3260869565217392, "grad_norm": 0.23741304874420166, "learning_rate": 1.2792950786211547e-05, "loss": 0.0871, "step": 4743 }, { "epoch": 3.326788218793829, "grad_norm": 0.34573256969451904, "learning_rate": 1.2783257424204048e-05, "loss": 0.2619, "step": 4744 }, { "epoch": 3.3274894810659186, "grad_norm": 0.2471298724412918, "learning_rate": 1.2773566474413326e-05, "loss": 0.0854, "step": 4745 }, { "epoch": 3.3281907433380082, "grad_norm": 0.2927061915397644, "learning_rate": 1.2763877938752872e-05, "loss": 0.2577, "step": 4746 }, { "epoch": 3.3288920056100983, "grad_norm": 0.2490745633840561, "learning_rate": 1.27541918191357e-05, "loss": 0.0848, "step": 4747 }, { "epoch": 3.329593267882188, "grad_norm": 0.581524133682251, "learning_rate": 1.2744508117474362e-05, "loss": 0.2722, "step": 4748 }, { "epoch": 3.3302945301542777, "grad_norm": 0.2365335077047348, "learning_rate": 1.2734826835680913e-05, "loss": 0.0867, "step": 4749 }, { "epoch": 3.3309957924263673, "grad_norm": 0.26607903838157654, "learning_rate": 1.2725147975666948e-05, "loss": 0.0836, "step": 4750 }, { "epoch": 3.3316970546984574, "grad_norm": 0.2889515161514282, "learning_rate": 1.271547153934356e-05, "loss": 0.2565, "step": 4751 }, { "epoch": 3.332398316970547, "grad_norm": 0.24603591859340668, "learning_rate": 1.2705797528621399e-05, "loss": 0.0832, "step": 4752 }, { "epoch": 3.333099579242637, "grad_norm": 0.2353249341249466, "learning_rate": 1.2696125945410598e-05, "loss": 0.0865, "step": 4753 }, { "epoch": 3.3338008415147264, "grad_norm": 0.23700633645057678, "learning_rate": 1.2686456791620837e-05, "loss": 0.0866, "step": 4754 }, { "epoch": 3.334502103786816, "grad_norm": 0.2989386320114136, "learning_rate": 1.2676790069161293e-05, "loss": 0.2595, "step": 4755 }, { "epoch": 3.335203366058906, "grad_norm": 0.301077276468277, "learning_rate": 1.2667125779940692e-05, "loss": 0.2579, "step": 4756 }, { "epoch": 3.335904628330996, "grad_norm": 0.2485160082578659, "learning_rate": 1.2657463925867251e-05, "loss": 0.0855, "step": 4757 }, { "epoch": 3.3366058906030855, "grad_norm": 0.3195628523826599, "learning_rate": 1.264780450884874e-05, "loss": 0.254, "step": 4758 }, { "epoch": 3.337307152875175, "grad_norm": 0.2348202019929886, "learning_rate": 1.2638147530792412e-05, "loss": 0.0866, "step": 4759 }, { "epoch": 3.3380084151472653, "grad_norm": 0.23677818477153778, "learning_rate": 1.262849299360505e-05, "loss": 0.0862, "step": 4760 }, { "epoch": 3.338709677419355, "grad_norm": 0.23447056114673615, "learning_rate": 1.2618840899192975e-05, "loss": 0.0861, "step": 4761 }, { "epoch": 3.3394109396914446, "grad_norm": 0.23534788191318512, "learning_rate": 1.2609191249461987e-05, "loss": 0.0851, "step": 4762 }, { "epoch": 3.3401122019635343, "grad_norm": 0.23482264578342438, "learning_rate": 1.2599544046317451e-05, "loss": 0.0864, "step": 4763 }, { "epoch": 3.340813464235624, "grad_norm": 0.24752086400985718, "learning_rate": 1.2589899291664206e-05, "loss": 0.0851, "step": 4764 }, { "epoch": 3.3415147265077136, "grad_norm": 0.26477643847465515, "learning_rate": 1.258025698740664e-05, "loss": 0.0824, "step": 4765 }, { "epoch": 3.3422159887798037, "grad_norm": 0.26801374554634094, "learning_rate": 1.2570617135448631e-05, "loss": 0.082, "step": 4766 }, { "epoch": 3.3429172510518934, "grad_norm": 0.30050796270370483, "learning_rate": 1.256097973769359e-05, "loss": 0.2537, "step": 4767 }, { "epoch": 3.343618513323983, "grad_norm": 0.23330329358577728, "learning_rate": 1.2551344796044425e-05, "loss": 0.0851, "step": 4768 }, { "epoch": 3.3443197755960727, "grad_norm": 0.24757592380046844, "learning_rate": 1.2541712312403591e-05, "loss": 0.0835, "step": 4769 }, { "epoch": 3.345021037868163, "grad_norm": 1.9150376319885254, "learning_rate": 1.253208228867302e-05, "loss": 0.6247, "step": 4770 }, { "epoch": 3.3457223001402525, "grad_norm": 0.2605372965335846, "learning_rate": 1.2522454726754196e-05, "loss": 0.0806, "step": 4771 }, { "epoch": 3.346423562412342, "grad_norm": 0.3673856556415558, "learning_rate": 1.2512829628548079e-05, "loss": 0.2669, "step": 4772 }, { "epoch": 3.347124824684432, "grad_norm": 0.23437075316905975, "learning_rate": 1.2503206995955186e-05, "loss": 0.0855, "step": 4773 }, { "epoch": 3.3478260869565215, "grad_norm": 2.023259401321411, "learning_rate": 1.2493586830875489e-05, "loss": 0.6337, "step": 4774 }, { "epoch": 3.3485273492286116, "grad_norm": 0.23374557495117188, "learning_rate": 1.2483969135208532e-05, "loss": 0.0849, "step": 4775 }, { "epoch": 3.3492286115007013, "grad_norm": 0.24663391709327698, "learning_rate": 1.2474353910853326e-05, "loss": 0.0833, "step": 4776 }, { "epoch": 3.349929873772791, "grad_norm": 0.2432050108909607, "learning_rate": 1.2464741159708435e-05, "loss": 0.0836, "step": 4777 }, { "epoch": 3.3506311360448806, "grad_norm": 0.23780454695224762, "learning_rate": 1.2455130883671892e-05, "loss": 0.0805, "step": 4778 }, { "epoch": 3.3513323983169707, "grad_norm": 0.30142471194267273, "learning_rate": 1.2445523084641283e-05, "loss": 0.2585, "step": 4779 }, { "epoch": 3.3520336605890604, "grad_norm": 0.2333812564611435, "learning_rate": 1.2435917764513677e-05, "loss": 0.0849, "step": 4780 }, { "epoch": 3.35273492286115, "grad_norm": 0.24740448594093323, "learning_rate": 1.2426314925185653e-05, "loss": 0.0842, "step": 4781 }, { "epoch": 3.3534361851332397, "grad_norm": 0.24477696418762207, "learning_rate": 1.241671456855331e-05, "loss": 0.084, "step": 4782 }, { "epoch": 3.3541374474053294, "grad_norm": 0.2624581456184387, "learning_rate": 1.240711669651226e-05, "loss": 0.0818, "step": 4783 }, { "epoch": 3.3548387096774195, "grad_norm": 0.26243746280670166, "learning_rate": 1.2397521310957613e-05, "loss": 0.082, "step": 4784 }, { "epoch": 3.355539971949509, "grad_norm": 0.24321246147155762, "learning_rate": 1.2387928413784008e-05, "loss": 0.0831, "step": 4785 }, { "epoch": 3.356241234221599, "grad_norm": 0.24432916939258575, "learning_rate": 1.237833800688556e-05, "loss": 0.0833, "step": 4786 }, { "epoch": 3.3569424964936885, "grad_norm": 3.627211570739746, "learning_rate": 1.236875009215594e-05, "loss": 0.6035, "step": 4787 }, { "epoch": 3.3576437587657786, "grad_norm": 0.3615441918373108, "learning_rate": 1.235916467148826e-05, "loss": 0.2652, "step": 4788 }, { "epoch": 3.3583450210378682, "grad_norm": 0.23207087814807892, "learning_rate": 1.2349581746775208e-05, "loss": 0.0843, "step": 4789 }, { "epoch": 3.359046283309958, "grad_norm": 0.26336073875427246, "learning_rate": 1.2340001319908925e-05, "loss": 0.0807, "step": 4790 }, { "epoch": 3.3597475455820476, "grad_norm": 0.23233480751514435, "learning_rate": 1.2330423392781107e-05, "loss": 0.0853, "step": 4791 }, { "epoch": 3.3604488078541372, "grad_norm": 0.30185264348983765, "learning_rate": 1.2320847967282914e-05, "loss": 0.2584, "step": 4792 }, { "epoch": 3.3611500701262274, "grad_norm": 0.24889537692070007, "learning_rate": 1.2311275045305043e-05, "loss": 0.0844, "step": 4793 }, { "epoch": 3.361851332398317, "grad_norm": 0.2335103154182434, "learning_rate": 1.2301704628737678e-05, "loss": 0.0855, "step": 4794 }, { "epoch": 3.3625525946704067, "grad_norm": 0.24169589579105377, "learning_rate": 1.2292136719470518e-05, "loss": 0.0828, "step": 4795 }, { "epoch": 3.3632538569424963, "grad_norm": 0.24135102331638336, "learning_rate": 1.2282571319392747e-05, "loss": 0.0826, "step": 4796 }, { "epoch": 3.3639551192145865, "grad_norm": 4.205555438995361, "learning_rate": 1.227300843039309e-05, "loss": 0.7713, "step": 4797 }, { "epoch": 3.364656381486676, "grad_norm": 0.2332838624715805, "learning_rate": 1.2263448054359739e-05, "loss": 0.0851, "step": 4798 }, { "epoch": 3.365357643758766, "grad_norm": 0.2591381371021271, "learning_rate": 1.2253890193180425e-05, "loss": 0.0806, "step": 4799 }, { "epoch": 3.3660589060308554, "grad_norm": 0.3074290454387665, "learning_rate": 1.2244334848742357e-05, "loss": 0.2607, "step": 4800 }, { "epoch": 3.366760168302945, "grad_norm": 0.31125178933143616, "learning_rate": 1.2234782022932237e-05, "loss": 0.2622, "step": 4801 }, { "epoch": 3.367461430575035, "grad_norm": 0.3000907301902771, "learning_rate": 1.2225231717636316e-05, "loss": 0.2564, "step": 4802 }, { "epoch": 3.368162692847125, "grad_norm": 1.2687405347824097, "learning_rate": 1.22156839347403e-05, "loss": 0.2786, "step": 4803 }, { "epoch": 3.3688639551192145, "grad_norm": 0.23214536905288696, "learning_rate": 1.2206138676129409e-05, "loss": 0.0848, "step": 4804 }, { "epoch": 3.369565217391304, "grad_norm": 0.30510666966438293, "learning_rate": 1.2196595943688391e-05, "loss": 0.2577, "step": 4805 }, { "epoch": 3.3702664796633943, "grad_norm": 0.2477756291627884, "learning_rate": 1.2187055739301453e-05, "loss": 0.0842, "step": 4806 }, { "epoch": 3.370967741935484, "grad_norm": 0.3027876317501068, "learning_rate": 1.217751806485235e-05, "loss": 0.2557, "step": 4807 }, { "epoch": 3.3716690042075736, "grad_norm": 0.2330096811056137, "learning_rate": 1.2167982922224294e-05, "loss": 0.0855, "step": 4808 }, { "epoch": 3.3723702664796633, "grad_norm": 0.24615877866744995, "learning_rate": 1.215845031330001e-05, "loss": 0.0843, "step": 4809 }, { "epoch": 3.373071528751753, "grad_norm": 0.30019938945770264, "learning_rate": 1.214892023996175e-05, "loss": 0.2557, "step": 4810 }, { "epoch": 3.373772791023843, "grad_norm": 0.2338753640651703, "learning_rate": 1.2139392704091219e-05, "loss": 0.0855, "step": 4811 }, { "epoch": 3.3744740532959328, "grad_norm": 0.23308143019676208, "learning_rate": 1.2129867707569667e-05, "loss": 0.0857, "step": 4812 }, { "epoch": 3.3751753155680224, "grad_norm": 0.27080363035202026, "learning_rate": 1.2120345252277809e-05, "loss": 0.0836, "step": 4813 }, { "epoch": 3.375876577840112, "grad_norm": 0.24369865655899048, "learning_rate": 1.2110825340095874e-05, "loss": 0.084, "step": 4814 }, { "epoch": 3.376577840112202, "grad_norm": 0.2332410216331482, "learning_rate": 1.2101307972903573e-05, "loss": 0.0853, "step": 4815 }, { "epoch": 3.377279102384292, "grad_norm": 0.4047980010509491, "learning_rate": 1.2091793152580147e-05, "loss": 0.2646, "step": 4816 }, { "epoch": 3.3779803646563815, "grad_norm": 0.2333509773015976, "learning_rate": 1.2082280881004291e-05, "loss": 0.0853, "step": 4817 }, { "epoch": 3.378681626928471, "grad_norm": 0.30939286947250366, "learning_rate": 1.2072771160054244e-05, "loss": 0.2556, "step": 4818 }, { "epoch": 3.379382889200561, "grad_norm": 0.27013513445854187, "learning_rate": 1.2063263991607692e-05, "loss": 0.0833, "step": 4819 }, { "epoch": 3.380084151472651, "grad_norm": 0.31396234035491943, "learning_rate": 1.2053759377541866e-05, "loss": 0.2575, "step": 4820 }, { "epoch": 3.3807854137447406, "grad_norm": 1.3759814500808716, "learning_rate": 1.2044257319733453e-05, "loss": 0.6129, "step": 4821 }, { "epoch": 3.3814866760168303, "grad_norm": 0.24772433936595917, "learning_rate": 1.2034757820058656e-05, "loss": 0.0823, "step": 4822 }, { "epoch": 3.38218793828892, "grad_norm": 0.2332610785961151, "learning_rate": 1.2025260880393156e-05, "loss": 0.0853, "step": 4823 }, { "epoch": 3.38288920056101, "grad_norm": 0.24227793514728546, "learning_rate": 1.2015766502612158e-05, "loss": 0.0829, "step": 4824 }, { "epoch": 3.3835904628330997, "grad_norm": 0.3096465766429901, "learning_rate": 1.2006274688590325e-05, "loss": 0.2619, "step": 4825 }, { "epoch": 3.3842917251051894, "grad_norm": 0.2335234433412552, "learning_rate": 1.199678544020185e-05, "loss": 0.0859, "step": 4826 }, { "epoch": 3.384992987377279, "grad_norm": 0.23272500932216644, "learning_rate": 1.1987298759320384e-05, "loss": 0.0848, "step": 4827 }, { "epoch": 3.3856942496493687, "grad_norm": 0.2948565185070038, "learning_rate": 1.1977814647819117e-05, "loss": 0.0814, "step": 4828 }, { "epoch": 3.3863955119214584, "grad_norm": 0.2336227148771286, "learning_rate": 1.1968333107570664e-05, "loss": 0.0856, "step": 4829 }, { "epoch": 3.3870967741935485, "grad_norm": 0.30281805992126465, "learning_rate": 1.1958854140447201e-05, "loss": 0.2578, "step": 4830 }, { "epoch": 3.387798036465638, "grad_norm": 0.299886554479599, "learning_rate": 1.1949377748320346e-05, "loss": 0.2568, "step": 4831 }, { "epoch": 3.388499298737728, "grad_norm": 1.6766594648361206, "learning_rate": 1.1939903933061248e-05, "loss": 0.475, "step": 4832 }, { "epoch": 3.3892005610098175, "grad_norm": 0.23344066739082336, "learning_rate": 1.1930432696540508e-05, "loss": 0.0854, "step": 4833 }, { "epoch": 3.3899018232819076, "grad_norm": 0.30893585085868835, "learning_rate": 1.192096404062826e-05, "loss": 0.2568, "step": 4834 }, { "epoch": 3.3906030855539973, "grad_norm": 1.3396894931793213, "learning_rate": 1.1911497967194096e-05, "loss": 0.2884, "step": 4835 }, { "epoch": 3.391304347826087, "grad_norm": 0.24594071507453918, "learning_rate": 1.1902034478107107e-05, "loss": 0.0842, "step": 4836 }, { "epoch": 3.3920056100981766, "grad_norm": 0.3055954575538635, "learning_rate": 1.1892573575235866e-05, "loss": 0.2596, "step": 4837 }, { "epoch": 3.3927068723702662, "grad_norm": 0.24601487815380096, "learning_rate": 1.1883115260448466e-05, "loss": 0.0849, "step": 4838 }, { "epoch": 3.3934081346423564, "grad_norm": 0.29574811458587646, "learning_rate": 1.1873659535612444e-05, "loss": 0.2579, "step": 4839 }, { "epoch": 3.394109396914446, "grad_norm": 0.2470911145210266, "learning_rate": 1.1864206402594869e-05, "loss": 0.0847, "step": 4840 }, { "epoch": 3.3948106591865357, "grad_norm": 0.23420566320419312, "learning_rate": 1.1854755863262262e-05, "loss": 0.0856, "step": 4841 }, { "epoch": 3.3955119214586253, "grad_norm": 0.3072197437286377, "learning_rate": 1.1845307919480677e-05, "loss": 0.2556, "step": 4842 }, { "epoch": 3.3962131837307155, "grad_norm": 0.2984790802001953, "learning_rate": 1.1835862573115586e-05, "loss": 0.2585, "step": 4843 }, { "epoch": 3.396914446002805, "grad_norm": 0.23362688720226288, "learning_rate": 1.1826419826032017e-05, "loss": 0.0855, "step": 4844 }, { "epoch": 3.397615708274895, "grad_norm": 0.24961163103580475, "learning_rate": 1.1816979680094442e-05, "loss": 0.0851, "step": 4845 }, { "epoch": 3.3983169705469845, "grad_norm": 0.24822835624217987, "learning_rate": 1.180754213716685e-05, "loss": 0.0855, "step": 4846 }, { "epoch": 3.399018232819074, "grad_norm": 0.2974912226200104, "learning_rate": 1.179810719911268e-05, "loss": 0.2588, "step": 4847 }, { "epoch": 3.3997194950911642, "grad_norm": 1.3341739177703857, "learning_rate": 1.1788674867794894e-05, "loss": 0.4312, "step": 4848 }, { "epoch": 3.400420757363254, "grad_norm": 0.23275312781333923, "learning_rate": 1.1779245145075917e-05, "loss": 0.0853, "step": 4849 }, { "epoch": 3.4011220196353436, "grad_norm": 0.2350710779428482, "learning_rate": 1.1769818032817656e-05, "loss": 0.0862, "step": 4850 }, { "epoch": 3.401823281907433, "grad_norm": 0.2351149171590805, "learning_rate": 1.1760393532881522e-05, "loss": 0.086, "step": 4851 }, { "epoch": 3.4025245441795233, "grad_norm": 0.2949346601963043, "learning_rate": 1.1750971647128398e-05, "loss": 0.2555, "step": 4852 }, { "epoch": 3.403225806451613, "grad_norm": 0.2503627836704254, "learning_rate": 1.1741552377418633e-05, "loss": 0.086, "step": 4853 }, { "epoch": 3.4039270687237027, "grad_norm": 0.23431795835494995, "learning_rate": 1.1732135725612104e-05, "loss": 0.0861, "step": 4854 }, { "epoch": 3.4046283309957923, "grad_norm": 0.29535433650016785, "learning_rate": 1.1722721693568122e-05, "loss": 0.2565, "step": 4855 }, { "epoch": 3.405329593267882, "grad_norm": 0.33181408047676086, "learning_rate": 1.1713310283145524e-05, "loss": 0.2625, "step": 4856 }, { "epoch": 3.406030855539972, "grad_norm": 1.4138994216918945, "learning_rate": 1.17039014962026e-05, "loss": 0.301, "step": 4857 }, { "epoch": 3.4067321178120618, "grad_norm": 0.23495489358901978, "learning_rate": 1.1694495334597122e-05, "loss": 0.0869, "step": 4858 }, { "epoch": 3.4074333800841514, "grad_norm": 0.2333025336265564, "learning_rate": 1.168509180018637e-05, "loss": 0.086, "step": 4859 }, { "epoch": 3.408134642356241, "grad_norm": 0.29394423961639404, "learning_rate": 1.1675690894827071e-05, "loss": 0.2562, "step": 4860 }, { "epoch": 3.408835904628331, "grad_norm": 0.2351870834827423, "learning_rate": 1.1666292620375469e-05, "loss": 0.0871, "step": 4861 }, { "epoch": 3.409537166900421, "grad_norm": 0.8075534105300903, "learning_rate": 1.1656896978687259e-05, "loss": 0.4339, "step": 4862 }, { "epoch": 3.4102384291725105, "grad_norm": 0.29458239674568176, "learning_rate": 1.1647503971617626e-05, "loss": 0.2602, "step": 4863 }, { "epoch": 3.4109396914446, "grad_norm": 0.27977094054222107, "learning_rate": 1.1638113601021225e-05, "loss": 0.0882, "step": 4864 }, { "epoch": 3.41164095371669, "grad_norm": 0.23539167642593384, "learning_rate": 1.1628725868752222e-05, "loss": 0.0869, "step": 4865 }, { "epoch": 3.41234221598878, "grad_norm": 0.9743853807449341, "learning_rate": 1.1619340776664223e-05, "loss": 0.2524, "step": 4866 }, { "epoch": 3.4130434782608696, "grad_norm": 0.255077987909317, "learning_rate": 1.1609958326610345e-05, "loss": 0.088, "step": 4867 }, { "epoch": 3.4137447405329593, "grad_norm": 0.2511155903339386, "learning_rate": 1.1600578520443154e-05, "loss": 0.0871, "step": 4868 }, { "epoch": 3.414446002805049, "grad_norm": 0.29414498805999756, "learning_rate": 1.1591201360014725e-05, "loss": 0.2582, "step": 4869 }, { "epoch": 3.415147265077139, "grad_norm": 0.2357393205165863, "learning_rate": 1.1581826847176586e-05, "loss": 0.0868, "step": 4870 }, { "epoch": 3.4158485273492287, "grad_norm": 0.2353390008211136, "learning_rate": 1.1572454983779751e-05, "loss": 0.0869, "step": 4871 }, { "epoch": 3.4165497896213184, "grad_norm": 0.25351977348327637, "learning_rate": 1.1563085771674703e-05, "loss": 0.0871, "step": 4872 }, { "epoch": 3.417251051893408, "grad_norm": 0.2568106949329376, "learning_rate": 1.1553719212711423e-05, "loss": 0.0886, "step": 4873 }, { "epoch": 3.4179523141654977, "grad_norm": 0.23656882345676422, "learning_rate": 1.1544355308739338e-05, "loss": 0.0873, "step": 4874 }, { "epoch": 3.418653576437588, "grad_norm": 0.2561438977718353, "learning_rate": 1.1534994061607387e-05, "loss": 0.0886, "step": 4875 }, { "epoch": 3.4193548387096775, "grad_norm": 0.29616519808769226, "learning_rate": 1.1525635473163942e-05, "loss": 0.2601, "step": 4876 }, { "epoch": 3.420056100981767, "grad_norm": 0.30481526255607605, "learning_rate": 1.1516279545256905e-05, "loss": 0.2611, "step": 4877 }, { "epoch": 3.420757363253857, "grad_norm": 1.2842882871627808, "learning_rate": 1.1506926279733582e-05, "loss": 0.5926, "step": 4878 }, { "epoch": 3.421458625525947, "grad_norm": 0.2852122485637665, "learning_rate": 1.1497575678440814e-05, "loss": 0.2567, "step": 4879 }, { "epoch": 3.4221598877980366, "grad_norm": 1.3011298179626465, "learning_rate": 1.148822774322488e-05, "loss": 0.5987, "step": 4880 }, { "epoch": 3.4228611500701263, "grad_norm": 0.2513676881790161, "learning_rate": 1.1478882475931562e-05, "loss": 0.088, "step": 4881 }, { "epoch": 3.423562412342216, "grad_norm": 0.2354552447795868, "learning_rate": 1.1469539878406083e-05, "loss": 0.087, "step": 4882 }, { "epoch": 3.4242636746143056, "grad_norm": 0.28561660647392273, "learning_rate": 1.146019995249317e-05, "loss": 0.2566, "step": 4883 }, { "epoch": 3.4249649368863957, "grad_norm": 0.25347909331321716, "learning_rate": 1.1450862700036998e-05, "loss": 0.088, "step": 4884 }, { "epoch": 3.4256661991584854, "grad_norm": 0.23429147899150848, "learning_rate": 1.1441528122881228e-05, "loss": 0.0866, "step": 4885 }, { "epoch": 3.426367461430575, "grad_norm": 0.9598780274391174, "learning_rate": 1.1432196222868974e-05, "loss": 0.2448, "step": 4886 }, { "epoch": 3.4270687237026647, "grad_norm": 0.29273632168769836, "learning_rate": 1.1422867001842857e-05, "loss": 0.2576, "step": 4887 }, { "epoch": 3.427769985974755, "grad_norm": 0.2566283345222473, "learning_rate": 1.1413540461644931e-05, "loss": 0.0884, "step": 4888 }, { "epoch": 3.4284712482468445, "grad_norm": 0.2367866039276123, "learning_rate": 1.1404216604116751e-05, "loss": 0.0872, "step": 4889 }, { "epoch": 3.429172510518934, "grad_norm": 0.2625068426132202, "learning_rate": 1.1394895431099314e-05, "loss": 0.0871, "step": 4890 }, { "epoch": 3.429873772791024, "grad_norm": 0.23467765748500824, "learning_rate": 1.1385576944433126e-05, "loss": 0.0871, "step": 4891 }, { "epoch": 3.4305750350631135, "grad_norm": 0.7880160808563232, "learning_rate": 1.1376261145958107e-05, "loss": 0.4276, "step": 4892 }, { "epoch": 3.431276297335203, "grad_norm": 0.7823438048362732, "learning_rate": 1.1366948037513698e-05, "loss": 0.4256, "step": 4893 }, { "epoch": 3.4319775596072932, "grad_norm": 0.28553035855293274, "learning_rate": 1.1357637620938773e-05, "loss": 0.2568, "step": 4894 }, { "epoch": 3.432678821879383, "grad_norm": 0.28912442922592163, "learning_rate": 1.134832989807171e-05, "loss": 0.2595, "step": 4895 }, { "epoch": 3.4333800841514726, "grad_norm": 0.27911797165870667, "learning_rate": 1.1339024870750314e-05, "loss": 0.0893, "step": 4896 }, { "epoch": 3.434081346423562, "grad_norm": 0.2804643511772156, "learning_rate": 1.1329722540811896e-05, "loss": 0.2569, "step": 4897 }, { "epoch": 3.4347826086956523, "grad_norm": 0.27808499336242676, "learning_rate": 1.1320422910093211e-05, "loss": 0.2556, "step": 4898 }, { "epoch": 3.435483870967742, "grad_norm": 1.2592123746871948, "learning_rate": 1.1311125980430478e-05, "loss": 0.4138, "step": 4899 }, { "epoch": 3.4361851332398317, "grad_norm": 0.28060585260391235, "learning_rate": 1.1301831753659405e-05, "loss": 0.2543, "step": 4900 }, { "epoch": 3.4368863955119213, "grad_norm": 0.23564676940441132, "learning_rate": 1.129254023161515e-05, "loss": 0.0877, "step": 4901 }, { "epoch": 3.437587657784011, "grad_norm": 0.2372344732284546, "learning_rate": 1.1283251416132327e-05, "loss": 0.0882, "step": 4902 }, { "epoch": 3.438288920056101, "grad_norm": 1.035268783569336, "learning_rate": 1.1273965309045048e-05, "loss": 0.2454, "step": 4903 }, { "epoch": 3.4389901823281908, "grad_norm": 0.2585485875606537, "learning_rate": 1.1264681912186852e-05, "loss": 0.0892, "step": 4904 }, { "epoch": 3.4396914446002804, "grad_norm": 0.2808792293071747, "learning_rate": 1.125540122739078e-05, "loss": 0.2572, "step": 4905 }, { "epoch": 3.44039270687237, "grad_norm": 0.3013174533843994, "learning_rate": 1.124612325648931e-05, "loss": 0.0908, "step": 4906 }, { "epoch": 3.44109396914446, "grad_norm": 0.2608543336391449, "learning_rate": 1.1236848001314387e-05, "loss": 0.0911, "step": 4907 }, { "epoch": 3.44179523141655, "grad_norm": 0.2571045458316803, "learning_rate": 1.122757546369744e-05, "loss": 0.0899, "step": 4908 }, { "epoch": 3.4424964936886395, "grad_norm": 0.2377767413854599, "learning_rate": 1.1218305645469329e-05, "loss": 0.0888, "step": 4909 }, { "epoch": 3.443197755960729, "grad_norm": 0.3411526381969452, "learning_rate": 1.1209038548460418e-05, "loss": 0.093, "step": 4910 }, { "epoch": 3.443899018232819, "grad_norm": 0.2605535686016083, "learning_rate": 1.11997741745005e-05, "loss": 0.0911, "step": 4911 }, { "epoch": 3.444600280504909, "grad_norm": 0.23889359831809998, "learning_rate": 1.1190512525418839e-05, "loss": 0.0888, "step": 4912 }, { "epoch": 3.4453015427769986, "grad_norm": 1.2671931982040405, "learning_rate": 1.1181253603044159e-05, "loss": 0.5926, "step": 4913 }, { "epoch": 3.4460028050490883, "grad_norm": 0.24014180898666382, "learning_rate": 1.1171997409204665e-05, "loss": 0.0884, "step": 4914 }, { "epoch": 3.446704067321178, "grad_norm": 0.23748591542243958, "learning_rate": 1.1162743945727991e-05, "loss": 0.0885, "step": 4915 }, { "epoch": 3.447405329593268, "grad_norm": 0.28966325521469116, "learning_rate": 1.1153493214441268e-05, "loss": 0.092, "step": 4916 }, { "epoch": 3.4481065918653577, "grad_norm": 0.23819807171821594, "learning_rate": 1.114424521717105e-05, "loss": 0.0889, "step": 4917 }, { "epoch": 3.4488078541374474, "grad_norm": 0.3271748423576355, "learning_rate": 1.1134999955743388e-05, "loss": 0.0919, "step": 4918 }, { "epoch": 3.449509116409537, "grad_norm": 0.25531789660453796, "learning_rate": 1.1125757431983766e-05, "loss": 0.0896, "step": 4919 }, { "epoch": 3.4502103786816267, "grad_norm": 0.86777263879776, "learning_rate": 1.1116517647717137e-05, "loss": 0.2384, "step": 4920 }, { "epoch": 3.450911640953717, "grad_norm": 0.2614191770553589, "learning_rate": 1.11072806047679e-05, "loss": 0.0913, "step": 4921 }, { "epoch": 3.4516129032258065, "grad_norm": 0.2826938331127167, "learning_rate": 1.1098046304959947e-05, "loss": 0.2551, "step": 4922 }, { "epoch": 3.452314165497896, "grad_norm": 0.29741576313972473, "learning_rate": 1.1088814750116589e-05, "loss": 0.259, "step": 4923 }, { "epoch": 3.453015427769986, "grad_norm": 0.23752140998840332, "learning_rate": 1.1079585942060625e-05, "loss": 0.088, "step": 4924 }, { "epoch": 3.453716690042076, "grad_norm": 0.29794666171073914, "learning_rate": 1.1070359882614292e-05, "loss": 0.2545, "step": 4925 }, { "epoch": 3.4544179523141656, "grad_norm": 0.25944915413856506, "learning_rate": 1.1061136573599292e-05, "loss": 0.0904, "step": 4926 }, { "epoch": 3.4551192145862553, "grad_norm": 0.2866508960723877, "learning_rate": 1.1051916016836778e-05, "loss": 0.092, "step": 4927 }, { "epoch": 3.455820476858345, "grad_norm": 0.2905360758304596, "learning_rate": 1.1042698214147376e-05, "loss": 0.2603, "step": 4928 }, { "epoch": 3.4565217391304346, "grad_norm": 0.2606920599937439, "learning_rate": 1.103348316735114e-05, "loss": 0.0903, "step": 4929 }, { "epoch": 3.4572230014025247, "grad_norm": 0.23879167437553406, "learning_rate": 1.1024270878267623e-05, "loss": 0.0887, "step": 4930 }, { "epoch": 3.4579242636746144, "grad_norm": 0.23790626227855682, "learning_rate": 1.1015061348715777e-05, "loss": 0.0885, "step": 4931 }, { "epoch": 3.458625525946704, "grad_norm": 0.7687922120094299, "learning_rate": 1.1005854580514078e-05, "loss": 0.4275, "step": 4932 }, { "epoch": 3.4593267882187937, "grad_norm": 1.0619149208068848, "learning_rate": 1.0996650575480374e-05, "loss": 0.3844, "step": 4933 }, { "epoch": 3.460028050490884, "grad_norm": 0.25309810042381287, "learning_rate": 1.098744933543204e-05, "loss": 0.0889, "step": 4934 }, { "epoch": 3.4607293127629735, "grad_norm": 0.23777084052562714, "learning_rate": 1.0978250862185863e-05, "loss": 0.0887, "step": 4935 }, { "epoch": 3.461430575035063, "grad_norm": 0.2789210081100464, "learning_rate": 1.0969055157558108e-05, "loss": 0.2572, "step": 4936 }, { "epoch": 3.462131837307153, "grad_norm": 0.3127196133136749, "learning_rate": 1.0959862223364473e-05, "loss": 0.2586, "step": 4937 }, { "epoch": 3.4628330995792425, "grad_norm": 0.2590794265270233, "learning_rate": 1.0950672061420131e-05, "loss": 0.0896, "step": 4938 }, { "epoch": 3.4635343618513326, "grad_norm": 0.2369363158941269, "learning_rate": 1.0941484673539686e-05, "loss": 0.0878, "step": 4939 }, { "epoch": 3.4642356241234222, "grad_norm": 0.29028424620628357, "learning_rate": 1.093230006153721e-05, "loss": 0.0892, "step": 4940 }, { "epoch": 3.464936886395512, "grad_norm": 0.23887118697166443, "learning_rate": 1.0923118227226203e-05, "loss": 0.0886, "step": 4941 }, { "epoch": 3.4656381486676016, "grad_norm": 0.28639283776283264, "learning_rate": 1.0913939172419657e-05, "loss": 0.257, "step": 4942 }, { "epoch": 3.4663394109396917, "grad_norm": 0.2814018130302429, "learning_rate": 1.090476289892997e-05, "loss": 0.2585, "step": 4943 }, { "epoch": 3.4670406732117813, "grad_norm": 0.2389691323041916, "learning_rate": 1.0895589408569037e-05, "loss": 0.0889, "step": 4944 }, { "epoch": 3.467741935483871, "grad_norm": 0.2852397561073303, "learning_rate": 1.0886418703148157e-05, "loss": 0.2574, "step": 4945 }, { "epoch": 3.4684431977559607, "grad_norm": 0.28162267804145813, "learning_rate": 1.0877250784478119e-05, "loss": 0.259, "step": 4946 }, { "epoch": 3.4691444600280503, "grad_norm": 0.25834712386131287, "learning_rate": 1.086808565436914e-05, "loss": 0.0899, "step": 4947 }, { "epoch": 3.4698457223001404, "grad_norm": 0.23819804191589355, "learning_rate": 1.0858923314630876e-05, "loss": 0.0884, "step": 4948 }, { "epoch": 3.47054698457223, "grad_norm": 0.26104065775871277, "learning_rate": 1.0849763767072466e-05, "loss": 0.0908, "step": 4949 }, { "epoch": 3.4712482468443198, "grad_norm": 0.25754356384277344, "learning_rate": 1.0840607013502474e-05, "loss": 0.089, "step": 4950 }, { "epoch": 3.4719495091164094, "grad_norm": 0.28591567277908325, "learning_rate": 1.0831453055728902e-05, "loss": 0.0916, "step": 4951 }, { "epoch": 3.4726507713884995, "grad_norm": 0.2874506413936615, "learning_rate": 1.0822301895559237e-05, "loss": 0.2583, "step": 4952 }, { "epoch": 3.473352033660589, "grad_norm": 0.27933886647224426, "learning_rate": 1.0813153534800377e-05, "loss": 0.2554, "step": 4953 }, { "epoch": 3.474053295932679, "grad_norm": 0.27927958965301514, "learning_rate": 1.0804007975258678e-05, "loss": 0.2542, "step": 4954 }, { "epoch": 3.4747545582047685, "grad_norm": 0.235578715801239, "learning_rate": 1.079486521873996e-05, "loss": 0.0877, "step": 4955 }, { "epoch": 3.475455820476858, "grad_norm": 0.28205204010009766, "learning_rate": 1.0785725267049462e-05, "loss": 0.257, "step": 4956 }, { "epoch": 3.476157082748948, "grad_norm": 0.23698939383029938, "learning_rate": 1.07765881219919e-05, "loss": 0.0885, "step": 4957 }, { "epoch": 3.476858345021038, "grad_norm": 0.27929043769836426, "learning_rate": 1.0767453785371398e-05, "loss": 0.2571, "step": 4958 }, { "epoch": 3.4775596072931276, "grad_norm": 0.2894834280014038, "learning_rate": 1.0758322258991569e-05, "loss": 0.0917, "step": 4959 }, { "epoch": 3.4782608695652173, "grad_norm": 0.29103922843933105, "learning_rate": 1.0749193544655434e-05, "loss": 0.2591, "step": 4960 }, { "epoch": 3.478962131837307, "grad_norm": 0.2833743095397949, "learning_rate": 1.074006764416548e-05, "loss": 0.2559, "step": 4961 }, { "epoch": 3.479663394109397, "grad_norm": 0.25682616233825684, "learning_rate": 1.0730944559323617e-05, "loss": 0.0885, "step": 4962 }, { "epoch": 3.4803646563814867, "grad_norm": 0.2852306365966797, "learning_rate": 1.0721824291931235e-05, "loss": 0.256, "step": 4963 }, { "epoch": 3.4810659186535764, "grad_norm": 0.9027326107025146, "learning_rate": 1.0712706843789124e-05, "loss": 0.2418, "step": 4964 }, { "epoch": 3.481767180925666, "grad_norm": 1.2679015398025513, "learning_rate": 1.0703592216697567e-05, "loss": 0.5946, "step": 4965 }, { "epoch": 3.4824684431977557, "grad_norm": 0.2814860939979553, "learning_rate": 1.0694480412456233e-05, "loss": 0.2576, "step": 4966 }, { "epoch": 3.483169705469846, "grad_norm": 0.2548355758190155, "learning_rate": 1.0685371432864302e-05, "loss": 0.0891, "step": 4967 }, { "epoch": 3.4838709677419355, "grad_norm": 0.7774091958999634, "learning_rate": 1.0676265279720312e-05, "loss": 0.4254, "step": 4968 }, { "epoch": 3.484572230014025, "grad_norm": 0.2823633551597595, "learning_rate": 1.0667161954822322e-05, "loss": 0.2565, "step": 4969 }, { "epoch": 3.485273492286115, "grad_norm": 0.2809590697288513, "learning_rate": 1.0658061459967775e-05, "loss": 0.2543, "step": 4970 }, { "epoch": 3.485974754558205, "grad_norm": 0.2375977337360382, "learning_rate": 1.0648963796953605e-05, "loss": 0.0884, "step": 4971 }, { "epoch": 3.4866760168302946, "grad_norm": 0.2796207368373871, "learning_rate": 1.0639868967576136e-05, "loss": 0.2566, "step": 4972 }, { "epoch": 3.4873772791023843, "grad_norm": 0.25829267501831055, "learning_rate": 1.0630776973631179e-05, "loss": 0.0908, "step": 4973 }, { "epoch": 3.488078541374474, "grad_norm": 0.25868287682533264, "learning_rate": 1.0621687816913956e-05, "loss": 0.09, "step": 4974 }, { "epoch": 3.4887798036465636, "grad_norm": 0.25616157054901123, "learning_rate": 1.0612601499219135e-05, "loss": 0.0898, "step": 4975 }, { "epoch": 3.4894810659186537, "grad_norm": 0.9418911337852478, "learning_rate": 1.0603518022340814e-05, "loss": 0.2379, "step": 4976 }, { "epoch": 3.4901823281907434, "grad_norm": 0.23825007677078247, "learning_rate": 1.0594437388072564e-05, "loss": 0.0883, "step": 4977 }, { "epoch": 3.490883590462833, "grad_norm": 0.265703409910202, "learning_rate": 1.058535959820735e-05, "loss": 0.0872, "step": 4978 }, { "epoch": 3.4915848527349227, "grad_norm": 0.27428221702575684, "learning_rate": 1.0576284654537616e-05, "loss": 0.2535, "step": 4979 }, { "epoch": 3.492286115007013, "grad_norm": 0.23950639367103577, "learning_rate": 1.0567212558855207e-05, "loss": 0.0893, "step": 4980 }, { "epoch": 3.4929873772791025, "grad_norm": 0.2387818992137909, "learning_rate": 1.0558143312951451e-05, "loss": 0.0888, "step": 4981 }, { "epoch": 3.493688639551192, "grad_norm": 0.27496644854545593, "learning_rate": 1.0549076918617051e-05, "loss": 0.2573, "step": 4982 }, { "epoch": 3.494389901823282, "grad_norm": 0.3131084442138672, "learning_rate": 1.0540013377642207e-05, "loss": 0.2623, "step": 4983 }, { "epoch": 3.4950911640953715, "grad_norm": 0.2570154666900635, "learning_rate": 1.0530952691816518e-05, "loss": 0.0905, "step": 4984 }, { "epoch": 3.4957924263674616, "grad_norm": 1.0704749822616577, "learning_rate": 1.0521894862929044e-05, "loss": 0.3803, "step": 4985 }, { "epoch": 3.4964936886395512, "grad_norm": 0.25984683632850647, "learning_rate": 1.0512839892768253e-05, "loss": 0.0914, "step": 4986 }, { "epoch": 3.497194950911641, "grad_norm": 0.2761804461479187, "learning_rate": 1.0503787783122082e-05, "loss": 0.2557, "step": 4987 }, { "epoch": 3.4978962131837306, "grad_norm": 0.295048326253891, "learning_rate": 1.0494738535777879e-05, "loss": 0.0935, "step": 4988 }, { "epoch": 3.4985974754558207, "grad_norm": 0.24041429162025452, "learning_rate": 1.0485692152522428e-05, "loss": 0.0893, "step": 4989 }, { "epoch": 3.4992987377279103, "grad_norm": 0.23992334306240082, "learning_rate": 1.047664863514195e-05, "loss": 0.0888, "step": 4990 }, { "epoch": 3.5, "grad_norm": 0.8519158363342285, "learning_rate": 1.0467607985422117e-05, "loss": 0.2281, "step": 4991 }, { "epoch": 3.5007012622720897, "grad_norm": 0.27617573738098145, "learning_rate": 1.0458570205148005e-05, "loss": 0.2563, "step": 4992 }, { "epoch": 3.5014025245441793, "grad_norm": 0.2679122984409332, "learning_rate": 1.0449535296104158e-05, "loss": 0.0928, "step": 4993 }, { "epoch": 3.5021037868162694, "grad_norm": 0.29963403940200806, "learning_rate": 1.044050326007451e-05, "loss": 0.0945, "step": 4994 }, { "epoch": 3.502805049088359, "grad_norm": 0.2903788089752197, "learning_rate": 1.0431474098842477e-05, "loss": 0.0919, "step": 4995 }, { "epoch": 3.5035063113604488, "grad_norm": 0.23814578354358673, "learning_rate": 1.0422447814190872e-05, "loss": 0.0886, "step": 4996 }, { "epoch": 3.5042075736325384, "grad_norm": 0.2625243663787842, "learning_rate": 1.041342440790194e-05, "loss": 0.0915, "step": 4997 }, { "epoch": 3.5049088359046285, "grad_norm": 0.23913297057151794, "learning_rate": 1.0404403881757385e-05, "loss": 0.089, "step": 4998 }, { "epoch": 3.505610098176718, "grad_norm": 0.27302443981170654, "learning_rate": 1.039538623753832e-05, "loss": 0.2527, "step": 4999 }, { "epoch": 3.506311360448808, "grad_norm": 0.26305902004241943, "learning_rate": 1.0386371477025283e-05, "loss": 0.0918, "step": 5000 }, { "epoch": 3.5070126227208975, "grad_norm": 0.26261913776397705, "learning_rate": 1.0377359601998272e-05, "loss": 0.0915, "step": 5001 }, { "epoch": 3.507713884992987, "grad_norm": 0.30256447196006775, "learning_rate": 1.0368350614236686e-05, "loss": 0.0929, "step": 5002 }, { "epoch": 3.5084151472650773, "grad_norm": 0.27597731351852417, "learning_rate": 1.0359344515519359e-05, "loss": 0.2569, "step": 5003 }, { "epoch": 3.509116409537167, "grad_norm": 0.29845625162124634, "learning_rate": 1.0350341307624577e-05, "loss": 0.0933, "step": 5004 }, { "epoch": 3.5098176718092566, "grad_norm": 0.2642538845539093, "learning_rate": 1.0341340992330026e-05, "loss": 0.092, "step": 5005 }, { "epoch": 3.5105189340813463, "grad_norm": 0.25835996866226196, "learning_rate": 1.0332343571412845e-05, "loss": 0.0869, "step": 5006 }, { "epoch": 3.5112201963534364, "grad_norm": 0.23937936127185822, "learning_rate": 1.0323349046649571e-05, "loss": 0.0883, "step": 5007 }, { "epoch": 3.511921458625526, "grad_norm": 0.27618587017059326, "learning_rate": 1.0314357419816215e-05, "loss": 0.2577, "step": 5008 }, { "epoch": 3.5126227208976157, "grad_norm": 0.26003217697143555, "learning_rate": 1.0305368692688174e-05, "loss": 0.0903, "step": 5009 }, { "epoch": 3.5133239831697054, "grad_norm": 0.27235835790634155, "learning_rate": 1.0296382867040288e-05, "loss": 0.255, "step": 5010 }, { "epoch": 3.514025245441795, "grad_norm": 0.26082199811935425, "learning_rate": 1.0287399944646816e-05, "loss": 0.0901, "step": 5011 }, { "epoch": 3.5147265077138847, "grad_norm": 0.2374396175146103, "learning_rate": 1.0278419927281469e-05, "loss": 0.0885, "step": 5012 }, { "epoch": 3.515427769985975, "grad_norm": 0.2942277491092682, "learning_rate": 1.0269442816717349e-05, "loss": 0.0931, "step": 5013 }, { "epoch": 3.5161290322580645, "grad_norm": 0.23727361857891083, "learning_rate": 1.0260468614727018e-05, "loss": 0.0881, "step": 5014 }, { "epoch": 3.516830294530154, "grad_norm": 0.2877791225910187, "learning_rate": 1.0251497323082432e-05, "loss": 0.2581, "step": 5015 }, { "epoch": 3.5175315568022443, "grad_norm": 0.2368779331445694, "learning_rate": 1.0242528943555014e-05, "loss": 0.088, "step": 5016 }, { "epoch": 3.518232819074334, "grad_norm": 0.2558947503566742, "learning_rate": 1.023356347791555e-05, "loss": 0.0878, "step": 5017 }, { "epoch": 3.5189340813464236, "grad_norm": 0.2783574163913727, "learning_rate": 1.0224600927934313e-05, "loss": 0.2529, "step": 5018 }, { "epoch": 3.5196353436185133, "grad_norm": 0.2533476948738098, "learning_rate": 1.0215641295380956e-05, "loss": 0.0839, "step": 5019 }, { "epoch": 3.520336605890603, "grad_norm": 0.2570507228374481, "learning_rate": 1.0206684582024595e-05, "loss": 0.0892, "step": 5020 }, { "epoch": 3.5210378681626926, "grad_norm": 0.28136008977890015, "learning_rate": 1.0197730789633725e-05, "loss": 0.2589, "step": 5021 }, { "epoch": 3.5217391304347827, "grad_norm": 0.2357199788093567, "learning_rate": 1.018877991997631e-05, "loss": 0.0876, "step": 5022 }, { "epoch": 3.5224403927068724, "grad_norm": 0.2839185297489166, "learning_rate": 1.0179831974819704e-05, "loss": 0.2571, "step": 5023 }, { "epoch": 3.523141654978962, "grad_norm": 0.7810450196266174, "learning_rate": 1.0170886955930692e-05, "loss": 0.4321, "step": 5024 }, { "epoch": 3.523842917251052, "grad_norm": 1.0524282455444336, "learning_rate": 1.016194486507548e-05, "loss": 0.2537, "step": 5025 }, { "epoch": 3.524544179523142, "grad_norm": 0.25108030438423157, "learning_rate": 1.0153005704019707e-05, "loss": 0.0876, "step": 5026 }, { "epoch": 3.5252454417952315, "grad_norm": 0.23599600791931152, "learning_rate": 1.014406947452842e-05, "loss": 0.0875, "step": 5027 }, { "epoch": 3.525946704067321, "grad_norm": 0.23638373613357544, "learning_rate": 1.0135136178366103e-05, "loss": 0.0877, "step": 5028 }, { "epoch": 3.526647966339411, "grad_norm": 0.2832505702972412, "learning_rate": 1.0126205817296636e-05, "loss": 0.2557, "step": 5029 }, { "epoch": 3.5273492286115005, "grad_norm": 0.2549661695957184, "learning_rate": 1.0117278393083358e-05, "loss": 0.0898, "step": 5030 }, { "epoch": 3.5280504908835906, "grad_norm": 0.7883176207542419, "learning_rate": 1.0108353907488971e-05, "loss": 0.4279, "step": 5031 }, { "epoch": 3.5287517531556802, "grad_norm": 0.2370545119047165, "learning_rate": 1.0099432362275654e-05, "loss": 0.088, "step": 5032 }, { "epoch": 3.52945301542777, "grad_norm": 0.3223516643047333, "learning_rate": 1.0090513759204967e-05, "loss": 0.0918, "step": 5033 }, { "epoch": 3.53015427769986, "grad_norm": 0.2363615185022354, "learning_rate": 1.008159810003792e-05, "loss": 0.0879, "step": 5034 }, { "epoch": 3.5308555399719497, "grad_norm": 0.25510790944099426, "learning_rate": 1.0072685386534908e-05, "loss": 0.0892, "step": 5035 }, { "epoch": 3.5315568022440393, "grad_norm": 0.248942032456398, "learning_rate": 1.006377562045578e-05, "loss": 0.0836, "step": 5036 }, { "epoch": 3.532258064516129, "grad_norm": 0.253583163022995, "learning_rate": 1.0054868803559769e-05, "loss": 0.088, "step": 5037 }, { "epoch": 3.5329593267882187, "grad_norm": 0.2828965187072754, "learning_rate": 1.0045964937605552e-05, "loss": 0.2561, "step": 5038 }, { "epoch": 3.5336605890603083, "grad_norm": 0.3237293064594269, "learning_rate": 1.0037064024351195e-05, "loss": 0.2614, "step": 5039 }, { "epoch": 3.5343618513323984, "grad_norm": 0.28774333000183105, "learning_rate": 1.0028166065554218e-05, "loss": 0.2561, "step": 5040 }, { "epoch": 3.535063113604488, "grad_norm": 0.2945364713668823, "learning_rate": 1.0019271062971524e-05, "loss": 0.2572, "step": 5041 }, { "epoch": 3.5357643758765778, "grad_norm": 0.2982458174228668, "learning_rate": 1.0010379018359464e-05, "loss": 0.257, "step": 5042 }, { "epoch": 3.536465638148668, "grad_norm": 0.7790392637252808, "learning_rate": 1.0001489933473765e-05, "loss": 0.4236, "step": 5043 }, { "epoch": 3.5371669004207575, "grad_norm": 0.2350563406944275, "learning_rate": 9.992603810069615e-06, "loss": 0.0874, "step": 5044 }, { "epoch": 3.537868162692847, "grad_norm": 0.2802540063858032, "learning_rate": 9.983720649901582e-06, "loss": 0.2538, "step": 5045 }, { "epoch": 3.538569424964937, "grad_norm": 0.23616839945316315, "learning_rate": 9.974840454723664e-06, "loss": 0.0877, "step": 5046 }, { "epoch": 3.5392706872370265, "grad_norm": 0.2520923912525177, "learning_rate": 9.96596322628926e-06, "loss": 0.0877, "step": 5047 }, { "epoch": 3.539971949509116, "grad_norm": 0.2854221761226654, "learning_rate": 9.957088966351211e-06, "loss": 0.255, "step": 5048 }, { "epoch": 3.5406732117812063, "grad_norm": 0.28757092356681824, "learning_rate": 9.948217676661739e-06, "loss": 0.2584, "step": 5049 }, { "epoch": 3.541374474053296, "grad_norm": 0.32412606477737427, "learning_rate": 9.939349358972511e-06, "loss": 0.2621, "step": 5050 }, { "epoch": 3.5420757363253856, "grad_norm": 0.23642167448997498, "learning_rate": 9.930484015034586e-06, "loss": 0.0869, "step": 5051 }, { "epoch": 3.5427769985974753, "grad_norm": 0.28484800457954407, "learning_rate": 9.92162164659843e-06, "loss": 0.2581, "step": 5052 }, { "epoch": 3.5434782608695654, "grad_norm": 0.2364966869354248, "learning_rate": 9.91276225541395e-06, "loss": 0.0875, "step": 5053 }, { "epoch": 3.544179523141655, "grad_norm": 0.25261661410331726, "learning_rate": 9.903905843230432e-06, "loss": 0.0878, "step": 5054 }, { "epoch": 3.5448807854137447, "grad_norm": 0.9236574769020081, "learning_rate": 9.895052411796607e-06, "loss": 0.2439, "step": 5055 }, { "epoch": 3.5455820476858344, "grad_norm": 0.7689926028251648, "learning_rate": 9.88620196286058e-06, "loss": 0.4233, "step": 5056 }, { "epoch": 3.546283309957924, "grad_norm": 0.23632968962192535, "learning_rate": 9.87735449816992e-06, "loss": 0.0874, "step": 5057 }, { "epoch": 3.546984572230014, "grad_norm": 0.2837185859680176, "learning_rate": 9.868510019471534e-06, "loss": 0.0901, "step": 5058 }, { "epoch": 3.547685834502104, "grad_norm": 0.2375067174434662, "learning_rate": 9.859668528511807e-06, "loss": 0.0882, "step": 5059 }, { "epoch": 3.5483870967741935, "grad_norm": 0.31827840209007263, "learning_rate": 9.850830027036486e-06, "loss": 0.2627, "step": 5060 }, { "epoch": 3.549088359046283, "grad_norm": 0.3096301257610321, "learning_rate": 9.84199451679077e-06, "loss": 0.2606, "step": 5061 }, { "epoch": 3.5497896213183733, "grad_norm": 0.2845951020717621, "learning_rate": 9.833161999519228e-06, "loss": 0.0901, "step": 5062 }, { "epoch": 3.550490883590463, "grad_norm": 0.2593320310115814, "learning_rate": 9.82433247696587e-06, "loss": 0.0903, "step": 5063 }, { "epoch": 3.5511921458625526, "grad_norm": 0.27949991822242737, "learning_rate": 9.815505950874096e-06, "loss": 0.2546, "step": 5064 }, { "epoch": 3.5518934081346423, "grad_norm": 0.2362336963415146, "learning_rate": 9.806682422986715e-06, "loss": 0.0874, "step": 5065 }, { "epoch": 3.552594670406732, "grad_norm": 0.23613986372947693, "learning_rate": 9.79786189504594e-06, "loss": 0.0879, "step": 5066 }, { "epoch": 3.553295932678822, "grad_norm": 0.7787168025970459, "learning_rate": 9.789044368793418e-06, "loss": 0.4261, "step": 5067 }, { "epoch": 3.5539971949509117, "grad_norm": 0.28608107566833496, "learning_rate": 9.780229845970166e-06, "loss": 0.0902, "step": 5068 }, { "epoch": 3.5546984572230014, "grad_norm": 0.28734663128852844, "learning_rate": 9.771418328316645e-06, "loss": 0.258, "step": 5069 }, { "epoch": 3.555399719495091, "grad_norm": 0.2781190276145935, "learning_rate": 9.762609817572686e-06, "loss": 0.2554, "step": 5070 }, { "epoch": 3.556100981767181, "grad_norm": 0.2552354037761688, "learning_rate": 9.75380431547757e-06, "loss": 0.0884, "step": 5071 }, { "epoch": 3.556802244039271, "grad_norm": 0.9848619699478149, "learning_rate": 9.745001823769926e-06, "loss": 0.2453, "step": 5072 }, { "epoch": 3.5575035063113605, "grad_norm": 0.252583384513855, "learning_rate": 9.736202344187844e-06, "loss": 0.0872, "step": 5073 }, { "epoch": 3.55820476858345, "grad_norm": 0.2837553322315216, "learning_rate": 9.72740587846878e-06, "loss": 0.2567, "step": 5074 }, { "epoch": 3.55890603085554, "grad_norm": 0.23595349490642548, "learning_rate": 9.718612428349633e-06, "loss": 0.0878, "step": 5075 }, { "epoch": 3.5596072931276295, "grad_norm": 0.23777012526988983, "learning_rate": 9.709821995566662e-06, "loss": 0.0875, "step": 5076 }, { "epoch": 3.5603085553997196, "grad_norm": 0.7853771448135376, "learning_rate": 9.701034581855572e-06, "loss": 0.4289, "step": 5077 }, { "epoch": 3.5610098176718092, "grad_norm": 0.31678271293640137, "learning_rate": 9.692250188951446e-06, "loss": 0.0914, "step": 5078 }, { "epoch": 3.561711079943899, "grad_norm": 0.25422176718711853, "learning_rate": 9.683468818588775e-06, "loss": 0.0884, "step": 5079 }, { "epoch": 3.562412342215989, "grad_norm": 0.2807510495185852, "learning_rate": 9.67469047250145e-06, "loss": 0.2565, "step": 5080 }, { "epoch": 3.5631136044880787, "grad_norm": 0.2815540134906769, "learning_rate": 9.665915152422784e-06, "loss": 0.2563, "step": 5081 }, { "epoch": 3.5638148667601683, "grad_norm": 0.28637394309043884, "learning_rate": 9.65714286008546e-06, "loss": 0.2558, "step": 5082 }, { "epoch": 3.564516129032258, "grad_norm": 0.28061461448669434, "learning_rate": 9.648373597221608e-06, "loss": 0.255, "step": 5083 }, { "epoch": 3.5652173913043477, "grad_norm": 0.29189634323120117, "learning_rate": 9.639607365562709e-06, "loss": 0.0818, "step": 5084 }, { "epoch": 3.5659186535764373, "grad_norm": 0.23893852531909943, "learning_rate": 9.630844166839698e-06, "loss": 0.0888, "step": 5085 }, { "epoch": 3.5666199158485274, "grad_norm": 0.2386232316493988, "learning_rate": 9.622084002782852e-06, "loss": 0.0885, "step": 5086 }, { "epoch": 3.567321178120617, "grad_norm": 0.29298651218414307, "learning_rate": 9.613326875121903e-06, "loss": 0.0925, "step": 5087 }, { "epoch": 3.5680224403927068, "grad_norm": 0.2386597990989685, "learning_rate": 9.604572785585946e-06, "loss": 0.0885, "step": 5088 }, { "epoch": 3.568723702664797, "grad_norm": 0.23810648918151855, "learning_rate": 9.595821735903507e-06, "loss": 0.0884, "step": 5089 }, { "epoch": 3.5694249649368865, "grad_norm": 0.2845517694950104, "learning_rate": 9.587073727802478e-06, "loss": 0.0897, "step": 5090 }, { "epoch": 3.570126227208976, "grad_norm": 0.7587696313858032, "learning_rate": 9.578328763010185e-06, "loss": 0.4205, "step": 5091 }, { "epoch": 3.570827489481066, "grad_norm": 0.2864046096801758, "learning_rate": 9.569586843253334e-06, "loss": 0.2581, "step": 5092 }, { "epoch": 3.5715287517531555, "grad_norm": 0.2578640282154083, "learning_rate": 9.560847970258013e-06, "loss": 0.0893, "step": 5093 }, { "epoch": 3.572230014025245, "grad_norm": 0.25432416796684265, "learning_rate": 9.552112145749753e-06, "loss": 0.0885, "step": 5094 }, { "epoch": 3.5729312762973353, "grad_norm": 0.28882160782814026, "learning_rate": 9.543379371453444e-06, "loss": 0.2567, "step": 5095 }, { "epoch": 3.573632538569425, "grad_norm": 0.256481796503067, "learning_rate": 9.534649649093383e-06, "loss": 0.0883, "step": 5096 }, { "epoch": 3.5743338008415146, "grad_norm": 0.29723450541496277, "learning_rate": 9.525922980393282e-06, "loss": 0.2556, "step": 5097 }, { "epoch": 3.5750350631136047, "grad_norm": 0.2986038625240326, "learning_rate": 9.51719936707622e-06, "loss": 0.2594, "step": 5098 }, { "epoch": 3.5757363253856944, "grad_norm": 0.7763403654098511, "learning_rate": 9.508478810864707e-06, "loss": 0.4228, "step": 5099 }, { "epoch": 3.576437587657784, "grad_norm": 0.237863227725029, "learning_rate": 9.499761313480626e-06, "loss": 0.0878, "step": 5100 }, { "epoch": 3.5771388499298737, "grad_norm": 0.7720427513122559, "learning_rate": 9.491046876645252e-06, "loss": 0.423, "step": 5101 }, { "epoch": 3.5778401122019634, "grad_norm": 0.770377516746521, "learning_rate": 9.482335502079279e-06, "loss": 0.423, "step": 5102 }, { "epoch": 3.578541374474053, "grad_norm": 0.2549365162849426, "learning_rate": 9.473627191502768e-06, "loss": 0.0885, "step": 5103 }, { "epoch": 3.579242636746143, "grad_norm": 0.2537190020084381, "learning_rate": 9.464921946635208e-06, "loss": 0.0873, "step": 5104 }, { "epoch": 3.579943899018233, "grad_norm": 0.28395015001296997, "learning_rate": 9.456219769195449e-06, "loss": 0.2549, "step": 5105 }, { "epoch": 3.5806451612903225, "grad_norm": 0.2812976539134979, "learning_rate": 9.44752066090177e-06, "loss": 0.084, "step": 5106 }, { "epoch": 3.5813464235624126, "grad_norm": 0.2591007947921753, "learning_rate": 9.438824623471798e-06, "loss": 0.0897, "step": 5107 }, { "epoch": 3.5820476858345023, "grad_norm": 0.2537132799625397, "learning_rate": 9.4301316586226e-06, "loss": 0.0877, "step": 5108 }, { "epoch": 3.582748948106592, "grad_norm": 0.23900005221366882, "learning_rate": 9.421441768070608e-06, "loss": 0.0882, "step": 5109 }, { "epoch": 3.5834502103786816, "grad_norm": 0.23813822865486145, "learning_rate": 9.412754953531663e-06, "loss": 0.0881, "step": 5110 }, { "epoch": 3.5841514726507713, "grad_norm": 0.23753994703292847, "learning_rate": 9.40407121672098e-06, "loss": 0.0882, "step": 5111 }, { "epoch": 3.584852734922861, "grad_norm": 0.2376166433095932, "learning_rate": 9.395390559353195e-06, "loss": 0.0873, "step": 5112 }, { "epoch": 3.585553997194951, "grad_norm": 0.23791110515594482, "learning_rate": 9.38671298314231e-06, "loss": 0.0877, "step": 5113 }, { "epoch": 3.5862552594670407, "grad_norm": 0.29314687848091125, "learning_rate": 9.378038489801724e-06, "loss": 0.2566, "step": 5114 }, { "epoch": 3.5869565217391304, "grad_norm": 0.28101846575737, "learning_rate": 9.369367081044226e-06, "loss": 0.0878, "step": 5115 }, { "epoch": 3.58765778401122, "grad_norm": 0.2876439392566681, "learning_rate": 9.360698758582016e-06, "loss": 0.2559, "step": 5116 }, { "epoch": 3.58835904628331, "grad_norm": 0.2879188358783722, "learning_rate": 9.352033524126649e-06, "loss": 0.2558, "step": 5117 }, { "epoch": 3.5890603085554, "grad_norm": 0.2851012051105499, "learning_rate": 9.343371379389113e-06, "loss": 0.2547, "step": 5118 }, { "epoch": 3.5897615708274895, "grad_norm": 0.25059911608695984, "learning_rate": 9.334712326079741e-06, "loss": 0.0831, "step": 5119 }, { "epoch": 3.590462833099579, "grad_norm": 0.3379638195037842, "learning_rate": 9.32605636590831e-06, "loss": 0.2611, "step": 5120 }, { "epoch": 3.591164095371669, "grad_norm": 0.7814541459083557, "learning_rate": 9.317403500583914e-06, "loss": 0.4257, "step": 5121 }, { "epoch": 3.591865357643759, "grad_norm": 0.2853226363658905, "learning_rate": 9.308753731815104e-06, "loss": 0.0874, "step": 5122 }, { "epoch": 3.5925666199158486, "grad_norm": 0.2853643000125885, "learning_rate": 9.300107061309776e-06, "loss": 0.2566, "step": 5123 }, { "epoch": 3.5932678821879382, "grad_norm": 0.23650935292243958, "learning_rate": 9.291463490775246e-06, "loss": 0.0873, "step": 5124 }, { "epoch": 3.593969144460028, "grad_norm": 0.2529064416885376, "learning_rate": 9.282823021918184e-06, "loss": 0.0864, "step": 5125 }, { "epoch": 3.594670406732118, "grad_norm": 0.2917211353778839, "learning_rate": 9.274185656444682e-06, "loss": 0.2558, "step": 5126 }, { "epoch": 3.5953716690042077, "grad_norm": 0.2801157832145691, "learning_rate": 9.265551396060193e-06, "loss": 0.2534, "step": 5127 }, { "epoch": 3.5960729312762973, "grad_norm": 0.23714832961559296, "learning_rate": 9.256920242469569e-06, "loss": 0.0877, "step": 5128 }, { "epoch": 3.596774193548387, "grad_norm": 0.2805997431278229, "learning_rate": 9.24829219737704e-06, "loss": 0.0877, "step": 5129 }, { "epoch": 3.5974754558204767, "grad_norm": 0.2885397970676422, "learning_rate": 9.23966726248624e-06, "loss": 0.2552, "step": 5130 }, { "epoch": 3.598176718092567, "grad_norm": 0.2860471308231354, "learning_rate": 9.231045439500163e-06, "loss": 0.2521, "step": 5131 }, { "epoch": 3.5988779803646564, "grad_norm": 0.23756541311740875, "learning_rate": 9.222426730121219e-06, "loss": 0.0877, "step": 5132 }, { "epoch": 3.599579242636746, "grad_norm": 0.7786398530006409, "learning_rate": 9.213811136051174e-06, "loss": 0.4196, "step": 5133 }, { "epoch": 3.6002805049088358, "grad_norm": 0.3024371862411499, "learning_rate": 9.205198658991207e-06, "loss": 0.2595, "step": 5134 }, { "epoch": 3.600981767180926, "grad_norm": 0.7792862057685852, "learning_rate": 9.196589300641842e-06, "loss": 0.4237, "step": 5135 }, { "epoch": 3.6016830294530155, "grad_norm": 0.27802738547325134, "learning_rate": 9.187983062703035e-06, "loss": 0.084, "step": 5136 }, { "epoch": 3.602384291725105, "grad_norm": 1.657288670539856, "learning_rate": 9.179379946874084e-06, "loss": 0.2895, "step": 5137 }, { "epoch": 3.603085553997195, "grad_norm": 0.23873461782932281, "learning_rate": 9.170779954853705e-06, "loss": 0.0874, "step": 5138 }, { "epoch": 3.6037868162692845, "grad_norm": 0.28496119379997253, "learning_rate": 9.162183088339968e-06, "loss": 0.254, "step": 5139 }, { "epoch": 3.604488078541374, "grad_norm": 0.2443075031042099, "learning_rate": 9.153589349030351e-06, "loss": 0.0826, "step": 5140 }, { "epoch": 3.6051893408134643, "grad_norm": 0.7806587219238281, "learning_rate": 9.1449987386217e-06, "loss": 0.4237, "step": 5141 }, { "epoch": 3.605890603085554, "grad_norm": 0.7730593085289001, "learning_rate": 9.136411258810229e-06, "loss": 0.4199, "step": 5142 }, { "epoch": 3.6065918653576436, "grad_norm": 0.28496676683425903, "learning_rate": 9.127826911291576e-06, "loss": 0.087, "step": 5143 }, { "epoch": 3.6072931276297338, "grad_norm": 0.24047382175922394, "learning_rate": 9.119245697760723e-06, "loss": 0.088, "step": 5144 }, { "epoch": 3.6079943899018234, "grad_norm": 1.4145985841751099, "learning_rate": 9.110667619912035e-06, "loss": 0.4262, "step": 5145 }, { "epoch": 3.608695652173913, "grad_norm": 0.7839202880859375, "learning_rate": 9.10209267943929e-06, "loss": 0.4233, "step": 5146 }, { "epoch": 3.6093969144460027, "grad_norm": 0.28001877665519714, "learning_rate": 9.093520878035602e-06, "loss": 0.2549, "step": 5147 }, { "epoch": 3.6100981767180924, "grad_norm": 0.2844821512699127, "learning_rate": 9.08495221739351e-06, "loss": 0.0862, "step": 5148 }, { "epoch": 3.610799438990182, "grad_norm": 0.24017839133739471, "learning_rate": 9.0763866992049e-06, "loss": 0.0886, "step": 5149 }, { "epoch": 3.611500701262272, "grad_norm": 0.2820020318031311, "learning_rate": 9.06782432516104e-06, "loss": 0.0874, "step": 5150 }, { "epoch": 3.612201963534362, "grad_norm": 0.2403094619512558, "learning_rate": 9.059265096952598e-06, "loss": 0.0891, "step": 5151 }, { "epoch": 3.6129032258064515, "grad_norm": 0.26042550802230835, "learning_rate": 9.050709016269601e-06, "loss": 0.0893, "step": 5152 }, { "epoch": 3.6136044880785416, "grad_norm": 0.2919791638851166, "learning_rate": 9.04215608480147e-06, "loss": 0.0898, "step": 5153 }, { "epoch": 3.6143057503506313, "grad_norm": 0.2751193642616272, "learning_rate": 9.033606304236993e-06, "loss": 0.2506, "step": 5154 }, { "epoch": 3.615007012622721, "grad_norm": 0.24069233238697052, "learning_rate": 9.025059676264336e-06, "loss": 0.0888, "step": 5155 }, { "epoch": 3.6157082748948106, "grad_norm": 0.23938918113708496, "learning_rate": 9.016516202571038e-06, "loss": 0.0883, "step": 5156 }, { "epoch": 3.6164095371669003, "grad_norm": 0.2416509985923767, "learning_rate": 9.007975884844038e-06, "loss": 0.0887, "step": 5157 }, { "epoch": 3.61711079943899, "grad_norm": 0.3420284390449524, "learning_rate": 8.999438724769621e-06, "loss": 0.2407, "step": 5158 }, { "epoch": 3.61781206171108, "grad_norm": 0.27333489060401917, "learning_rate": 8.99090472403348e-06, "loss": 0.2523, "step": 5159 }, { "epoch": 3.6185133239831697, "grad_norm": 0.2857491970062256, "learning_rate": 8.982373884320649e-06, "loss": 0.0887, "step": 5160 }, { "epoch": 3.6192145862552594, "grad_norm": 0.24099335074424744, "learning_rate": 8.97384620731558e-06, "loss": 0.088, "step": 5161 }, { "epoch": 3.6199158485273495, "grad_norm": 0.25775986909866333, "learning_rate": 8.965321694702064e-06, "loss": 0.0893, "step": 5162 }, { "epoch": 3.620617110799439, "grad_norm": 0.2844516336917877, "learning_rate": 8.956800348163283e-06, "loss": 0.0861, "step": 5163 }, { "epoch": 3.621318373071529, "grad_norm": 0.27928221225738525, "learning_rate": 8.948282169381781e-06, "loss": 0.2524, "step": 5164 }, { "epoch": 3.6220196353436185, "grad_norm": 0.25348684191703796, "learning_rate": 8.939767160039503e-06, "loss": 0.0869, "step": 5165 }, { "epoch": 3.622720897615708, "grad_norm": 0.2398097813129425, "learning_rate": 8.931255321817741e-06, "loss": 0.0884, "step": 5166 }, { "epoch": 3.623422159887798, "grad_norm": 0.28475382924079895, "learning_rate": 8.922746656397182e-06, "loss": 0.254, "step": 5167 }, { "epoch": 3.624123422159888, "grad_norm": 0.2547323703765869, "learning_rate": 8.914241165457865e-06, "loss": 0.0877, "step": 5168 }, { "epoch": 3.6248246844319776, "grad_norm": 0.23983973264694214, "learning_rate": 8.905738850679235e-06, "loss": 0.0887, "step": 5169 }, { "epoch": 3.6255259467040672, "grad_norm": 0.2389947921037674, "learning_rate": 8.897239713740058e-06, "loss": 0.0887, "step": 5170 }, { "epoch": 3.6262272089761574, "grad_norm": 0.2414553314447403, "learning_rate": 8.888743756318527e-06, "loss": 0.0887, "step": 5171 }, { "epoch": 3.626928471248247, "grad_norm": 0.24006199836730957, "learning_rate": 8.880250980092167e-06, "loss": 0.0882, "step": 5172 }, { "epoch": 3.6276297335203367, "grad_norm": 0.27975085377693176, "learning_rate": 8.871761386737907e-06, "loss": 0.2527, "step": 5173 }, { "epoch": 3.6283309957924264, "grad_norm": 0.2573721408843994, "learning_rate": 8.863274977932015e-06, "loss": 0.0874, "step": 5174 }, { "epoch": 3.629032258064516, "grad_norm": 0.25704535841941833, "learning_rate": 8.854791755350175e-06, "loss": 0.0881, "step": 5175 }, { "epoch": 3.6297335203366057, "grad_norm": 0.25676214694976807, "learning_rate": 8.846311720667377e-06, "loss": 0.0882, "step": 5176 }, { "epoch": 3.630434782608696, "grad_norm": 0.24003104865550995, "learning_rate": 8.837834875558046e-06, "loss": 0.0881, "step": 5177 }, { "epoch": 3.6311360448807855, "grad_norm": 0.28105682134628296, "learning_rate": 8.829361221695931e-06, "loss": 0.0823, "step": 5178 }, { "epoch": 3.631837307152875, "grad_norm": 0.30342918634414673, "learning_rate": 8.820890760754188e-06, "loss": 0.2609, "step": 5179 }, { "epoch": 3.632538569424965, "grad_norm": 0.2523110806941986, "learning_rate": 8.812423494405307e-06, "loss": 0.0856, "step": 5180 }, { "epoch": 3.633239831697055, "grad_norm": 0.23991350829601288, "learning_rate": 8.803959424321176e-06, "loss": 0.0874, "step": 5181 }, { "epoch": 3.6339410939691446, "grad_norm": 0.2935486435890198, "learning_rate": 8.79549855217304e-06, "loss": 0.258, "step": 5182 }, { "epoch": 3.634642356241234, "grad_norm": 0.7879507541656494, "learning_rate": 8.787040879631505e-06, "loss": 0.4216, "step": 5183 }, { "epoch": 3.635343618513324, "grad_norm": 0.2390756905078888, "learning_rate": 8.778586408366552e-06, "loss": 0.0868, "step": 5184 }, { "epoch": 3.6360448807854135, "grad_norm": 0.29234710335731506, "learning_rate": 8.770135140047542e-06, "loss": 0.2506, "step": 5185 }, { "epoch": 3.6367461430575037, "grad_norm": 0.2879049479961395, "learning_rate": 8.761687076343178e-06, "loss": 0.2536, "step": 5186 }, { "epoch": 3.6374474053295933, "grad_norm": 0.23844431340694427, "learning_rate": 8.753242218921562e-06, "loss": 0.0873, "step": 5187 }, { "epoch": 3.638148667601683, "grad_norm": 0.2384318858385086, "learning_rate": 8.744800569450126e-06, "loss": 0.0875, "step": 5188 }, { "epoch": 3.6388499298737726, "grad_norm": 0.3052339255809784, "learning_rate": 8.736362129595706e-06, "loss": 0.2506, "step": 5189 }, { "epoch": 3.6395511921458628, "grad_norm": 0.2913951575756073, "learning_rate": 8.72792690102448e-06, "loss": 0.2547, "step": 5190 }, { "epoch": 3.6402524544179524, "grad_norm": 0.28781774640083313, "learning_rate": 8.719494885401988e-06, "loss": 0.2568, "step": 5191 }, { "epoch": 3.640953716690042, "grad_norm": 0.31155383586883545, "learning_rate": 8.711066084393166e-06, "loss": 0.2517, "step": 5192 }, { "epoch": 3.6416549789621318, "grad_norm": 1.860893726348877, "learning_rate": 8.702640499662282e-06, "loss": 0.4589, "step": 5193 }, { "epoch": 3.6423562412342214, "grad_norm": 0.2534801959991455, "learning_rate": 8.694218132872977e-06, "loss": 0.0851, "step": 5194 }, { "epoch": 3.6430575035063115, "grad_norm": 0.23925915360450745, "learning_rate": 8.685798985688275e-06, "loss": 0.0873, "step": 5195 }, { "epoch": 3.643758765778401, "grad_norm": 0.2793492078781128, "learning_rate": 8.677383059770549e-06, "loss": 0.0839, "step": 5196 }, { "epoch": 3.644460028050491, "grad_norm": 0.371059775352478, "learning_rate": 8.668970356781525e-06, "loss": 0.2651, "step": 5197 }, { "epoch": 3.6451612903225805, "grad_norm": 0.23854553699493408, "learning_rate": 8.660560878382323e-06, "loss": 0.0878, "step": 5198 }, { "epoch": 3.6458625525946706, "grad_norm": 0.28070002794265747, "learning_rate": 8.652154626233392e-06, "loss": 0.2536, "step": 5199 }, { "epoch": 3.6465638148667603, "grad_norm": 0.27887746691703796, "learning_rate": 8.643751601994577e-06, "loss": 0.0854, "step": 5200 }, { "epoch": 3.64726507713885, "grad_norm": 0.2607824206352234, "learning_rate": 8.635351807325057e-06, "loss": 0.079, "step": 5201 }, { "epoch": 3.6479663394109396, "grad_norm": 0.2381310909986496, "learning_rate": 8.626955243883397e-06, "loss": 0.0877, "step": 5202 }, { "epoch": 3.6486676016830293, "grad_norm": 0.2517335116863251, "learning_rate": 8.618561913327508e-06, "loss": 0.0858, "step": 5203 }, { "epoch": 3.649368863955119, "grad_norm": 0.23894003033638, "learning_rate": 8.610171817314666e-06, "loss": 0.0877, "step": 5204 }, { "epoch": 3.650070126227209, "grad_norm": 0.3051700294017792, "learning_rate": 8.601784957501503e-06, "loss": 0.2522, "step": 5205 }, { "epoch": 3.6507713884992987, "grad_norm": 0.25248390436172485, "learning_rate": 8.593401335544032e-06, "loss": 0.0857, "step": 5206 }, { "epoch": 3.6514726507713884, "grad_norm": 1.5792847871780396, "learning_rate": 8.5850209530976e-06, "loss": 0.4497, "step": 5207 }, { "epoch": 3.6521739130434785, "grad_norm": 0.25315406918525696, "learning_rate": 8.576643811816945e-06, "loss": 0.0858, "step": 5208 }, { "epoch": 3.652875175315568, "grad_norm": 0.2751837968826294, "learning_rate": 8.568269913356131e-06, "loss": 0.0835, "step": 5209 }, { "epoch": 3.653576437587658, "grad_norm": 0.2367115318775177, "learning_rate": 8.559899259368624e-06, "loss": 0.0867, "step": 5210 }, { "epoch": 3.6542776998597475, "grad_norm": 0.2862080931663513, "learning_rate": 8.551531851507186e-06, "loss": 0.2546, "step": 5211 }, { "epoch": 3.654978962131837, "grad_norm": 0.23756971955299377, "learning_rate": 8.543167691424006e-06, "loss": 0.0872, "step": 5212 }, { "epoch": 3.655680224403927, "grad_norm": 0.23856957256793976, "learning_rate": 8.534806780770584e-06, "loss": 0.0876, "step": 5213 }, { "epoch": 3.656381486676017, "grad_norm": 0.25560203194618225, "learning_rate": 8.526449121197814e-06, "loss": 0.086, "step": 5214 }, { "epoch": 3.6570827489481066, "grad_norm": 0.2514761984348297, "learning_rate": 8.51809471435591e-06, "loss": 0.0845, "step": 5215 }, { "epoch": 3.6577840112201963, "grad_norm": 0.2941955327987671, "learning_rate": 8.509743561894481e-06, "loss": 0.2522, "step": 5216 }, { "epoch": 3.6584852734922864, "grad_norm": 0.25089412927627563, "learning_rate": 8.501395665462473e-06, "loss": 0.086, "step": 5217 }, { "epoch": 3.659186535764376, "grad_norm": 0.3043745756149292, "learning_rate": 8.493051026708188e-06, "loss": 0.2545, "step": 5218 }, { "epoch": 3.6598877980364657, "grad_norm": 0.29049110412597656, "learning_rate": 8.484709647279281e-06, "loss": 0.257, "step": 5219 }, { "epoch": 3.6605890603085554, "grad_norm": 0.9324542880058289, "learning_rate": 8.47637152882279e-06, "loss": 0.4449, "step": 5220 }, { "epoch": 3.661290322580645, "grad_norm": 0.801460325717926, "learning_rate": 8.468036672985074e-06, "loss": 0.4274, "step": 5221 }, { "epoch": 3.6619915848527347, "grad_norm": 0.2393035739660263, "learning_rate": 8.45970508141188e-06, "loss": 0.0868, "step": 5222 }, { "epoch": 3.662692847124825, "grad_norm": 0.2467825561761856, "learning_rate": 8.45137675574828e-06, "loss": 0.0822, "step": 5223 }, { "epoch": 3.6633941093969145, "grad_norm": 0.24890035390853882, "learning_rate": 8.443051697638741e-06, "loss": 0.0847, "step": 5224 }, { "epoch": 3.664095371669004, "grad_norm": 0.23891833424568176, "learning_rate": 8.434729908727024e-06, "loss": 0.0859, "step": 5225 }, { "epoch": 3.6647966339410942, "grad_norm": 0.23460496962070465, "learning_rate": 8.426411390656308e-06, "loss": 0.0792, "step": 5226 }, { "epoch": 3.665497896213184, "grad_norm": 0.2952468693256378, "learning_rate": 8.418096145069082e-06, "loss": 0.2565, "step": 5227 }, { "epoch": 3.6661991584852736, "grad_norm": 0.23892787098884583, "learning_rate": 8.40978417360722e-06, "loss": 0.0871, "step": 5228 }, { "epoch": 3.666900420757363, "grad_norm": 0.7886011600494385, "learning_rate": 8.401475477911922e-06, "loss": 0.4226, "step": 5229 }, { "epoch": 3.667601683029453, "grad_norm": 0.2884344160556793, "learning_rate": 8.393170059623768e-06, "loss": 0.2562, "step": 5230 }, { "epoch": 3.6683029453015426, "grad_norm": 0.23857101798057556, "learning_rate": 8.38486792038267e-06, "loss": 0.0873, "step": 5231 }, { "epoch": 3.6690042075736327, "grad_norm": 0.2508496046066284, "learning_rate": 8.376569061827898e-06, "loss": 0.0847, "step": 5232 }, { "epoch": 3.6697054698457223, "grad_norm": 0.2380044311285019, "learning_rate": 8.368273485598072e-06, "loss": 0.0867, "step": 5233 }, { "epoch": 3.670406732117812, "grad_norm": 0.23829680681228638, "learning_rate": 8.359981193331178e-06, "loss": 0.0876, "step": 5234 }, { "epoch": 3.671107994389902, "grad_norm": 0.300540953874588, "learning_rate": 8.351692186664531e-06, "loss": 0.2593, "step": 5235 }, { "epoch": 3.6718092566619918, "grad_norm": 0.7848923802375793, "learning_rate": 8.343406467234824e-06, "loss": 0.4258, "step": 5236 }, { "epoch": 3.6725105189340814, "grad_norm": 0.25656411051750183, "learning_rate": 8.335124036678075e-06, "loss": 0.0854, "step": 5237 }, { "epoch": 3.673211781206171, "grad_norm": 0.25261756777763367, "learning_rate": 8.32684489662967e-06, "loss": 0.0843, "step": 5238 }, { "epoch": 3.6739130434782608, "grad_norm": 0.28457969427108765, "learning_rate": 8.318569048724343e-06, "loss": 0.0856, "step": 5239 }, { "epoch": 3.6746143057503504, "grad_norm": 0.2382824867963791, "learning_rate": 8.31029649459616e-06, "loss": 0.0868, "step": 5240 }, { "epoch": 3.6753155680224405, "grad_norm": 0.2384495735168457, "learning_rate": 8.302027235878567e-06, "loss": 0.0874, "step": 5241 }, { "epoch": 3.67601683029453, "grad_norm": 0.25452473759651184, "learning_rate": 8.293761274204337e-06, "loss": 0.0839, "step": 5242 }, { "epoch": 3.67671809256662, "grad_norm": 0.26208195090293884, "learning_rate": 8.285498611205592e-06, "loss": 0.0794, "step": 5243 }, { "epoch": 3.6774193548387095, "grad_norm": 0.24973632395267487, "learning_rate": 8.277239248513821e-06, "loss": 0.0849, "step": 5244 }, { "epoch": 3.6781206171107996, "grad_norm": 0.23826159536838531, "learning_rate": 8.26898318775984e-06, "loss": 0.0867, "step": 5245 }, { "epoch": 3.6788218793828893, "grad_norm": 0.27374109625816345, "learning_rate": 8.260730430573824e-06, "loss": 0.0811, "step": 5246 }, { "epoch": 3.679523141654979, "grad_norm": 0.3136556148529053, "learning_rate": 8.252480978585297e-06, "loss": 0.2462, "step": 5247 }, { "epoch": 3.6802244039270686, "grad_norm": 0.23994074761867523, "learning_rate": 8.244234833423123e-06, "loss": 0.0863, "step": 5248 }, { "epoch": 3.6809256661991583, "grad_norm": 0.2700556218624115, "learning_rate": 8.235991996715525e-06, "loss": 0.0805, "step": 5249 }, { "epoch": 3.6816269284712484, "grad_norm": 0.2472744584083557, "learning_rate": 8.227752470090055e-06, "loss": 0.083, "step": 5250 }, { "epoch": 3.682328190743338, "grad_norm": 0.29534342885017395, "learning_rate": 8.219516255173634e-06, "loss": 0.2564, "step": 5251 }, { "epoch": 3.6830294530154277, "grad_norm": 0.31181731820106506, "learning_rate": 8.21128335359251e-06, "loss": 0.2522, "step": 5252 }, { "epoch": 3.6837307152875174, "grad_norm": 0.24630707502365112, "learning_rate": 8.203053766972285e-06, "loss": 0.0823, "step": 5253 }, { "epoch": 3.6844319775596075, "grad_norm": 0.2516384720802307, "learning_rate": 8.194827496937896e-06, "loss": 0.0834, "step": 5254 }, { "epoch": 3.685133239831697, "grad_norm": 0.2378687560558319, "learning_rate": 8.186604545113648e-06, "loss": 0.0868, "step": 5255 }, { "epoch": 3.685834502103787, "grad_norm": 0.25151902437210083, "learning_rate": 8.178384913123166e-06, "loss": 0.0839, "step": 5256 }, { "epoch": 3.6865357643758765, "grad_norm": 0.24495981633663177, "learning_rate": 8.17016860258944e-06, "loss": 0.0817, "step": 5257 }, { "epoch": 3.687237026647966, "grad_norm": 0.24936284124851227, "learning_rate": 8.161955615134784e-06, "loss": 0.0841, "step": 5258 }, { "epoch": 3.6879382889200563, "grad_norm": 3.748654842376709, "learning_rate": 8.153745952380887e-06, "loss": 0.4523, "step": 5259 }, { "epoch": 3.688639551192146, "grad_norm": 0.23748359084129333, "learning_rate": 8.14553961594873e-06, "loss": 0.086, "step": 5260 }, { "epoch": 3.6893408134642356, "grad_norm": 0.2377035915851593, "learning_rate": 8.137336607458693e-06, "loss": 0.0862, "step": 5261 }, { "epoch": 3.6900420757363253, "grad_norm": 0.3043361008167267, "learning_rate": 8.129136928530456e-06, "loss": 0.2547, "step": 5262 }, { "epoch": 3.6907433380084154, "grad_norm": 0.23611673712730408, "learning_rate": 8.120940580783078e-06, "loss": 0.0858, "step": 5263 }, { "epoch": 3.691444600280505, "grad_norm": 0.23691421747207642, "learning_rate": 8.112747565834927e-06, "loss": 0.0856, "step": 5264 }, { "epoch": 3.6921458625525947, "grad_norm": 0.23621992766857147, "learning_rate": 8.10455788530374e-06, "loss": 0.086, "step": 5265 }, { "epoch": 3.6928471248246844, "grad_norm": 0.24893823266029358, "learning_rate": 8.09637154080658e-06, "loss": 0.0835, "step": 5266 }, { "epoch": 3.693548387096774, "grad_norm": 0.2683342397212982, "learning_rate": 8.08818853395985e-06, "loss": 0.0789, "step": 5267 }, { "epoch": 3.6942496493688637, "grad_norm": 0.40940535068511963, "learning_rate": 8.080008866379294e-06, "loss": 0.2596, "step": 5268 }, { "epoch": 3.694950911640954, "grad_norm": 0.3083398640155792, "learning_rate": 8.071832539680016e-06, "loss": 0.2599, "step": 5269 }, { "epoch": 3.6956521739130435, "grad_norm": 2.0516879558563232, "learning_rate": 8.06365955547643e-06, "loss": 0.3414, "step": 5270 }, { "epoch": 3.696353436185133, "grad_norm": 0.24906079471111298, "learning_rate": 8.055489915382322e-06, "loss": 0.0818, "step": 5271 }, { "epoch": 3.6970546984572232, "grad_norm": 0.3064082860946655, "learning_rate": 8.047323621010782e-06, "loss": 0.2531, "step": 5272 }, { "epoch": 3.697755960729313, "grad_norm": 0.3160126507282257, "learning_rate": 8.039160673974292e-06, "loss": 0.2482, "step": 5273 }, { "epoch": 3.6984572230014026, "grad_norm": 0.3837699890136719, "learning_rate": 8.031001075884598e-06, "loss": 0.2624, "step": 5274 }, { "epoch": 3.6991584852734922, "grad_norm": 0.3018946945667267, "learning_rate": 8.022844828352858e-06, "loss": 0.2585, "step": 5275 }, { "epoch": 3.699859747545582, "grad_norm": 0.23647743463516235, "learning_rate": 8.014691932989512e-06, "loss": 0.0853, "step": 5276 }, { "epoch": 3.7005610098176716, "grad_norm": 0.23682846128940582, "learning_rate": 8.00654239140439e-06, "loss": 0.0861, "step": 5277 }, { "epoch": 3.7012622720897617, "grad_norm": 0.30152156949043274, "learning_rate": 7.99839620520661e-06, "loss": 0.2588, "step": 5278 }, { "epoch": 3.7019635343618513, "grad_norm": 0.2370060235261917, "learning_rate": 7.99025337600467e-06, "loss": 0.086, "step": 5279 }, { "epoch": 3.702664796633941, "grad_norm": 0.8197136521339417, "learning_rate": 7.982113905406374e-06, "loss": 0.4315, "step": 5280 }, { "epoch": 3.703366058906031, "grad_norm": 0.25089940428733826, "learning_rate": 7.973977795018877e-06, "loss": 0.0828, "step": 5281 }, { "epoch": 3.7040673211781208, "grad_norm": 0.23603582382202148, "learning_rate": 7.965845046448658e-06, "loss": 0.0852, "step": 5282 }, { "epoch": 3.7047685834502104, "grad_norm": 0.29649555683135986, "learning_rate": 7.957715661301559e-06, "loss": 0.2571, "step": 5283 }, { "epoch": 3.7054698457223, "grad_norm": 1.3740966320037842, "learning_rate": 7.949589641182729e-06, "loss": 0.6057, "step": 5284 }, { "epoch": 3.7061711079943898, "grad_norm": 0.24947036802768707, "learning_rate": 7.941466987696675e-06, "loss": 0.0827, "step": 5285 }, { "epoch": 3.7068723702664794, "grad_norm": 0.2964232265949249, "learning_rate": 7.933347702447214e-06, "loss": 0.2555, "step": 5286 }, { "epoch": 3.7075736325385695, "grad_norm": 0.23706506192684174, "learning_rate": 7.925231787037532e-06, "loss": 0.0864, "step": 5287 }, { "epoch": 3.708274894810659, "grad_norm": 0.25082290172576904, "learning_rate": 7.917119243070117e-06, "loss": 0.0845, "step": 5288 }, { "epoch": 3.708976157082749, "grad_norm": 1.3260141611099243, "learning_rate": 7.909010072146803e-06, "loss": 0.6023, "step": 5289 }, { "epoch": 3.709677419354839, "grad_norm": 0.2367418110370636, "learning_rate": 7.90090427586877e-06, "loss": 0.0859, "step": 5290 }, { "epoch": 3.7103786816269286, "grad_norm": 0.23580990731716156, "learning_rate": 7.892801855836518e-06, "loss": 0.0861, "step": 5291 }, { "epoch": 3.7110799438990183, "grad_norm": 0.31397545337677, "learning_rate": 7.884702813649874e-06, "loss": 0.2512, "step": 5292 }, { "epoch": 3.711781206171108, "grad_norm": 0.2376631200313568, "learning_rate": 7.876607150908023e-06, "loss": 0.0858, "step": 5293 }, { "epoch": 3.7124824684431976, "grad_norm": 0.2370595782995224, "learning_rate": 7.868514869209458e-06, "loss": 0.0862, "step": 5294 }, { "epoch": 3.7131837307152873, "grad_norm": 0.2530239224433899, "learning_rate": 7.86042597015201e-06, "loss": 0.0847, "step": 5295 }, { "epoch": 3.7138849929873774, "grad_norm": 0.23795199394226074, "learning_rate": 7.85234045533286e-06, "loss": 0.0866, "step": 5296 }, { "epoch": 3.714586255259467, "grad_norm": 0.2624920904636383, "learning_rate": 7.844258326348488e-06, "loss": 0.0765, "step": 5297 }, { "epoch": 3.7152875175315567, "grad_norm": 0.8776510953903198, "learning_rate": 7.836179584794746e-06, "loss": 0.4372, "step": 5298 }, { "epoch": 3.715988779803647, "grad_norm": 0.23652999103069305, "learning_rate": 7.828104232266773e-06, "loss": 0.0867, "step": 5299 }, { "epoch": 3.7166900420757365, "grad_norm": 0.23729905486106873, "learning_rate": 7.82003227035909e-06, "loss": 0.0868, "step": 5300 }, { "epoch": 3.717391304347826, "grad_norm": 0.2991178631782532, "learning_rate": 7.811963700665487e-06, "loss": 0.257, "step": 5301 }, { "epoch": 3.718092566619916, "grad_norm": 0.25038373470306396, "learning_rate": 7.80389852477914e-06, "loss": 0.083, "step": 5302 }, { "epoch": 3.7187938288920055, "grad_norm": 0.23730604350566864, "learning_rate": 7.795836744292514e-06, "loss": 0.0852, "step": 5303 }, { "epoch": 3.719495091164095, "grad_norm": 0.23760293424129486, "learning_rate": 7.787778360797443e-06, "loss": 0.0857, "step": 5304 }, { "epoch": 3.7201963534361853, "grad_norm": 0.2517298758029938, "learning_rate": 7.779723375885045e-06, "loss": 0.0831, "step": 5305 }, { "epoch": 3.720897615708275, "grad_norm": 0.29759711027145386, "learning_rate": 7.771671791145813e-06, "loss": 0.2554, "step": 5306 }, { "epoch": 3.7215988779803646, "grad_norm": 0.30447134375572205, "learning_rate": 7.763623608169535e-06, "loss": 0.2505, "step": 5307 }, { "epoch": 3.7223001402524543, "grad_norm": 3.713252544403076, "learning_rate": 7.755578828545337e-06, "loss": 0.4459, "step": 5308 }, { "epoch": 3.7230014025245444, "grad_norm": 0.23849524557590485, "learning_rate": 7.747537453861672e-06, "loss": 0.0865, "step": 5309 }, { "epoch": 3.723702664796634, "grad_norm": 0.2700984477996826, "learning_rate": 7.739499485706334e-06, "loss": 0.0798, "step": 5310 }, { "epoch": 3.7244039270687237, "grad_norm": 0.24614618718624115, "learning_rate": 7.731464925666421e-06, "loss": 0.0821, "step": 5311 }, { "epoch": 3.7251051893408134, "grad_norm": 0.3019011914730072, "learning_rate": 7.723433775328384e-06, "loss": 0.2555, "step": 5312 }, { "epoch": 3.725806451612903, "grad_norm": 0.859934389591217, "learning_rate": 7.715406036277974e-06, "loss": 0.4303, "step": 5313 }, { "epoch": 3.726507713884993, "grad_norm": 1.8952268362045288, "learning_rate": 7.707381710100303e-06, "loss": 0.783, "step": 5314 }, { "epoch": 3.727208976157083, "grad_norm": 0.29198509454727173, "learning_rate": 7.69936079837976e-06, "loss": 0.2566, "step": 5315 }, { "epoch": 3.7279102384291725, "grad_norm": 0.26828423142433167, "learning_rate": 7.691343302700107e-06, "loss": 0.0783, "step": 5316 }, { "epoch": 3.728611500701262, "grad_norm": 0.23818759620189667, "learning_rate": 7.6833292246444e-06, "loss": 0.0869, "step": 5317 }, { "epoch": 3.7293127629733522, "grad_norm": 0.3168189227581024, "learning_rate": 7.675318565795044e-06, "loss": 0.2473, "step": 5318 }, { "epoch": 3.730014025245442, "grad_norm": 0.2376745045185089, "learning_rate": 7.667311327733748e-06, "loss": 0.0864, "step": 5319 }, { "epoch": 3.7307152875175316, "grad_norm": 0.24013473093509674, "learning_rate": 7.659307512041566e-06, "loss": 0.0867, "step": 5320 }, { "epoch": 3.7314165497896212, "grad_norm": 0.24592113494873047, "learning_rate": 7.65130712029886e-06, "loss": 0.0806, "step": 5321 }, { "epoch": 3.732117812061711, "grad_norm": 0.3021101951599121, "learning_rate": 7.643310154085318e-06, "loss": 0.2535, "step": 5322 }, { "epoch": 3.732819074333801, "grad_norm": 0.30350321531295776, "learning_rate": 7.635316614979945e-06, "loss": 0.2565, "step": 5323 }, { "epoch": 3.7335203366058907, "grad_norm": 0.8318786025047302, "learning_rate": 7.627326504561103e-06, "loss": 0.4256, "step": 5324 }, { "epoch": 3.7342215988779803, "grad_norm": 0.2974531650543213, "learning_rate": 7.61933982440643e-06, "loss": 0.0752, "step": 5325 }, { "epoch": 3.73492286115007, "grad_norm": 0.2990487813949585, "learning_rate": 7.611356576092926e-06, "loss": 0.251, "step": 5326 }, { "epoch": 3.73562412342216, "grad_norm": 0.23992572724819183, "learning_rate": 7.603376761196887e-06, "loss": 0.0849, "step": 5327 }, { "epoch": 3.7363253856942498, "grad_norm": 2.0102453231811523, "learning_rate": 7.5954003812939574e-06, "loss": 0.3249, "step": 5328 }, { "epoch": 3.7370266479663394, "grad_norm": 0.23843656480312347, "learning_rate": 7.5874274379590606e-06, "loss": 0.0871, "step": 5329 }, { "epoch": 3.737727910238429, "grad_norm": 0.7955589890480042, "learning_rate": 7.57945793276649e-06, "loss": 0.4267, "step": 5330 }, { "epoch": 3.7384291725105188, "grad_norm": 0.8455499410629272, "learning_rate": 7.5714918672898205e-06, "loss": 0.4324, "step": 5331 }, { "epoch": 3.7391304347826084, "grad_norm": 1.3658174276351929, "learning_rate": 7.563529243101985e-06, "loss": 0.6019, "step": 5332 }, { "epoch": 3.7398316970546985, "grad_norm": 0.787856936454773, "learning_rate": 7.555570061775199e-06, "loss": 0.4245, "step": 5333 }, { "epoch": 3.740532959326788, "grad_norm": 0.23902763426303864, "learning_rate": 7.547614324881033e-06, "loss": 0.087, "step": 5334 }, { "epoch": 3.741234221598878, "grad_norm": 0.23892851173877716, "learning_rate": 7.5396620339903524e-06, "loss": 0.0873, "step": 5335 }, { "epoch": 3.741935483870968, "grad_norm": 0.307401180267334, "learning_rate": 7.531713190673342e-06, "loss": 0.2585, "step": 5336 }, { "epoch": 3.7426367461430576, "grad_norm": 0.29233935475349426, "learning_rate": 7.523767796499534e-06, "loss": 0.2511, "step": 5337 }, { "epoch": 3.7433380084151473, "grad_norm": 0.24133867025375366, "learning_rate": 7.515825853037739e-06, "loss": 0.0882, "step": 5338 }, { "epoch": 3.744039270687237, "grad_norm": 0.28097254037857056, "learning_rate": 7.507887361856128e-06, "loss": 0.0808, "step": 5339 }, { "epoch": 3.7447405329593266, "grad_norm": 0.25436538457870483, "learning_rate": 7.499952324522158e-06, "loss": 0.0868, "step": 5340 }, { "epoch": 3.7454417952314163, "grad_norm": 0.2513497471809387, "learning_rate": 7.492020742602609e-06, "loss": 0.0855, "step": 5341 }, { "epoch": 3.7461430575035064, "grad_norm": 0.2878636419773102, "learning_rate": 7.484092617663604e-06, "loss": 0.2558, "step": 5342 }, { "epoch": 3.746844319775596, "grad_norm": 0.2956228256225586, "learning_rate": 7.476167951270554e-06, "loss": 0.2544, "step": 5343 }, { "epoch": 3.7475455820476857, "grad_norm": 0.24021697044372559, "learning_rate": 7.468246744988192e-06, "loss": 0.0878, "step": 5344 }, { "epoch": 3.748246844319776, "grad_norm": 0.29994964599609375, "learning_rate": 7.460329000380587e-06, "loss": 0.251, "step": 5345 }, { "epoch": 3.7489481065918655, "grad_norm": 0.27320167422294617, "learning_rate": 7.452414719011097e-06, "loss": 0.0816, "step": 5346 }, { "epoch": 3.749649368863955, "grad_norm": 0.24984502792358398, "learning_rate": 7.444503902442426e-06, "loss": 0.084, "step": 5347 }, { "epoch": 3.750350631136045, "grad_norm": 1.9495490789413452, "learning_rate": 7.436596552236563e-06, "loss": 0.4731, "step": 5348 }, { "epoch": 3.7510518934081345, "grad_norm": 0.25271207094192505, "learning_rate": 7.428692669954851e-06, "loss": 0.0847, "step": 5349 }, { "epoch": 3.751753155680224, "grad_norm": 0.3545506000518799, "learning_rate": 7.420792257157893e-06, "loss": 0.2618, "step": 5350 }, { "epoch": 3.7524544179523143, "grad_norm": 0.2409367710351944, "learning_rate": 7.412895315405666e-06, "loss": 0.0881, "step": 5351 }, { "epoch": 3.753155680224404, "grad_norm": 0.2504686415195465, "learning_rate": 7.4050018462574145e-06, "loss": 0.0853, "step": 5352 }, { "epoch": 3.7538569424964936, "grad_norm": 0.3071235716342926, "learning_rate": 7.397111851271735e-06, "loss": 0.2507, "step": 5353 }, { "epoch": 3.7545582047685837, "grad_norm": 0.24277228116989136, "learning_rate": 7.3892253320065055e-06, "loss": 0.0885, "step": 5354 }, { "epoch": 3.7552594670406734, "grad_norm": 0.2895934581756592, "learning_rate": 7.381342290018947e-06, "loss": 0.2506, "step": 5355 }, { "epoch": 3.755960729312763, "grad_norm": 0.7865241169929504, "learning_rate": 7.373462726865574e-06, "loss": 0.4188, "step": 5356 }, { "epoch": 3.7566619915848527, "grad_norm": 0.2408694177865982, "learning_rate": 7.365586644102215e-06, "loss": 0.0883, "step": 5357 }, { "epoch": 3.7573632538569424, "grad_norm": 0.2923837900161743, "learning_rate": 7.357714043284017e-06, "loss": 0.25, "step": 5358 }, { "epoch": 3.758064516129032, "grad_norm": 0.33781710267066956, "learning_rate": 7.349844925965446e-06, "loss": 0.2469, "step": 5359 }, { "epoch": 3.758765778401122, "grad_norm": 0.285579651594162, "learning_rate": 7.34197929370026e-06, "loss": 0.2546, "step": 5360 }, { "epoch": 3.759467040673212, "grad_norm": 0.2565334141254425, "learning_rate": 7.334117148041555e-06, "loss": 0.0862, "step": 5361 }, { "epoch": 3.7601683029453015, "grad_norm": 0.25807279348373413, "learning_rate": 7.326258490541715e-06, "loss": 0.0845, "step": 5362 }, { "epoch": 3.7608695652173916, "grad_norm": 0.2584235668182373, "learning_rate": 7.318403322752465e-06, "loss": 0.0861, "step": 5363 }, { "epoch": 3.7615708274894812, "grad_norm": 0.2408040165901184, "learning_rate": 7.310551646224789e-06, "loss": 0.0883, "step": 5364 }, { "epoch": 3.762272089761571, "grad_norm": 0.24366158246994019, "learning_rate": 7.302703462509042e-06, "loss": 0.0879, "step": 5365 }, { "epoch": 3.7629733520336606, "grad_norm": 0.2883383333683014, "learning_rate": 7.2948587731548415e-06, "loss": 0.2523, "step": 5366 }, { "epoch": 3.7636746143057502, "grad_norm": 0.3013887405395508, "learning_rate": 7.2870175797111504e-06, "loss": 0.2462, "step": 5367 }, { "epoch": 3.76437587657784, "grad_norm": 0.25450029969215393, "learning_rate": 7.279179883726212e-06, "loss": 0.0854, "step": 5368 }, { "epoch": 3.76507713884993, "grad_norm": 0.30943676829338074, "learning_rate": 7.2713456867476075e-06, "loss": 0.2563, "step": 5369 }, { "epoch": 3.7657784011220197, "grad_norm": 1.2894996404647827, "learning_rate": 7.263514990322207e-06, "loss": 0.5901, "step": 5370 }, { "epoch": 3.7664796633941093, "grad_norm": 0.2421075850725174, "learning_rate": 7.2556877959961935e-06, "loss": 0.0888, "step": 5371 }, { "epoch": 3.767180925666199, "grad_norm": 0.25335219502449036, "learning_rate": 7.24786410531505e-06, "loss": 0.086, "step": 5372 }, { "epoch": 3.767882187938289, "grad_norm": 0.2872631549835205, "learning_rate": 7.240043919823595e-06, "loss": 0.2506, "step": 5373 }, { "epoch": 3.7685834502103788, "grad_norm": 0.24305406212806702, "learning_rate": 7.232227241065923e-06, "loss": 0.0886, "step": 5374 }, { "epoch": 3.7692847124824684, "grad_norm": 0.25585490465164185, "learning_rate": 7.224414070585464e-06, "loss": 0.0851, "step": 5375 }, { "epoch": 3.769985974754558, "grad_norm": 0.2509823143482208, "learning_rate": 7.216604409924929e-06, "loss": 0.0844, "step": 5376 }, { "epoch": 3.7706872370266478, "grad_norm": 0.24228189885616302, "learning_rate": 7.20879826062637e-06, "loss": 0.0889, "step": 5377 }, { "epoch": 3.771388499298738, "grad_norm": 0.2871370017528534, "learning_rate": 7.200995624231094e-06, "loss": 0.25, "step": 5378 }, { "epoch": 3.7720897615708275, "grad_norm": 0.2546713054180145, "learning_rate": 7.19319650227977e-06, "loss": 0.0851, "step": 5379 }, { "epoch": 3.772791023842917, "grad_norm": 0.252826064825058, "learning_rate": 7.185400896312328e-06, "loss": 0.0853, "step": 5380 }, { "epoch": 3.773492286115007, "grad_norm": 0.2885427474975586, "learning_rate": 7.177608807868044e-06, "loss": 0.2505, "step": 5381 }, { "epoch": 3.774193548387097, "grad_norm": 0.2558538317680359, "learning_rate": 7.169820238485461e-06, "loss": 0.0851, "step": 5382 }, { "epoch": 3.7748948106591866, "grad_norm": 0.25203651189804077, "learning_rate": 7.162035189702462e-06, "loss": 0.0845, "step": 5383 }, { "epoch": 3.7755960729312763, "grad_norm": 0.2910119295120239, "learning_rate": 7.154253663056212e-06, "loss": 0.2501, "step": 5384 }, { "epoch": 3.776297335203366, "grad_norm": 0.253018319606781, "learning_rate": 7.146475660083176e-06, "loss": 0.0844, "step": 5385 }, { "epoch": 3.7769985974754556, "grad_norm": 0.2457464188337326, "learning_rate": 7.138701182319152e-06, "loss": 0.0818, "step": 5386 }, { "epoch": 3.7776998597475457, "grad_norm": 0.27884235978126526, "learning_rate": 7.130930231299215e-06, "loss": 0.0806, "step": 5387 }, { "epoch": 3.7784011220196354, "grad_norm": 0.254220575094223, "learning_rate": 7.123162808557743e-06, "loss": 0.0845, "step": 5388 }, { "epoch": 3.779102384291725, "grad_norm": 0.2422652691602707, "learning_rate": 7.115398915628446e-06, "loss": 0.0876, "step": 5389 }, { "epoch": 3.7798036465638147, "grad_norm": 0.28697991371154785, "learning_rate": 7.107638554044302e-06, "loss": 0.2547, "step": 5390 }, { "epoch": 3.780504908835905, "grad_norm": 0.24159656465053558, "learning_rate": 7.099881725337621e-06, "loss": 0.0882, "step": 5391 }, { "epoch": 3.7812061711079945, "grad_norm": 0.29962876439094543, "learning_rate": 7.092128431039993e-06, "loss": 0.2532, "step": 5392 }, { "epoch": 3.781907433380084, "grad_norm": 0.3101854622364044, "learning_rate": 7.0843786726823156e-06, "loss": 0.251, "step": 5393 }, { "epoch": 3.782608695652174, "grad_norm": 0.24066154658794403, "learning_rate": 7.076632451794807e-06, "loss": 0.0866, "step": 5394 }, { "epoch": 3.7833099579242635, "grad_norm": 0.2981026768684387, "learning_rate": 7.068889769906955e-06, "loss": 0.2572, "step": 5395 }, { "epoch": 3.784011220196353, "grad_norm": 0.2769355773925781, "learning_rate": 7.06115062854758e-06, "loss": 0.0796, "step": 5396 }, { "epoch": 3.7847124824684433, "grad_norm": 0.27793318033218384, "learning_rate": 7.053415029244775e-06, "loss": 0.0826, "step": 5397 }, { "epoch": 3.785413744740533, "grad_norm": 0.24728776514530182, "learning_rate": 7.045682973525969e-06, "loss": 0.0865, "step": 5398 }, { "epoch": 3.7861150070126226, "grad_norm": 0.2906951308250427, "learning_rate": 7.037954462917839e-06, "loss": 0.254, "step": 5399 }, { "epoch": 3.7868162692847127, "grad_norm": 0.2417083978652954, "learning_rate": 7.030229498946417e-06, "loss": 0.088, "step": 5400 }, { "epoch": 3.7875175315568024, "grad_norm": 0.31219759583473206, "learning_rate": 7.022508083136995e-06, "loss": 0.259, "step": 5401 }, { "epoch": 3.788218793828892, "grad_norm": 0.2546223998069763, "learning_rate": 7.014790217014194e-06, "loss": 0.084, "step": 5402 }, { "epoch": 3.7889200561009817, "grad_norm": 0.29225781559944153, "learning_rate": 7.007075902101903e-06, "loss": 0.2544, "step": 5403 }, { "epoch": 3.7896213183730714, "grad_norm": 0.25236186385154724, "learning_rate": 6.999365139923347e-06, "loss": 0.0836, "step": 5404 }, { "epoch": 3.790322580645161, "grad_norm": 0.24030935764312744, "learning_rate": 6.99165793200102e-06, "loss": 0.0875, "step": 5405 }, { "epoch": 3.791023842917251, "grad_norm": 0.24150829017162323, "learning_rate": 6.983954279856719e-06, "loss": 0.0878, "step": 5406 }, { "epoch": 3.791725105189341, "grad_norm": 0.24169166386127472, "learning_rate": 6.976254185011538e-06, "loss": 0.0869, "step": 5407 }, { "epoch": 3.7924263674614305, "grad_norm": 0.25208738446235657, "learning_rate": 6.968557648985893e-06, "loss": 0.0826, "step": 5408 }, { "epoch": 3.7931276297335206, "grad_norm": 0.25720009207725525, "learning_rate": 6.960864673299458e-06, "loss": 0.083, "step": 5409 }, { "epoch": 3.7938288920056102, "grad_norm": 0.39066189527511597, "learning_rate": 6.9531752594712425e-06, "loss": 0.266, "step": 5410 }, { "epoch": 3.7945301542777, "grad_norm": 0.2989625334739685, "learning_rate": 6.945489409019518e-06, "loss": 0.2568, "step": 5411 }, { "epoch": 3.7952314165497896, "grad_norm": 0.24072355031967163, "learning_rate": 6.9378071234618926e-06, "loss": 0.086, "step": 5412 }, { "epoch": 3.7959326788218792, "grad_norm": 0.2396738976240158, "learning_rate": 6.930128404315214e-06, "loss": 0.0862, "step": 5413 }, { "epoch": 3.796633941093969, "grad_norm": 0.23966102302074432, "learning_rate": 6.922453253095684e-06, "loss": 0.0865, "step": 5414 }, { "epoch": 3.797335203366059, "grad_norm": 0.7963161468505859, "learning_rate": 6.914781671318757e-06, "loss": 0.4243, "step": 5415 }, { "epoch": 3.7980364656381487, "grad_norm": 2.6236226558685303, "learning_rate": 6.907113660499218e-06, "loss": 0.3602, "step": 5416 }, { "epoch": 3.7987377279102383, "grad_norm": 0.24251660704612732, "learning_rate": 6.899449222151108e-06, "loss": 0.0872, "step": 5417 }, { "epoch": 3.7994389901823284, "grad_norm": 0.39198195934295654, "learning_rate": 6.8917883577878065e-06, "loss": 0.2594, "step": 5418 }, { "epoch": 3.800140252454418, "grad_norm": 0.24905410408973694, "learning_rate": 6.884131068921951e-06, "loss": 0.083, "step": 5419 }, { "epoch": 3.8008415147265078, "grad_norm": 0.24023479223251343, "learning_rate": 6.876477357065489e-06, "loss": 0.0799, "step": 5420 }, { "epoch": 3.8015427769985974, "grad_norm": 0.30056673288345337, "learning_rate": 6.8688272237296515e-06, "loss": 0.2563, "step": 5421 }, { "epoch": 3.802244039270687, "grad_norm": 0.248845636844635, "learning_rate": 6.861180670424983e-06, "loss": 0.0829, "step": 5422 }, { "epoch": 3.8029453015427768, "grad_norm": 0.3264840543270111, "learning_rate": 6.8535376986612946e-06, "loss": 0.2543, "step": 5423 }, { "epoch": 3.803646563814867, "grad_norm": 0.24794748425483704, "learning_rate": 6.845898309947721e-06, "loss": 0.082, "step": 5424 }, { "epoch": 3.8043478260869565, "grad_norm": 0.3143104314804077, "learning_rate": 6.838262505792656e-06, "loss": 0.2581, "step": 5425 }, { "epoch": 3.805049088359046, "grad_norm": 0.3282707929611206, "learning_rate": 6.8306302877038265e-06, "loss": 0.243, "step": 5426 }, { "epoch": 3.8057503506311363, "grad_norm": 0.2575676441192627, "learning_rate": 6.823001657188194e-06, "loss": 0.0853, "step": 5427 }, { "epoch": 3.806451612903226, "grad_norm": 0.3211805820465088, "learning_rate": 6.81537661575207e-06, "loss": 0.2462, "step": 5428 }, { "epoch": 3.8071528751753156, "grad_norm": 0.36368685960769653, "learning_rate": 6.807755164901014e-06, "loss": 0.079, "step": 5429 }, { "epoch": 3.8078541374474053, "grad_norm": 0.2527666389942169, "learning_rate": 6.800137306139911e-06, "loss": 0.0818, "step": 5430 }, { "epoch": 3.808555399719495, "grad_norm": 0.3000592291355133, "learning_rate": 6.792523040972909e-06, "loss": 0.2581, "step": 5431 }, { "epoch": 3.8092566619915846, "grad_norm": 0.29655757546424866, "learning_rate": 6.784912370903468e-06, "loss": 0.2559, "step": 5432 }, { "epoch": 3.8099579242636747, "grad_norm": 0.2402981072664261, "learning_rate": 6.777305297434319e-06, "loss": 0.0869, "step": 5433 }, { "epoch": 3.8106591865357644, "grad_norm": 2.9631478786468506, "learning_rate": 6.76970182206749e-06, "loss": 0.5748, "step": 5434 }, { "epoch": 3.811360448807854, "grad_norm": 0.30487048625946045, "learning_rate": 6.762101946304311e-06, "loss": 0.2493, "step": 5435 }, { "epoch": 3.8120617110799437, "grad_norm": 0.2734115421772003, "learning_rate": 6.7545056716453834e-06, "loss": 0.0797, "step": 5436 }, { "epoch": 3.812762973352034, "grad_norm": 0.2430063635110855, "learning_rate": 6.746912999590601e-06, "loss": 0.087, "step": 5437 }, { "epoch": 3.8134642356241235, "grad_norm": 0.2507326602935791, "learning_rate": 6.7393239316391606e-06, "loss": 0.0832, "step": 5438 }, { "epoch": 3.814165497896213, "grad_norm": 0.25206923484802246, "learning_rate": 6.731738469289528e-06, "loss": 0.0827, "step": 5439 }, { "epoch": 3.814866760168303, "grad_norm": 2.534153938293457, "learning_rate": 6.724156614039462e-06, "loss": 0.3839, "step": 5440 }, { "epoch": 3.8155680224403925, "grad_norm": 0.24198079109191895, "learning_rate": 6.716578367386029e-06, "loss": 0.0872, "step": 5441 }, { "epoch": 3.8162692847124826, "grad_norm": 0.24186435341835022, "learning_rate": 6.709003730825547e-06, "loss": 0.0809, "step": 5442 }, { "epoch": 3.8169705469845723, "grad_norm": 0.24099087715148926, "learning_rate": 6.701432705853658e-06, "loss": 0.0875, "step": 5443 }, { "epoch": 3.817671809256662, "grad_norm": 0.29906195402145386, "learning_rate": 6.693865293965262e-06, "loss": 0.2589, "step": 5444 }, { "epoch": 3.8183730715287516, "grad_norm": 0.24395059049129486, "learning_rate": 6.686301496654568e-06, "loss": 0.0821, "step": 5445 }, { "epoch": 3.8190743338008417, "grad_norm": 0.294448584318161, "learning_rate": 6.678741315415055e-06, "loss": 0.2565, "step": 5446 }, { "epoch": 3.8197755960729314, "grad_norm": 0.2510043978691101, "learning_rate": 6.671184751739493e-06, "loss": 0.0827, "step": 5447 }, { "epoch": 3.820476858345021, "grad_norm": 0.2407764047384262, "learning_rate": 6.663631807119933e-06, "loss": 0.0873, "step": 5448 }, { "epoch": 3.8211781206171107, "grad_norm": 0.37755078077316284, "learning_rate": 6.656082483047729e-06, "loss": 0.2624, "step": 5449 }, { "epoch": 3.8218793828892004, "grad_norm": 0.30696776509284973, "learning_rate": 6.648536781013495e-06, "loss": 0.2486, "step": 5450 }, { "epoch": 3.8225806451612905, "grad_norm": 0.29723286628723145, "learning_rate": 6.6409947025071585e-06, "loss": 0.2511, "step": 5451 }, { "epoch": 3.82328190743338, "grad_norm": 0.3046914339065552, "learning_rate": 6.6334562490179e-06, "loss": 0.2528, "step": 5452 }, { "epoch": 3.82398316970547, "grad_norm": 0.30437102913856506, "learning_rate": 6.6259214220342205e-06, "loss": 0.2537, "step": 5453 }, { "epoch": 3.8246844319775595, "grad_norm": 2.151183605194092, "learning_rate": 6.6183902230438575e-06, "loss": 0.3411, "step": 5454 }, { "epoch": 3.8253856942496496, "grad_norm": 0.25445878505706787, "learning_rate": 6.610862653533881e-06, "loss": 0.0759, "step": 5455 }, { "epoch": 3.8260869565217392, "grad_norm": 0.2953350245952606, "learning_rate": 6.603338714990606e-06, "loss": 0.2507, "step": 5456 }, { "epoch": 3.826788218793829, "grad_norm": 0.8019499182701111, "learning_rate": 6.595818408899662e-06, "loss": 0.4255, "step": 5457 }, { "epoch": 3.8274894810659186, "grad_norm": 0.2982352674007416, "learning_rate": 6.588301736745933e-06, "loss": 0.2588, "step": 5458 }, { "epoch": 3.8281907433380082, "grad_norm": 0.30549386143684387, "learning_rate": 6.580788700013615e-06, "loss": 0.0765, "step": 5459 }, { "epoch": 3.828892005610098, "grad_norm": 0.30980184674263, "learning_rate": 6.573279300186161e-06, "loss": 0.2533, "step": 5460 }, { "epoch": 3.829593267882188, "grad_norm": 0.25458770990371704, "learning_rate": 6.565773538746312e-06, "loss": 0.0843, "step": 5461 }, { "epoch": 3.8302945301542777, "grad_norm": 0.24110259115695953, "learning_rate": 6.55827141717609e-06, "loss": 0.0881, "step": 5462 }, { "epoch": 3.8309957924263673, "grad_norm": 0.8568136096000671, "learning_rate": 6.5507729369568145e-06, "loss": 0.4341, "step": 5463 }, { "epoch": 3.8316970546984574, "grad_norm": 0.31577152013778687, "learning_rate": 6.543278099569059e-06, "loss": 0.2512, "step": 5464 }, { "epoch": 3.832398316970547, "grad_norm": 0.27407580614089966, "learning_rate": 6.535786906492708e-06, "loss": 0.0809, "step": 5465 }, { "epoch": 3.833099579242637, "grad_norm": 0.2525399923324585, "learning_rate": 6.5282993592068934e-06, "loss": 0.0843, "step": 5466 }, { "epoch": 3.8338008415147264, "grad_norm": 0.24164234101772308, "learning_rate": 6.520815459190066e-06, "loss": 0.0879, "step": 5467 }, { "epoch": 3.834502103786816, "grad_norm": 0.3035660684108734, "learning_rate": 6.51333520791991e-06, "loss": 0.2566, "step": 5468 }, { "epoch": 3.8352033660589058, "grad_norm": 0.2504511773586273, "learning_rate": 6.505858606873433e-06, "loss": 0.0848, "step": 5469 }, { "epoch": 3.835904628330996, "grad_norm": 0.30439844727516174, "learning_rate": 6.4983856575268844e-06, "loss": 0.2532, "step": 5470 }, { "epoch": 3.8366058906030855, "grad_norm": 0.2819840610027313, "learning_rate": 6.490916361355831e-06, "loss": 0.2512, "step": 5471 }, { "epoch": 3.837307152875175, "grad_norm": 0.24117663502693176, "learning_rate": 6.4834507198350825e-06, "loss": 0.0879, "step": 5472 }, { "epoch": 3.8380084151472653, "grad_norm": 0.24505361914634705, "learning_rate": 6.475988734438754e-06, "loss": 0.0878, "step": 5473 }, { "epoch": 3.838709677419355, "grad_norm": 0.28255611658096313, "learning_rate": 6.468530406640222e-06, "loss": 0.082, "step": 5474 }, { "epoch": 3.8394109396914446, "grad_norm": 0.29785287380218506, "learning_rate": 6.461075737912148e-06, "loss": 0.2532, "step": 5475 }, { "epoch": 3.8401122019635343, "grad_norm": 0.25210559368133545, "learning_rate": 6.453624729726459e-06, "loss": 0.0835, "step": 5476 }, { "epoch": 3.840813464235624, "grad_norm": 0.273162841796875, "learning_rate": 6.446177383554386e-06, "loss": 0.0806, "step": 5477 }, { "epoch": 3.8415147265077136, "grad_norm": 0.34906965494155884, "learning_rate": 6.4387337008664035e-06, "loss": 0.2391, "step": 5478 }, { "epoch": 3.8422159887798037, "grad_norm": 0.29496318101882935, "learning_rate": 6.4312936831322945e-06, "loss": 0.2553, "step": 5479 }, { "epoch": 3.8429172510518934, "grad_norm": 0.2545210123062134, "learning_rate": 6.42385733182109e-06, "loss": 0.0842, "step": 5480 }, { "epoch": 3.843618513323983, "grad_norm": 2.392252206802368, "learning_rate": 6.416424648401126e-06, "loss": 0.3489, "step": 5481 }, { "epoch": 3.844319775596073, "grad_norm": 2.073601007461548, "learning_rate": 6.408995634339987e-06, "loss": 0.3453, "step": 5482 }, { "epoch": 3.845021037868163, "grad_norm": 0.2966483533382416, "learning_rate": 6.401570291104541e-06, "loss": 0.2509, "step": 5483 }, { "epoch": 3.8457223001402525, "grad_norm": 0.8012504577636719, "learning_rate": 6.39414862016095e-06, "loss": 0.4208, "step": 5484 }, { "epoch": 3.846423562412342, "grad_norm": 0.24130100011825562, "learning_rate": 6.386730622974624e-06, "loss": 0.0879, "step": 5485 }, { "epoch": 3.847124824684432, "grad_norm": 0.24362802505493164, "learning_rate": 6.379316301010255e-06, "loss": 0.0872, "step": 5486 }, { "epoch": 3.8478260869565215, "grad_norm": 0.29202643036842346, "learning_rate": 6.371905655731828e-06, "loss": 0.2569, "step": 5487 }, { "epoch": 3.8485273492286116, "grad_norm": 0.7996147871017456, "learning_rate": 6.36449868860258e-06, "loss": 0.4215, "step": 5488 }, { "epoch": 3.8492286115007013, "grad_norm": 0.28016263246536255, "learning_rate": 6.357095401085023e-06, "loss": 0.0802, "step": 5489 }, { "epoch": 3.849929873772791, "grad_norm": 0.2807230055332184, "learning_rate": 6.349695794640961e-06, "loss": 0.0816, "step": 5490 }, { "epoch": 3.850631136044881, "grad_norm": 0.24320797622203827, "learning_rate": 6.342299870731447e-06, "loss": 0.0886, "step": 5491 }, { "epoch": 3.8513323983169707, "grad_norm": 0.24206963181495667, "learning_rate": 6.334907630816833e-06, "loss": 0.0885, "step": 5492 }, { "epoch": 3.8520336605890604, "grad_norm": 0.2451656311750412, "learning_rate": 6.327519076356714e-06, "loss": 0.0896, "step": 5493 }, { "epoch": 3.85273492286115, "grad_norm": 0.3063894808292389, "learning_rate": 6.320134208809986e-06, "loss": 0.2444, "step": 5494 }, { "epoch": 3.8534361851332397, "grad_norm": 0.2440517097711563, "learning_rate": 6.312753029634799e-06, "loss": 0.0875, "step": 5495 }, { "epoch": 3.8541374474053294, "grad_norm": 0.7825078368186951, "learning_rate": 6.305375540288577e-06, "loss": 0.4217, "step": 5496 }, { "epoch": 3.8548387096774195, "grad_norm": 0.3188439607620239, "learning_rate": 6.298001742228013e-06, "loss": 0.2485, "step": 5497 }, { "epoch": 3.855539971949509, "grad_norm": 0.2870579659938812, "learning_rate": 6.290631636909092e-06, "loss": 0.2536, "step": 5498 }, { "epoch": 3.856241234221599, "grad_norm": 0.25272512435913086, "learning_rate": 6.283265225787036e-06, "loss": 0.085, "step": 5499 }, { "epoch": 3.8569424964936885, "grad_norm": 0.24212782084941864, "learning_rate": 6.27590251031637e-06, "loss": 0.0881, "step": 5500 }, { "epoch": 3.8576437587657786, "grad_norm": 0.25536277890205383, "learning_rate": 6.268543491950862e-06, "loss": 0.0849, "step": 5501 }, { "epoch": 3.8583450210378682, "grad_norm": 0.7907336950302124, "learning_rate": 6.261188172143584e-06, "loss": 0.4235, "step": 5502 }, { "epoch": 3.859046283309958, "grad_norm": 0.24267630279064178, "learning_rate": 6.253836552346828e-06, "loss": 0.0884, "step": 5503 }, { "epoch": 3.8597475455820476, "grad_norm": 0.29104068875312805, "learning_rate": 6.246488634012204e-06, "loss": 0.2527, "step": 5504 }, { "epoch": 3.8604488078541372, "grad_norm": 0.24489395320415497, "learning_rate": 6.239144418590559e-06, "loss": 0.0886, "step": 5505 }, { "epoch": 3.8611500701262274, "grad_norm": 0.27668237686157227, "learning_rate": 6.2318039075320325e-06, "loss": 0.0812, "step": 5506 }, { "epoch": 3.861851332398317, "grad_norm": 0.24189415574073792, "learning_rate": 6.224467102286011e-06, "loss": 0.0889, "step": 5507 }, { "epoch": 3.8625525946704067, "grad_norm": 0.2509620785713196, "learning_rate": 6.21713400430117e-06, "loss": 0.0847, "step": 5508 }, { "epoch": 3.8632538569424963, "grad_norm": 0.2518733739852905, "learning_rate": 6.209804615025436e-06, "loss": 0.0833, "step": 5509 }, { "epoch": 3.8639551192145865, "grad_norm": 0.2821063995361328, "learning_rate": 6.202478935906008e-06, "loss": 0.2539, "step": 5510 }, { "epoch": 3.864656381486676, "grad_norm": 0.2841748893260956, "learning_rate": 6.1951569683893516e-06, "loss": 0.0841, "step": 5511 }, { "epoch": 3.865357643758766, "grad_norm": 0.2450958490371704, "learning_rate": 6.187838713921212e-06, "loss": 0.0874, "step": 5512 }, { "epoch": 3.8660589060308554, "grad_norm": 0.30584990978240967, "learning_rate": 6.180524173946575e-06, "loss": 0.2501, "step": 5513 }, { "epoch": 3.866760168302945, "grad_norm": 0.24309182167053223, "learning_rate": 6.173213349909729e-06, "loss": 0.0888, "step": 5514 }, { "epoch": 3.867461430575035, "grad_norm": 0.2533075511455536, "learning_rate": 6.165906243254191e-06, "loss": 0.0851, "step": 5515 }, { "epoch": 3.868162692847125, "grad_norm": 0.281949520111084, "learning_rate": 6.158602855422782e-06, "loss": 0.0839, "step": 5516 }, { "epoch": 3.8688639551192145, "grad_norm": 0.30994129180908203, "learning_rate": 6.151303187857541e-06, "loss": 0.2427, "step": 5517 }, { "epoch": 3.869565217391304, "grad_norm": 0.7940595149993896, "learning_rate": 6.1440072419998244e-06, "loss": 0.416, "step": 5518 }, { "epoch": 3.8702664796633943, "grad_norm": 0.31110796332359314, "learning_rate": 6.136715019290209e-06, "loss": 0.2481, "step": 5519 }, { "epoch": 3.870967741935484, "grad_norm": 0.25587502121925354, "learning_rate": 6.12942652116858e-06, "loss": 0.0839, "step": 5520 }, { "epoch": 3.8716690042075736, "grad_norm": 0.25076258182525635, "learning_rate": 6.12214174907404e-06, "loss": 0.0834, "step": 5521 }, { "epoch": 3.8723702664796633, "grad_norm": 0.2980094850063324, "learning_rate": 6.114860704444997e-06, "loss": 0.2518, "step": 5522 }, { "epoch": 3.873071528751753, "grad_norm": 0.2542690932750702, "learning_rate": 6.107583388719101e-06, "loss": 0.0844, "step": 5523 }, { "epoch": 3.8737727910238426, "grad_norm": 0.35412654280662537, "learning_rate": 6.100309803333268e-06, "loss": 0.2569, "step": 5524 }, { "epoch": 3.8744740532959328, "grad_norm": 0.24449874460697174, "learning_rate": 6.093039949723675e-06, "loss": 0.0889, "step": 5525 }, { "epoch": 3.8751753155680224, "grad_norm": 0.2694639563560486, "learning_rate": 6.085773829325781e-06, "loss": 0.0809, "step": 5526 }, { "epoch": 3.875876577840112, "grad_norm": 0.24457278847694397, "learning_rate": 6.078511443574278e-06, "loss": 0.0886, "step": 5527 }, { "epoch": 3.876577840112202, "grad_norm": 0.24923065304756165, "learning_rate": 6.071252793903151e-06, "loss": 0.0837, "step": 5528 }, { "epoch": 3.877279102384292, "grad_norm": 0.24167853593826294, "learning_rate": 6.063997881745617e-06, "loss": 0.0886, "step": 5529 }, { "epoch": 3.8779803646563815, "grad_norm": 0.2972221374511719, "learning_rate": 6.056746708534192e-06, "loss": 0.2497, "step": 5530 }, { "epoch": 3.878681626928471, "grad_norm": 0.28531473875045776, "learning_rate": 6.049499275700618e-06, "loss": 0.2536, "step": 5531 }, { "epoch": 3.879382889200561, "grad_norm": 0.30139246582984924, "learning_rate": 6.042255584675907e-06, "loss": 0.2535, "step": 5532 }, { "epoch": 3.8800841514726505, "grad_norm": 0.25647175312042236, "learning_rate": 6.035015636890356e-06, "loss": 0.0843, "step": 5533 }, { "epoch": 3.8807854137447406, "grad_norm": 0.24983102083206177, "learning_rate": 6.027779433773498e-06, "loss": 0.083, "step": 5534 }, { "epoch": 3.8814866760168303, "grad_norm": 0.25172004103660583, "learning_rate": 6.020546976754121e-06, "loss": 0.0829, "step": 5535 }, { "epoch": 3.88218793828892, "grad_norm": 0.23890523612499237, "learning_rate": 6.013318267260307e-06, "loss": 0.0805, "step": 5536 }, { "epoch": 3.88288920056101, "grad_norm": 0.2516293525695801, "learning_rate": 6.006093306719366e-06, "loss": 0.0841, "step": 5537 }, { "epoch": 3.8835904628330997, "grad_norm": 0.3116834759712219, "learning_rate": 5.998872096557873e-06, "loss": 0.2411, "step": 5538 }, { "epoch": 3.8842917251051894, "grad_norm": 4.80709981918335, "learning_rate": 5.99165463820168e-06, "loss": 0.6594, "step": 5539 }, { "epoch": 3.884992987377279, "grad_norm": 0.29628023505210876, "learning_rate": 5.984440933075877e-06, "loss": 0.2567, "step": 5540 }, { "epoch": 3.8856942496493687, "grad_norm": 0.3060000240802765, "learning_rate": 5.977230982604834e-06, "loss": 0.2516, "step": 5541 }, { "epoch": 3.8863955119214584, "grad_norm": 0.8105089068412781, "learning_rate": 5.970024788212153e-06, "loss": 0.4249, "step": 5542 }, { "epoch": 3.8870967741935485, "grad_norm": 0.2504207193851471, "learning_rate": 5.962822351320724e-06, "loss": 0.0839, "step": 5543 }, { "epoch": 3.887798036465638, "grad_norm": 0.2509833872318268, "learning_rate": 5.955623673352676e-06, "loss": 0.0841, "step": 5544 }, { "epoch": 3.888499298737728, "grad_norm": 0.31106042861938477, "learning_rate": 5.948428755729396e-06, "loss": 0.2507, "step": 5545 }, { "epoch": 3.889200561009818, "grad_norm": 0.2934924364089966, "learning_rate": 5.941237599871532e-06, "loss": 0.2538, "step": 5546 }, { "epoch": 3.8899018232819076, "grad_norm": 0.2439972460269928, "learning_rate": 5.934050207198999e-06, "loss": 0.0881, "step": 5547 }, { "epoch": 3.8906030855539973, "grad_norm": 0.3660133481025696, "learning_rate": 5.926866579130946e-06, "loss": 0.2614, "step": 5548 }, { "epoch": 3.891304347826087, "grad_norm": 0.27210909128189087, "learning_rate": 5.919686717085806e-06, "loss": 0.0806, "step": 5549 }, { "epoch": 3.8920056100981766, "grad_norm": 0.28409552574157715, "learning_rate": 5.912510622481249e-06, "loss": 0.0716, "step": 5550 }, { "epoch": 3.8927068723702662, "grad_norm": 0.8064035773277283, "learning_rate": 5.905338296734217e-06, "loss": 0.4069, "step": 5551 }, { "epoch": 3.8934081346423564, "grad_norm": 0.24380213022232056, "learning_rate": 5.8981697412608785e-06, "loss": 0.0883, "step": 5552 }, { "epoch": 3.894109396914446, "grad_norm": 0.25192129611968994, "learning_rate": 5.8910049574766925e-06, "loss": 0.0833, "step": 5553 }, { "epoch": 3.8948106591865357, "grad_norm": 0.24323661625385284, "learning_rate": 5.883843946796349e-06, "loss": 0.0887, "step": 5554 }, { "epoch": 3.895511921458626, "grad_norm": 0.2529640197753906, "learning_rate": 5.8766867106338105e-06, "loss": 0.0832, "step": 5555 }, { "epoch": 3.8962131837307155, "grad_norm": 0.24206623435020447, "learning_rate": 5.869533250402276e-06, "loss": 0.0876, "step": 5556 }, { "epoch": 3.896914446002805, "grad_norm": 0.30776816606521606, "learning_rate": 5.862383567514226e-06, "loss": 0.2572, "step": 5557 }, { "epoch": 3.897615708274895, "grad_norm": 0.25334253907203674, "learning_rate": 5.855237663381352e-06, "loss": 0.0824, "step": 5558 }, { "epoch": 3.8983169705469845, "grad_norm": 0.8019287586212158, "learning_rate": 5.848095539414647e-06, "loss": 0.4263, "step": 5559 }, { "epoch": 3.899018232819074, "grad_norm": 0.25009700655937195, "learning_rate": 5.8409571970243184e-06, "loss": 0.0743, "step": 5560 }, { "epoch": 3.8997194950911642, "grad_norm": 0.8473889231681824, "learning_rate": 5.833822637619859e-06, "loss": 0.4177, "step": 5561 }, { "epoch": 3.900420757363254, "grad_norm": 0.2373170703649521, "learning_rate": 5.826691862609987e-06, "loss": 0.0801, "step": 5562 }, { "epoch": 3.9011220196353436, "grad_norm": 0.26695549488067627, "learning_rate": 5.8195648734027e-06, "loss": 0.0786, "step": 5563 }, { "epoch": 3.901823281907433, "grad_norm": 0.32505708932876587, "learning_rate": 5.812441671405228e-06, "loss": 0.245, "step": 5564 }, { "epoch": 3.9025245441795233, "grad_norm": 0.24413877725601196, "learning_rate": 5.805322258024057e-06, "loss": 0.088, "step": 5565 }, { "epoch": 3.903225806451613, "grad_norm": 0.24516332149505615, "learning_rate": 5.798206634664921e-06, "loss": 0.0873, "step": 5566 }, { "epoch": 3.9039270687237027, "grad_norm": 0.2527678906917572, "learning_rate": 5.791094802732827e-06, "loss": 0.0835, "step": 5567 }, { "epoch": 3.9046283309957923, "grad_norm": 0.27011287212371826, "learning_rate": 5.783986763632005e-06, "loss": 0.0786, "step": 5568 }, { "epoch": 3.905329593267882, "grad_norm": 0.3035660684108734, "learning_rate": 5.776882518765961e-06, "loss": 0.2533, "step": 5569 }, { "epoch": 3.906030855539972, "grad_norm": 0.7914127707481384, "learning_rate": 5.76978206953743e-06, "loss": 0.418, "step": 5570 }, { "epoch": 3.9067321178120618, "grad_norm": 0.24273476004600525, "learning_rate": 5.7626854173484175e-06, "loss": 0.0884, "step": 5571 }, { "epoch": 3.9074333800841514, "grad_norm": 0.31821808218955994, "learning_rate": 5.7555925636001675e-06, "loss": 0.2502, "step": 5572 }, { "epoch": 3.908134642356241, "grad_norm": 2.804966926574707, "learning_rate": 5.748503509693174e-06, "loss": 0.404, "step": 5573 }, { "epoch": 3.908835904628331, "grad_norm": 0.3007810711860657, "learning_rate": 5.741418257027173e-06, "loss": 0.2582, "step": 5574 }, { "epoch": 3.909537166900421, "grad_norm": 0.2962087392807007, "learning_rate": 5.734336807001178e-06, "loss": 0.2573, "step": 5575 }, { "epoch": 3.9102384291725105, "grad_norm": 0.3070695400238037, "learning_rate": 5.727259161013418e-06, "loss": 0.2432, "step": 5576 }, { "epoch": 3.9109396914446, "grad_norm": 0.26503098011016846, "learning_rate": 5.720185320461399e-06, "loss": 0.0781, "step": 5577 }, { "epoch": 3.91164095371669, "grad_norm": 0.3217892050743103, "learning_rate": 5.713115286741858e-06, "loss": 0.2464, "step": 5578 }, { "epoch": 3.91234221598878, "grad_norm": 0.2866806983947754, "learning_rate": 5.706049061250776e-06, "loss": 0.2517, "step": 5579 }, { "epoch": 3.9130434782608696, "grad_norm": 0.24625633656978607, "learning_rate": 5.698986645383408e-06, "loss": 0.0825, "step": 5580 }, { "epoch": 3.9137447405329593, "grad_norm": 0.3120870292186737, "learning_rate": 5.691928040534225e-06, "loss": 0.2456, "step": 5581 }, { "epoch": 3.914446002805049, "grad_norm": 0.2515731155872345, "learning_rate": 5.684873248096975e-06, "loss": 0.0836, "step": 5582 }, { "epoch": 3.915147265077139, "grad_norm": 0.2522079050540924, "learning_rate": 5.6778222694646275e-06, "loss": 0.0844, "step": 5583 }, { "epoch": 3.9158485273492287, "grad_norm": 0.24569150805473328, "learning_rate": 5.670775106029411e-06, "loss": 0.0887, "step": 5584 }, { "epoch": 3.9165497896213184, "grad_norm": 0.24837933480739594, "learning_rate": 5.6637317591828085e-06, "loss": 0.0834, "step": 5585 }, { "epoch": 3.917251051893408, "grad_norm": 0.30844101309776306, "learning_rate": 5.656692230315535e-06, "loss": 0.2512, "step": 5586 }, { "epoch": 3.9179523141654977, "grad_norm": 2.8838679790496826, "learning_rate": 5.649656520817554e-06, "loss": 0.5485, "step": 5587 }, { "epoch": 3.9186535764375874, "grad_norm": 0.28453344106674194, "learning_rate": 5.642624632078086e-06, "loss": 0.2524, "step": 5588 }, { "epoch": 3.9193548387096775, "grad_norm": 2.7780158519744873, "learning_rate": 5.635596565485582e-06, "loss": 0.5396, "step": 5589 }, { "epoch": 3.920056100981767, "grad_norm": 0.30188995599746704, "learning_rate": 5.628572322427755e-06, "loss": 0.2538, "step": 5590 }, { "epoch": 3.920757363253857, "grad_norm": 0.2500148117542267, "learning_rate": 5.621551904291542e-06, "loss": 0.083, "step": 5591 }, { "epoch": 3.921458625525947, "grad_norm": 0.35978156328201294, "learning_rate": 5.61453531246316e-06, "loss": 0.2517, "step": 5592 }, { "epoch": 3.9221598877980366, "grad_norm": 0.25363031029701233, "learning_rate": 5.607522548328012e-06, "loss": 0.085, "step": 5593 }, { "epoch": 3.9228611500701263, "grad_norm": 0.25353437662124634, "learning_rate": 5.6005136132708084e-06, "loss": 0.0838, "step": 5594 }, { "epoch": 3.923562412342216, "grad_norm": 0.24717316031455994, "learning_rate": 5.593508508675458e-06, "loss": 0.0886, "step": 5595 }, { "epoch": 3.9242636746143056, "grad_norm": 0.7919451594352722, "learning_rate": 5.586507235925143e-06, "loss": 0.4222, "step": 5596 }, { "epoch": 3.9249649368863953, "grad_norm": 0.2710493206977844, "learning_rate": 5.579509796402269e-06, "loss": 0.0785, "step": 5597 }, { "epoch": 3.9256661991584854, "grad_norm": 0.24747146666049957, "learning_rate": 5.5725161914884996e-06, "loss": 0.0889, "step": 5598 }, { "epoch": 3.926367461430575, "grad_norm": 0.7828565835952759, "learning_rate": 5.565526422564732e-06, "loss": 0.4163, "step": 5599 }, { "epoch": 3.9270687237026647, "grad_norm": 0.2677403390407562, "learning_rate": 5.558540491011105e-06, "loss": 0.0781, "step": 5600 }, { "epoch": 3.927769985974755, "grad_norm": 0.26049989461898804, "learning_rate": 5.551558398206997e-06, "loss": 0.0867, "step": 5601 }, { "epoch": 3.9284712482468445, "grad_norm": 0.25788527727127075, "learning_rate": 5.544580145531048e-06, "loss": 0.0843, "step": 5602 }, { "epoch": 3.929172510518934, "grad_norm": 0.7844809889793396, "learning_rate": 5.537605734361112e-06, "loss": 0.4125, "step": 5603 }, { "epoch": 3.929873772791024, "grad_norm": 0.2961110472679138, "learning_rate": 5.5306351660743135e-06, "loss": 0.2453, "step": 5604 }, { "epoch": 3.9305750350631135, "grad_norm": 0.24656379222869873, "learning_rate": 5.5236684420469864e-06, "loss": 0.0893, "step": 5605 }, { "epoch": 3.931276297335203, "grad_norm": 0.27586647868156433, "learning_rate": 5.516705563654748e-06, "loss": 0.0807, "step": 5606 }, { "epoch": 3.9319775596072932, "grad_norm": 0.2547518312931061, "learning_rate": 5.509746532272397e-06, "loss": 0.0844, "step": 5607 }, { "epoch": 3.932678821879383, "grad_norm": 0.25822994112968445, "learning_rate": 5.50279134927403e-06, "loss": 0.0774, "step": 5608 }, { "epoch": 3.9333800841514726, "grad_norm": 0.2619676887989044, "learning_rate": 5.4958400160329465e-06, "loss": 0.0844, "step": 5609 }, { "epoch": 3.9340813464235627, "grad_norm": 0.24509936571121216, "learning_rate": 5.488892533921711e-06, "loss": 0.0889, "step": 5610 }, { "epoch": 3.9347826086956523, "grad_norm": 0.25014743208885193, "learning_rate": 5.481948904312104e-06, "loss": 0.0841, "step": 5611 }, { "epoch": 3.935483870967742, "grad_norm": 0.24945703148841858, "learning_rate": 5.47500912857517e-06, "loss": 0.084, "step": 5612 }, { "epoch": 3.9361851332398317, "grad_norm": 0.249843567609787, "learning_rate": 5.468073208081173e-06, "loss": 0.0878, "step": 5613 }, { "epoch": 3.9368863955119213, "grad_norm": 0.2869492471218109, "learning_rate": 5.461141144199622e-06, "loss": 0.2548, "step": 5614 }, { "epoch": 3.937587657784011, "grad_norm": 0.28092360496520996, "learning_rate": 5.454212938299255e-06, "loss": 0.2532, "step": 5615 }, { "epoch": 3.938288920056101, "grad_norm": 0.2928541600704193, "learning_rate": 5.4472885917480785e-06, "loss": 0.25, "step": 5616 }, { "epoch": 3.9389901823281908, "grad_norm": 0.27682915329933167, "learning_rate": 5.440368105913299e-06, "loss": 0.0799, "step": 5617 }, { "epoch": 3.9396914446002804, "grad_norm": 0.24626363813877106, "learning_rate": 5.433451482161392e-06, "loss": 0.0891, "step": 5618 }, { "epoch": 3.9403927068723705, "grad_norm": 0.24645210802555084, "learning_rate": 5.426538721858043e-06, "loss": 0.0887, "step": 5619 }, { "epoch": 3.94109396914446, "grad_norm": 0.24647319316864014, "learning_rate": 5.419629826368208e-06, "loss": 0.0893, "step": 5620 }, { "epoch": 3.94179523141655, "grad_norm": 0.3069833219051361, "learning_rate": 5.412724797056037e-06, "loss": 0.2521, "step": 5621 }, { "epoch": 3.9424964936886395, "grad_norm": 0.2715540826320648, "learning_rate": 5.4058236352849566e-06, "loss": 0.0784, "step": 5622 }, { "epoch": 3.943197755960729, "grad_norm": 0.7876101732254028, "learning_rate": 5.398926342417598e-06, "loss": 0.422, "step": 5623 }, { "epoch": 3.943899018232819, "grad_norm": 0.4056159555912018, "learning_rate": 5.392032919815862e-06, "loss": 0.2431, "step": 5624 }, { "epoch": 3.944600280504909, "grad_norm": 0.2913985848426819, "learning_rate": 5.3851433688408506e-06, "loss": 0.2483, "step": 5625 }, { "epoch": 3.9453015427769986, "grad_norm": 0.2448694258928299, "learning_rate": 5.378257690852931e-06, "loss": 0.0882, "step": 5626 }, { "epoch": 3.9460028050490883, "grad_norm": 0.24801039695739746, "learning_rate": 5.371375887211686e-06, "loss": 0.0883, "step": 5627 }, { "epoch": 3.946704067321178, "grad_norm": 0.321454793214798, "learning_rate": 5.364497959275929e-06, "loss": 0.2449, "step": 5628 }, { "epoch": 3.947405329593268, "grad_norm": 0.8465668559074402, "learning_rate": 5.357623908403739e-06, "loss": 0.4222, "step": 5629 }, { "epoch": 3.9481065918653577, "grad_norm": 0.28978946805000305, "learning_rate": 5.35075373595239e-06, "loss": 0.2558, "step": 5630 }, { "epoch": 3.9488078541374474, "grad_norm": 0.8088217377662659, "learning_rate": 5.343887443278425e-06, "loss": 0.4233, "step": 5631 }, { "epoch": 3.949509116409537, "grad_norm": 0.28686389327049255, "learning_rate": 5.337025031737599e-06, "loss": 0.2543, "step": 5632 }, { "epoch": 3.9502103786816267, "grad_norm": 0.26920831203460693, "learning_rate": 5.3301665026849e-06, "loss": 0.0774, "step": 5633 }, { "epoch": 3.950911640953717, "grad_norm": 0.38477036356925964, "learning_rate": 5.323311857474567e-06, "loss": 0.066, "step": 5634 }, { "epoch": 3.9516129032258065, "grad_norm": 0.7880785465240479, "learning_rate": 5.31646109746006e-06, "loss": 0.4155, "step": 5635 }, { "epoch": 3.952314165497896, "grad_norm": 0.30280810594558716, "learning_rate": 5.309614223994061e-06, "loss": 0.2505, "step": 5636 }, { "epoch": 3.953015427769986, "grad_norm": 1.3159157037734985, "learning_rate": 5.3027712384285135e-06, "loss": 0.5843, "step": 5637 }, { "epoch": 3.953716690042076, "grad_norm": 0.27902814745903015, "learning_rate": 5.295932142114562e-06, "loss": 0.2487, "step": 5638 }, { "epoch": 3.9544179523141656, "grad_norm": 0.383092999458313, "learning_rate": 5.289096936402616e-06, "loss": 0.262, "step": 5639 }, { "epoch": 3.9551192145862553, "grad_norm": 0.24708417057991028, "learning_rate": 5.2822656226422765e-06, "loss": 0.0886, "step": 5640 }, { "epoch": 3.955820476858345, "grad_norm": 0.25562748312950134, "learning_rate": 5.275438202182425e-06, "loss": 0.0841, "step": 5641 }, { "epoch": 3.9565217391304346, "grad_norm": 0.2635519206523895, "learning_rate": 5.26861467637112e-06, "loss": 0.0862, "step": 5642 }, { "epoch": 3.9572230014025247, "grad_norm": 0.24588286876678467, "learning_rate": 5.261795046555695e-06, "loss": 0.0887, "step": 5643 }, { "epoch": 3.9579242636746144, "grad_norm": 0.2776218056678772, "learning_rate": 5.254979314082686e-06, "loss": 0.2513, "step": 5644 }, { "epoch": 3.958625525946704, "grad_norm": 0.24796617031097412, "learning_rate": 5.248167480297886e-06, "loss": 0.0891, "step": 5645 }, { "epoch": 3.9593267882187937, "grad_norm": 0.25495973229408264, "learning_rate": 5.241359546546287e-06, "loss": 0.0833, "step": 5646 }, { "epoch": 3.960028050490884, "grad_norm": 0.2534923553466797, "learning_rate": 5.234555514172143e-06, "loss": 0.083, "step": 5647 }, { "epoch": 3.9607293127629735, "grad_norm": 0.28872817754745483, "learning_rate": 5.2277553845189145e-06, "loss": 0.2538, "step": 5648 }, { "epoch": 3.961430575035063, "grad_norm": 0.37909433245658875, "learning_rate": 5.2209591589292976e-06, "loss": 0.2481, "step": 5649 }, { "epoch": 3.962131837307153, "grad_norm": 0.2888875901699066, "learning_rate": 5.214166838745213e-06, "loss": 0.255, "step": 5650 }, { "epoch": 3.9628330995792425, "grad_norm": 0.2494824379682541, "learning_rate": 5.20737842530783e-06, "loss": 0.0836, "step": 5651 }, { "epoch": 3.963534361851332, "grad_norm": 0.3086130917072296, "learning_rate": 5.200593919957517e-06, "loss": 0.2482, "step": 5652 }, { "epoch": 3.9642356241234222, "grad_norm": 3.929161310195923, "learning_rate": 5.193813324033902e-06, "loss": 0.4563, "step": 5653 }, { "epoch": 3.964936886395512, "grad_norm": 0.2833893895149231, "learning_rate": 5.1870366388758115e-06, "loss": 0.2531, "step": 5654 }, { "epoch": 3.9656381486676016, "grad_norm": 0.8085541129112244, "learning_rate": 5.180263865821333e-06, "loss": 0.4148, "step": 5655 }, { "epoch": 3.9663394109396917, "grad_norm": 0.24961678683757782, "learning_rate": 5.173495006207734e-06, "loss": 0.0888, "step": 5656 }, { "epoch": 3.9670406732117813, "grad_norm": 0.25388070940971375, "learning_rate": 5.166730061371561e-06, "loss": 0.0838, "step": 5657 }, { "epoch": 3.967741935483871, "grad_norm": 0.2574358284473419, "learning_rate": 5.159969032648545e-06, "loss": 0.0847, "step": 5658 }, { "epoch": 3.9684431977559607, "grad_norm": 0.2500324249267578, "learning_rate": 5.153211921373685e-06, "loss": 0.0897, "step": 5659 }, { "epoch": 3.9691444600280503, "grad_norm": 0.2456590086221695, "learning_rate": 5.1464587288811624e-06, "loss": 0.0814, "step": 5660 }, { "epoch": 3.96984572230014, "grad_norm": 0.24571751058101654, "learning_rate": 5.139709456504421e-06, "loss": 0.0891, "step": 5661 }, { "epoch": 3.97054698457223, "grad_norm": 0.29422399401664734, "learning_rate": 5.132964105576116e-06, "loss": 0.2474, "step": 5662 }, { "epoch": 3.9712482468443198, "grad_norm": 0.24923716485500336, "learning_rate": 5.126222677428122e-06, "loss": 0.0892, "step": 5663 }, { "epoch": 3.9719495091164094, "grad_norm": 0.2549764811992645, "learning_rate": 5.119485173391542e-06, "loss": 0.0843, "step": 5664 }, { "epoch": 3.9726507713884995, "grad_norm": 0.2535695433616638, "learning_rate": 5.112751594796717e-06, "loss": 0.0839, "step": 5665 }, { "epoch": 3.973352033660589, "grad_norm": 0.37322622537612915, "learning_rate": 5.106021942973196e-06, "loss": 0.2633, "step": 5666 }, { "epoch": 3.974053295932679, "grad_norm": 0.24952176213264465, "learning_rate": 5.099296219249772e-06, "loss": 0.084, "step": 5667 }, { "epoch": 3.9747545582047685, "grad_norm": 0.24865023791790009, "learning_rate": 5.092574424954433e-06, "loss": 0.0897, "step": 5668 }, { "epoch": 3.975455820476858, "grad_norm": 0.2554607689380646, "learning_rate": 5.085856561414435e-06, "loss": 0.0878, "step": 5669 }, { "epoch": 3.976157082748948, "grad_norm": 0.29084885120391846, "learning_rate": 5.079142629956202e-06, "loss": 0.2562, "step": 5670 }, { "epoch": 3.976858345021038, "grad_norm": 0.2577477991580963, "learning_rate": 5.07243263190543e-06, "loss": 0.0845, "step": 5671 }, { "epoch": 3.9775596072931276, "grad_norm": 0.2577143609523773, "learning_rate": 5.065726568587009e-06, "loss": 0.083, "step": 5672 }, { "epoch": 3.9782608695652173, "grad_norm": 0.28888067603111267, "learning_rate": 5.059024441325072e-06, "loss": 0.2546, "step": 5673 }, { "epoch": 3.9789621318373074, "grad_norm": 0.31200888752937317, "learning_rate": 5.05232625144296e-06, "loss": 0.2406, "step": 5674 }, { "epoch": 3.979663394109397, "grad_norm": 0.251380056142807, "learning_rate": 5.045632000263245e-06, "loss": 0.083, "step": 5675 }, { "epoch": 3.9803646563814867, "grad_norm": 0.24966438114643097, "learning_rate": 5.038941689107721e-06, "loss": 0.0834, "step": 5676 }, { "epoch": 3.9810659186535764, "grad_norm": 0.2523241937160492, "learning_rate": 5.032255319297391e-06, "loss": 0.0833, "step": 5677 }, { "epoch": 3.981767180925666, "grad_norm": 0.25005200505256653, "learning_rate": 5.0255728921525e-06, "loss": 0.0886, "step": 5678 }, { "epoch": 3.9824684431977557, "grad_norm": 0.24757644534111023, "learning_rate": 5.018894408992497e-06, "loss": 0.0892, "step": 5679 }, { "epoch": 3.983169705469846, "grad_norm": 0.30002549290657043, "learning_rate": 5.012219871136071e-06, "loss": 0.2573, "step": 5680 }, { "epoch": 3.9838709677419355, "grad_norm": 0.300220251083374, "learning_rate": 5.005549279901115e-06, "loss": 0.2496, "step": 5681 }, { "epoch": 3.984572230014025, "grad_norm": 0.29249101877212524, "learning_rate": 4.998882636604746e-06, "loss": 0.2468, "step": 5682 }, { "epoch": 3.9852734922861153, "grad_norm": 0.28324103355407715, "learning_rate": 4.992219942563301e-06, "loss": 0.0806, "step": 5683 }, { "epoch": 3.985974754558205, "grad_norm": 0.3086399435997009, "learning_rate": 4.985561199092353e-06, "loss": 0.2487, "step": 5684 }, { "epoch": 3.9866760168302946, "grad_norm": 0.24883660674095154, "learning_rate": 4.978906407506667e-06, "loss": 0.0894, "step": 5685 }, { "epoch": 3.9873772791023843, "grad_norm": 0.30361708998680115, "learning_rate": 4.972255569120257e-06, "loss": 0.251, "step": 5686 }, { "epoch": 3.988078541374474, "grad_norm": 0.2536047101020813, "learning_rate": 4.965608685246331e-06, "loss": 0.0836, "step": 5687 }, { "epoch": 3.9887798036465636, "grad_norm": 0.3167737126350403, "learning_rate": 4.958965757197337e-06, "loss": 0.2453, "step": 5688 }, { "epoch": 3.9894810659186537, "grad_norm": 0.24895183742046356, "learning_rate": 4.95232678628493e-06, "loss": 0.089, "step": 5689 }, { "epoch": 3.9901823281907434, "grad_norm": 0.2471824288368225, "learning_rate": 4.945691773819985e-06, "loss": 0.089, "step": 5690 }, { "epoch": 3.990883590462833, "grad_norm": 0.2750858962535858, "learning_rate": 4.93906072111259e-06, "loss": 0.0807, "step": 5691 }, { "epoch": 3.9915848527349227, "grad_norm": 0.2523891031742096, "learning_rate": 4.932433629472069e-06, "loss": 0.0839, "step": 5692 }, { "epoch": 3.992286115007013, "grad_norm": 0.24687796831130981, "learning_rate": 4.925810500206943e-06, "loss": 0.0891, "step": 5693 }, { "epoch": 3.9929873772791025, "grad_norm": 0.3121369183063507, "learning_rate": 4.91919133462497e-06, "loss": 0.2599, "step": 5694 }, { "epoch": 3.993688639551192, "grad_norm": 1.3135590553283691, "learning_rate": 4.9125761340331065e-06, "loss": 0.5783, "step": 5695 }, { "epoch": 3.994389901823282, "grad_norm": 3.1887402534484863, "learning_rate": 4.9059648997375505e-06, "loss": 0.7089, "step": 5696 }, { "epoch": 3.9950911640953715, "grad_norm": 2.9315531253814697, "learning_rate": 4.899357633043675e-06, "loss": 0.4077, "step": 5697 }, { "epoch": 3.9957924263674616, "grad_norm": 0.2556568682193756, "learning_rate": 4.892754335256119e-06, "loss": 0.0833, "step": 5698 }, { "epoch": 3.9964936886395512, "grad_norm": 0.26401907205581665, "learning_rate": 4.886155007678703e-06, "loss": 0.0748, "step": 5699 }, { "epoch": 3.997194950911641, "grad_norm": 0.24884934723377228, "learning_rate": 4.879559651614482e-06, "loss": 0.0824, "step": 5700 }, { "epoch": 3.9978962131837306, "grad_norm": 0.28453245759010315, "learning_rate": 4.872968268365716e-06, "loss": 0.0707, "step": 5701 }, { "epoch": 3.9985974754558207, "grad_norm": 0.27852222323417664, "learning_rate": 4.86638085923389e-06, "loss": 0.0783, "step": 5702 }, { "epoch": 3.9992987377279103, "grad_norm": 0.2551226317882538, "learning_rate": 4.8597974255196995e-06, "loss": 0.0835, "step": 5703 }, { "epoch": 4.0, "grad_norm": 0.3644556403160095, "learning_rate": 4.853217968523049e-06, "loss": 0.069, "step": 5704 }, { "epoch": 4.0, "eval_f1 (minor class)": 0.0, "eval_loss": 0.17387865483760834, "eval_roc_auc": 0.5271407324201483, "eval_runtime": 231.7397, "eval_samples_per_second": 5.472, "eval_steps_per_second": 1.368, "step": 5704 }, { "epoch": 4.00070126227209, "grad_norm": 0.2965504229068756, "learning_rate": 4.846642489543063e-06, "loss": 0.2507, "step": 5705 }, { "epoch": 4.001402524544179, "grad_norm": 0.24597890675067902, "learning_rate": 4.8400709898780896e-06, "loss": 0.0889, "step": 5706 }, { "epoch": 4.002103786816269, "grad_norm": 0.307608038187027, "learning_rate": 4.83350347082567e-06, "loss": 0.2589, "step": 5707 }, { "epoch": 4.002805049088359, "grad_norm": 0.8032119870185852, "learning_rate": 4.826939933682587e-06, "loss": 0.4269, "step": 5708 }, { "epoch": 4.003506311360449, "grad_norm": 0.265754371881485, "learning_rate": 4.820380379744807e-06, "loss": 0.0771, "step": 5709 }, { "epoch": 4.004207573632539, "grad_norm": 0.28957629203796387, "learning_rate": 4.813824810307546e-06, "loss": 0.2561, "step": 5710 }, { "epoch": 4.0049088359046285, "grad_norm": 0.2517434060573578, "learning_rate": 4.8072732266651874e-06, "loss": 0.0889, "step": 5711 }, { "epoch": 4.005610098176718, "grad_norm": 0.2733239531517029, "learning_rate": 4.800725630111369e-06, "loss": 0.0787, "step": 5712 }, { "epoch": 4.006311360448808, "grad_norm": 0.24643711745738983, "learning_rate": 4.794182021938917e-06, "loss": 0.0888, "step": 5713 }, { "epoch": 4.0070126227208975, "grad_norm": 0.24928128719329834, "learning_rate": 4.787642403439885e-06, "loss": 0.0885, "step": 5714 }, { "epoch": 4.007713884992987, "grad_norm": 0.8453323841094971, "learning_rate": 4.781106775905525e-06, "loss": 0.4206, "step": 5715 }, { "epoch": 4.008415147265077, "grad_norm": 0.24977052211761475, "learning_rate": 4.7745751406263165e-06, "loss": 0.0833, "step": 5716 }, { "epoch": 4.0091164095371665, "grad_norm": 0.2548271417617798, "learning_rate": 4.768047498891937e-06, "loss": 0.0838, "step": 5717 }, { "epoch": 4.009817671809257, "grad_norm": 0.24385394155979156, "learning_rate": 4.761523851991281e-06, "loss": 0.0812, "step": 5718 }, { "epoch": 4.010518934081347, "grad_norm": 0.2537045180797577, "learning_rate": 4.755004201212446e-06, "loss": 0.0828, "step": 5719 }, { "epoch": 4.011220196353436, "grad_norm": 0.2581824064254761, "learning_rate": 4.748488547842761e-06, "loss": 0.0761, "step": 5720 }, { "epoch": 4.011921458625526, "grad_norm": 0.8012170791625977, "learning_rate": 4.741976893168742e-06, "loss": 0.4155, "step": 5721 }, { "epoch": 4.012622720897616, "grad_norm": 0.2859339118003845, "learning_rate": 4.7354692384761395e-06, "loss": 0.2528, "step": 5722 }, { "epoch": 4.013323983169705, "grad_norm": 0.25030839443206787, "learning_rate": 4.728965585049885e-06, "loss": 0.0824, "step": 5723 }, { "epoch": 4.014025245441795, "grad_norm": 0.2872909903526306, "learning_rate": 4.722465934174153e-06, "loss": 0.2541, "step": 5724 }, { "epoch": 4.014726507713885, "grad_norm": 0.24851122498512268, "learning_rate": 4.715970287132301e-06, "loss": 0.082, "step": 5725 }, { "epoch": 4.015427769985974, "grad_norm": 0.8064838647842407, "learning_rate": 4.709478645206902e-06, "loss": 0.4161, "step": 5726 }, { "epoch": 4.016129032258065, "grad_norm": 0.3049415946006775, "learning_rate": 4.7029910096797495e-06, "loss": 0.2561, "step": 5727 }, { "epoch": 4.016830294530155, "grad_norm": 0.24630972743034363, "learning_rate": 4.696507381831838e-06, "loss": 0.0892, "step": 5728 }, { "epoch": 4.017531556802244, "grad_norm": 0.24948835372924805, "learning_rate": 4.690027762943364e-06, "loss": 0.0828, "step": 5729 }, { "epoch": 4.018232819074334, "grad_norm": 0.2531147599220276, "learning_rate": 4.683552154293747e-06, "loss": 0.0822, "step": 5730 }, { "epoch": 4.018934081346424, "grad_norm": 0.3200483024120331, "learning_rate": 4.677080557161603e-06, "loss": 0.2417, "step": 5731 }, { "epoch": 4.019635343618513, "grad_norm": 0.30045759677886963, "learning_rate": 4.670612972824756e-06, "loss": 0.249, "step": 5732 }, { "epoch": 4.020336605890603, "grad_norm": 0.25900301337242126, "learning_rate": 4.664149402560252e-06, "loss": 0.0775, "step": 5733 }, { "epoch": 4.021037868162693, "grad_norm": 0.24953363835811615, "learning_rate": 4.657689847644322e-06, "loss": 0.0883, "step": 5734 }, { "epoch": 4.021739130434782, "grad_norm": 1.3077113628387451, "learning_rate": 4.651234309352429e-06, "loss": 0.5844, "step": 5735 }, { "epoch": 4.022440392706873, "grad_norm": 0.27235254645347595, "learning_rate": 4.644782788959218e-06, "loss": 0.0785, "step": 5736 }, { "epoch": 4.0231416549789625, "grad_norm": 0.255033940076828, "learning_rate": 4.638335287738565e-06, "loss": 0.0839, "step": 5737 }, { "epoch": 4.023842917251052, "grad_norm": 0.7896066904067993, "learning_rate": 4.631891806963531e-06, "loss": 0.4169, "step": 5738 }, { "epoch": 4.024544179523142, "grad_norm": 0.37336984276771545, "learning_rate": 4.625452347906397e-06, "loss": 0.257, "step": 5739 }, { "epoch": 4.0252454417952315, "grad_norm": 0.2503121793270111, "learning_rate": 4.619016911838639e-06, "loss": 0.0824, "step": 5740 }, { "epoch": 4.025946704067321, "grad_norm": 0.2472364753484726, "learning_rate": 4.612585500030952e-06, "loss": 0.0886, "step": 5741 }, { "epoch": 4.026647966339411, "grad_norm": 0.24969463050365448, "learning_rate": 4.60615811375322e-06, "loss": 0.0881, "step": 5742 }, { "epoch": 4.0273492286115005, "grad_norm": 0.283337265253067, "learning_rate": 4.599734754274557e-06, "loss": 0.0694, "step": 5743 }, { "epoch": 4.02805049088359, "grad_norm": 0.24788789451122284, "learning_rate": 4.593315422863248e-06, "loss": 0.0888, "step": 5744 }, { "epoch": 4.02875175315568, "grad_norm": 0.30344533920288086, "learning_rate": 4.586900120786825e-06, "loss": 0.2505, "step": 5745 }, { "epoch": 4.02945301542777, "grad_norm": 0.25051048398017883, "learning_rate": 4.58048884931197e-06, "loss": 0.0819, "step": 5746 }, { "epoch": 4.03015427769986, "grad_norm": 0.34335172176361084, "learning_rate": 4.574081609704623e-06, "loss": 0.2339, "step": 5747 }, { "epoch": 4.03085553997195, "grad_norm": 0.2528415322303772, "learning_rate": 4.5676784032298915e-06, "loss": 0.0831, "step": 5748 }, { "epoch": 4.031556802244039, "grad_norm": 0.28541186451911926, "learning_rate": 4.561279231152107e-06, "loss": 0.2515, "step": 5749 }, { "epoch": 4.032258064516129, "grad_norm": 0.3232946991920471, "learning_rate": 4.554884094734793e-06, "loss": 0.2443, "step": 5750 }, { "epoch": 4.032959326788219, "grad_norm": 0.8110306262969971, "learning_rate": 4.548492995240686e-06, "loss": 0.4171, "step": 5751 }, { "epoch": 4.033660589060308, "grad_norm": 0.2591044008731842, "learning_rate": 4.5421059339317146e-06, "loss": 0.084, "step": 5752 }, { "epoch": 4.034361851332398, "grad_norm": 0.24069979786872864, "learning_rate": 4.5357229120690185e-06, "loss": 0.0808, "step": 5753 }, { "epoch": 4.035063113604488, "grad_norm": 0.3022318184375763, "learning_rate": 4.529343930912924e-06, "loss": 0.0621, "step": 5754 }, { "epoch": 4.035764375876578, "grad_norm": 0.2515336573123932, "learning_rate": 4.52296899172299e-06, "loss": 0.0879, "step": 5755 }, { "epoch": 4.036465638148668, "grad_norm": 0.2473820298910141, "learning_rate": 4.5165980957579425e-06, "loss": 0.0889, "step": 5756 }, { "epoch": 4.0371669004207575, "grad_norm": 0.812288224697113, "learning_rate": 4.510231244275742e-06, "loss": 0.4125, "step": 5757 }, { "epoch": 4.037868162692847, "grad_norm": 0.25042524933815, "learning_rate": 4.5038684385335175e-06, "loss": 0.0893, "step": 5758 }, { "epoch": 4.038569424964937, "grad_norm": 0.2968321144580841, "learning_rate": 4.49750967978764e-06, "loss": 0.2546, "step": 5759 }, { "epoch": 4.0392706872370265, "grad_norm": 0.2555258572101593, "learning_rate": 4.491154969293629e-06, "loss": 0.0838, "step": 5760 }, { "epoch": 4.039971949509116, "grad_norm": 0.24911461770534515, "learning_rate": 4.4848043083062505e-06, "loss": 0.0823, "step": 5761 }, { "epoch": 4.040673211781206, "grad_norm": 0.9000613689422607, "learning_rate": 4.478457698079444e-06, "loss": 0.4371, "step": 5762 }, { "epoch": 4.0413744740532955, "grad_norm": 0.25038549304008484, "learning_rate": 4.472115139866367e-06, "loss": 0.0891, "step": 5763 }, { "epoch": 4.042075736325386, "grad_norm": 0.2490226775407791, "learning_rate": 4.4657766349193605e-06, "loss": 0.0882, "step": 5764 }, { "epoch": 4.042776998597476, "grad_norm": 0.3189921975135803, "learning_rate": 4.459442184489985e-06, "loss": 0.2386, "step": 5765 }, { "epoch": 4.043478260869565, "grad_norm": 0.2593978941440582, "learning_rate": 4.453111789828981e-06, "loss": 0.083, "step": 5766 }, { "epoch": 4.044179523141655, "grad_norm": 0.24974863231182098, "learning_rate": 4.446785452186294e-06, "loss": 0.0822, "step": 5767 }, { "epoch": 4.044880785413745, "grad_norm": 0.24955852329730988, "learning_rate": 4.440463172811069e-06, "loss": 0.0882, "step": 5768 }, { "epoch": 4.045582047685834, "grad_norm": 0.2513943016529083, "learning_rate": 4.434144952951658e-06, "loss": 0.0886, "step": 5769 }, { "epoch": 4.046283309957924, "grad_norm": 0.8050342798233032, "learning_rate": 4.427830793855594e-06, "loss": 0.4117, "step": 5770 }, { "epoch": 4.046984572230014, "grad_norm": 0.24977552890777588, "learning_rate": 4.421520696769632e-06, "loss": 0.0888, "step": 5771 }, { "epoch": 4.047685834502103, "grad_norm": 0.24789050221443176, "learning_rate": 4.415214662939698e-06, "loss": 0.0816, "step": 5772 }, { "epoch": 4.048387096774194, "grad_norm": 0.2478502243757248, "learning_rate": 4.4089126936109425e-06, "loss": 0.0826, "step": 5773 }, { "epoch": 4.049088359046284, "grad_norm": 0.3190988600254059, "learning_rate": 4.4026147900276925e-06, "loss": 0.2422, "step": 5774 }, { "epoch": 4.049789621318373, "grad_norm": 0.7967240810394287, "learning_rate": 4.396320953433475e-06, "loss": 0.4187, "step": 5775 }, { "epoch": 4.050490883590463, "grad_norm": 0.24754559993743896, "learning_rate": 4.390031185071028e-06, "loss": 0.0888, "step": 5776 }, { "epoch": 4.051192145862553, "grad_norm": 0.2485278993844986, "learning_rate": 4.383745486182278e-06, "loss": 0.0889, "step": 5777 }, { "epoch": 4.051893408134642, "grad_norm": 0.24039366841316223, "learning_rate": 4.377463858008332e-06, "loss": 0.0799, "step": 5778 }, { "epoch": 4.052594670406732, "grad_norm": 0.25070691108703613, "learning_rate": 4.371186301789529e-06, "loss": 0.0819, "step": 5779 }, { "epoch": 4.053295932678822, "grad_norm": 0.26653456687927246, "learning_rate": 4.3649128187653705e-06, "loss": 0.0748, "step": 5780 }, { "epoch": 4.053997194950911, "grad_norm": 0.2484004646539688, "learning_rate": 4.358643410174565e-06, "loss": 0.0825, "step": 5781 }, { "epoch": 4.054698457223002, "grad_norm": 0.2576996088027954, "learning_rate": 4.352378077255026e-06, "loss": 0.0818, "step": 5782 }, { "epoch": 4.0553997194950915, "grad_norm": 0.25277620553970337, "learning_rate": 4.346116821243845e-06, "loss": 0.0873, "step": 5783 }, { "epoch": 4.056100981767181, "grad_norm": 0.25108903646469116, "learning_rate": 4.339859643377331e-06, "loss": 0.088, "step": 5784 }, { "epoch": 4.056802244039271, "grad_norm": 0.3068084716796875, "learning_rate": 4.333606544890959e-06, "loss": 0.2501, "step": 5785 }, { "epoch": 4.0575035063113605, "grad_norm": 0.41084355115890503, "learning_rate": 4.327357527019427e-06, "loss": 0.2549, "step": 5786 }, { "epoch": 4.05820476858345, "grad_norm": 0.295719712972641, "learning_rate": 4.321112590996609e-06, "loss": 0.2538, "step": 5787 }, { "epoch": 4.05890603085554, "grad_norm": 0.25174078345298767, "learning_rate": 4.314871738055579e-06, "loss": 0.0882, "step": 5788 }, { "epoch": 4.0596072931276295, "grad_norm": 0.25598081946372986, "learning_rate": 4.308634969428596e-06, "loss": 0.0815, "step": 5789 }, { "epoch": 4.060308555399719, "grad_norm": 0.2454797625541687, "learning_rate": 4.302402286347135e-06, "loss": 0.0812, "step": 5790 }, { "epoch": 4.06100981767181, "grad_norm": 0.26130932569503784, "learning_rate": 4.2961736900418355e-06, "loss": 0.0734, "step": 5791 }, { "epoch": 4.061711079943899, "grad_norm": 0.8142228722572327, "learning_rate": 4.289949181742559e-06, "loss": 0.4143, "step": 5792 }, { "epoch": 4.062412342215989, "grad_norm": 0.3045930564403534, "learning_rate": 4.283728762678332e-06, "loss": 0.2498, "step": 5793 }, { "epoch": 4.063113604488079, "grad_norm": 0.24579106271266937, "learning_rate": 4.2775124340774045e-06, "loss": 0.0879, "step": 5794 }, { "epoch": 4.063814866760168, "grad_norm": 0.8143286108970642, "learning_rate": 4.2713001971671804e-06, "loss": 0.4212, "step": 5795 }, { "epoch": 4.064516129032258, "grad_norm": 0.29954802989959717, "learning_rate": 4.265092053174291e-06, "loss": 0.2553, "step": 5796 }, { "epoch": 4.065217391304348, "grad_norm": 0.24847067892551422, "learning_rate": 4.258888003324532e-06, "loss": 0.0817, "step": 5797 }, { "epoch": 4.065918653576437, "grad_norm": 0.24352017045021057, "learning_rate": 4.25268804884292e-06, "loss": 0.0807, "step": 5798 }, { "epoch": 4.066619915848527, "grad_norm": 0.29581987857818604, "learning_rate": 4.24649219095363e-06, "loss": 0.2505, "step": 5799 }, { "epoch": 4.067321178120617, "grad_norm": 0.35044682025909424, "learning_rate": 4.240300430880062e-06, "loss": 0.2349, "step": 5800 }, { "epoch": 4.068022440392707, "grad_norm": 0.2492382675409317, "learning_rate": 4.2341127698447806e-06, "loss": 0.0888, "step": 5801 }, { "epoch": 4.068723702664797, "grad_norm": 0.8296859264373779, "learning_rate": 4.2279292090695475e-06, "loss": 0.4068, "step": 5802 }, { "epoch": 4.0694249649368865, "grad_norm": 4.074202060699463, "learning_rate": 4.221749749775317e-06, "loss": 0.5047, "step": 5803 }, { "epoch": 4.070126227208976, "grad_norm": 0.24869957566261292, "learning_rate": 4.215574393182242e-06, "loss": 0.0873, "step": 5804 }, { "epoch": 4.070827489481066, "grad_norm": 0.2500712275505066, "learning_rate": 4.209403140509646e-06, "loss": 0.088, "step": 5805 }, { "epoch": 4.0715287517531555, "grad_norm": 0.2501826286315918, "learning_rate": 4.203235992976065e-06, "loss": 0.0877, "step": 5806 }, { "epoch": 4.072230014025245, "grad_norm": 0.255280077457428, "learning_rate": 4.197072951799211e-06, "loss": 0.0828, "step": 5807 }, { "epoch": 4.072931276297335, "grad_norm": 0.24778547883033752, "learning_rate": 4.1909140181959795e-06, "loss": 0.0823, "step": 5808 }, { "epoch": 4.0736325385694245, "grad_norm": 0.3038765490055084, "learning_rate": 4.184759193382462e-06, "loss": 0.2559, "step": 5809 }, { "epoch": 4.074333800841515, "grad_norm": 0.25027957558631897, "learning_rate": 4.1786084785739535e-06, "loss": 0.0884, "step": 5810 }, { "epoch": 4.075035063113605, "grad_norm": 0.2594670355319977, "learning_rate": 4.1724618749849035e-06, "loss": 0.0749, "step": 5811 }, { "epoch": 4.075736325385694, "grad_norm": 0.2519768476486206, "learning_rate": 4.166319383828987e-06, "loss": 0.0873, "step": 5812 }, { "epoch": 4.076437587657784, "grad_norm": 0.248381569981575, "learning_rate": 4.160181006319034e-06, "loss": 0.0805, "step": 5813 }, { "epoch": 4.077138849929874, "grad_norm": 0.2501336634159088, "learning_rate": 4.1540467436670946e-06, "loss": 0.0826, "step": 5814 }, { "epoch": 4.077840112201963, "grad_norm": 0.2658451199531555, "learning_rate": 4.147916597084378e-06, "loss": 0.0756, "step": 5815 }, { "epoch": 4.078541374474053, "grad_norm": 0.2538871169090271, "learning_rate": 4.141790567781295e-06, "loss": 0.0818, "step": 5816 }, { "epoch": 4.079242636746143, "grad_norm": 0.3047396242618561, "learning_rate": 4.135668656967434e-06, "loss": 0.2466, "step": 5817 }, { "epoch": 4.079943899018232, "grad_norm": 2.886746406555176, "learning_rate": 4.129550865851592e-06, "loss": 0.4062, "step": 5818 }, { "epoch": 4.080645161290323, "grad_norm": 0.3437857925891876, "learning_rate": 4.123437195641719e-06, "loss": 0.2378, "step": 5819 }, { "epoch": 4.081346423562413, "grad_norm": 0.2459256500005722, "learning_rate": 4.117327647544986e-06, "loss": 0.0722, "step": 5820 }, { "epoch": 4.082047685834502, "grad_norm": 0.2599603831768036, "learning_rate": 4.111222222767727e-06, "loss": 0.0787, "step": 5821 }, { "epoch": 4.082748948106592, "grad_norm": 1.3375189304351807, "learning_rate": 4.105120922515462e-06, "loss": 0.5873, "step": 5822 }, { "epoch": 4.083450210378682, "grad_norm": 0.24948474764823914, "learning_rate": 4.0990237479929165e-06, "loss": 0.0889, "step": 5823 }, { "epoch": 4.084151472650771, "grad_norm": 0.3324835002422333, "learning_rate": 4.092930700403974e-06, "loss": 0.2435, "step": 5824 }, { "epoch": 4.084852734922861, "grad_norm": 0.24932758510112762, "learning_rate": 4.08684178095173e-06, "loss": 0.088, "step": 5825 }, { "epoch": 4.085553997194951, "grad_norm": 0.2508793771266937, "learning_rate": 4.08075699083845e-06, "loss": 0.0884, "step": 5826 }, { "epoch": 4.08625525946704, "grad_norm": 0.2523331046104431, "learning_rate": 4.074676331265573e-06, "loss": 0.0745, "step": 5827 }, { "epoch": 4.086956521739131, "grad_norm": 0.2468249648809433, "learning_rate": 4.068599803433753e-06, "loss": 0.0815, "step": 5828 }, { "epoch": 4.0876577840112205, "grad_norm": 0.24983416497707367, "learning_rate": 4.0625274085428015e-06, "loss": 0.0812, "step": 5829 }, { "epoch": 4.08835904628331, "grad_norm": 0.30029353499412537, "learning_rate": 4.056459147791722e-06, "loss": 0.2555, "step": 5830 }, { "epoch": 4.0890603085554, "grad_norm": 4.239546298980713, "learning_rate": 4.050395022378709e-06, "loss": 0.5076, "step": 5831 }, { "epoch": 4.0897615708274895, "grad_norm": 0.30428382754325867, "learning_rate": 4.044335033501126e-06, "loss": 0.2475, "step": 5832 }, { "epoch": 4.090462833099579, "grad_norm": 0.2493182271718979, "learning_rate": 4.0382791823555396e-06, "loss": 0.0864, "step": 5833 }, { "epoch": 4.091164095371669, "grad_norm": 0.2503967583179474, "learning_rate": 4.032227470137676e-06, "loss": 0.0815, "step": 5834 }, { "epoch": 4.0918653576437585, "grad_norm": 0.24458736181259155, "learning_rate": 4.0261798980424755e-06, "loss": 0.0732, "step": 5835 }, { "epoch": 4.092566619915848, "grad_norm": 0.28488343954086304, "learning_rate": 4.020136467264016e-06, "loss": 0.0691, "step": 5836 }, { "epoch": 4.093267882187939, "grad_norm": 4.134540557861328, "learning_rate": 4.014097178995599e-06, "loss": 0.674, "step": 5837 }, { "epoch": 4.093969144460028, "grad_norm": 0.30438366532325745, "learning_rate": 4.008062034429685e-06, "loss": 0.2559, "step": 5838 }, { "epoch": 4.094670406732118, "grad_norm": 0.283843994140625, "learning_rate": 4.002031034757933e-06, "loss": 0.0679, "step": 5839 }, { "epoch": 4.095371669004208, "grad_norm": 0.8302257657051086, "learning_rate": 3.996004181171164e-06, "loss": 0.4257, "step": 5840 }, { "epoch": 4.096072931276297, "grad_norm": 0.29738134145736694, "learning_rate": 3.9899814748594e-06, "loss": 0.2551, "step": 5841 }, { "epoch": 4.096774193548387, "grad_norm": 0.3099164664745331, "learning_rate": 3.98396291701183e-06, "loss": 0.2595, "step": 5842 }, { "epoch": 4.097475455820477, "grad_norm": 0.3241735100746155, "learning_rate": 3.9779485088168284e-06, "loss": 0.2435, "step": 5843 }, { "epoch": 4.098176718092566, "grad_norm": 0.2968786060810089, "learning_rate": 3.9719382514619455e-06, "loss": 0.2537, "step": 5844 }, { "epoch": 4.098877980364656, "grad_norm": 0.2473820596933365, "learning_rate": 3.965932146133927e-06, "loss": 0.0813, "step": 5845 }, { "epoch": 4.099579242636747, "grad_norm": 0.2516311705112457, "learning_rate": 3.95993019401868e-06, "loss": 0.0821, "step": 5846 }, { "epoch": 4.100280504908836, "grad_norm": 0.2503243088722229, "learning_rate": 3.953932396301307e-06, "loss": 0.0819, "step": 5847 }, { "epoch": 4.100981767180926, "grad_norm": 4.187382221221924, "learning_rate": 3.947938754166075e-06, "loss": 0.6441, "step": 5848 }, { "epoch": 4.1016830294530155, "grad_norm": 0.24899664521217346, "learning_rate": 3.941949268796457e-06, "loss": 0.0879, "step": 5849 }, { "epoch": 4.102384291725105, "grad_norm": 0.30890583992004395, "learning_rate": 3.935963941375065e-06, "loss": 0.2507, "step": 5850 }, { "epoch": 4.103085553997195, "grad_norm": 0.8101698160171509, "learning_rate": 3.929982773083724e-06, "loss": 0.4217, "step": 5851 }, { "epoch": 4.1037868162692845, "grad_norm": 0.2528694272041321, "learning_rate": 3.92400576510342e-06, "loss": 0.0825, "step": 5852 }, { "epoch": 4.104488078541374, "grad_norm": 0.2974216639995575, "learning_rate": 3.918032918614331e-06, "loss": 0.2578, "step": 5853 }, { "epoch": 4.105189340813464, "grad_norm": 0.250931054353714, "learning_rate": 3.912064234795795e-06, "loss": 0.0884, "step": 5854 }, { "epoch": 4.105890603085554, "grad_norm": 0.24701976776123047, "learning_rate": 3.906099714826353e-06, "loss": 0.0815, "step": 5855 }, { "epoch": 4.106591865357644, "grad_norm": 0.24551080167293549, "learning_rate": 3.900139359883703e-06, "loss": 0.081, "step": 5856 }, { "epoch": 4.107293127629734, "grad_norm": 0.29384931921958923, "learning_rate": 3.894183171144727e-06, "loss": 0.2445, "step": 5857 }, { "epoch": 4.107994389901823, "grad_norm": 0.26297104358673096, "learning_rate": 3.888231149785476e-06, "loss": 0.0755, "step": 5858 }, { "epoch": 4.108695652173913, "grad_norm": 0.30656152963638306, "learning_rate": 3.882283296981204e-06, "loss": 0.2465, "step": 5859 }, { "epoch": 4.109396914446003, "grad_norm": 0.24993754923343658, "learning_rate": 3.8763396139063075e-06, "loss": 0.0821, "step": 5860 }, { "epoch": 4.110098176718092, "grad_norm": 0.2976532280445099, "learning_rate": 3.870400101734395e-06, "loss": 0.2559, "step": 5861 }, { "epoch": 4.110799438990182, "grad_norm": 0.25019851326942444, "learning_rate": 3.864464761638214e-06, "loss": 0.0882, "step": 5862 }, { "epoch": 4.111500701262272, "grad_norm": 0.2991834282875061, "learning_rate": 3.858533594789726e-06, "loss": 0.2555, "step": 5863 }, { "epoch": 4.112201963534362, "grad_norm": 0.24843671917915344, "learning_rate": 3.85260660236004e-06, "loss": 0.0889, "step": 5864 }, { "epoch": 4.112903225806452, "grad_norm": 0.3029654622077942, "learning_rate": 3.8466837855194505e-06, "loss": 0.2496, "step": 5865 }, { "epoch": 4.113604488078542, "grad_norm": 0.29882362484931946, "learning_rate": 3.8407651454374246e-06, "loss": 0.2471, "step": 5866 }, { "epoch": 4.114305750350631, "grad_norm": 0.28849726915359497, "learning_rate": 3.8348506832826204e-06, "loss": 0.2536, "step": 5867 }, { "epoch": 4.115007012622721, "grad_norm": 0.24875859916210175, "learning_rate": 3.828940400222844e-06, "loss": 0.088, "step": 5868 }, { "epoch": 4.115708274894811, "grad_norm": 0.2990838885307312, "learning_rate": 3.8230342974251045e-06, "loss": 0.2548, "step": 5869 }, { "epoch": 4.1164095371669, "grad_norm": 0.24937130510807037, "learning_rate": 3.817132376055565e-06, "loss": 0.0882, "step": 5870 }, { "epoch": 4.11711079943899, "grad_norm": 0.24886859953403473, "learning_rate": 3.8112346372795647e-06, "loss": 0.0891, "step": 5871 }, { "epoch": 4.11781206171108, "grad_norm": 0.2544393837451935, "learning_rate": 3.805341082261635e-06, "loss": 0.0826, "step": 5872 }, { "epoch": 4.118513323983169, "grad_norm": 0.24877671897411346, "learning_rate": 3.7994517121654556e-06, "loss": 0.089, "step": 5873 }, { "epoch": 4.11921458625526, "grad_norm": 0.32561951875686646, "learning_rate": 3.793566528153902e-06, "loss": 0.2396, "step": 5874 }, { "epoch": 4.1199158485273495, "grad_norm": 0.2531135082244873, "learning_rate": 3.7876855313890154e-06, "loss": 0.0831, "step": 5875 }, { "epoch": 4.120617110799439, "grad_norm": 4.768926620483398, "learning_rate": 3.781808723031993e-06, "loss": 0.5155, "step": 5876 }, { "epoch": 4.121318373071529, "grad_norm": 0.2496476024389267, "learning_rate": 3.7759361042432385e-06, "loss": 0.0813, "step": 5877 }, { "epoch": 4.1220196353436185, "grad_norm": 0.24750098586082458, "learning_rate": 3.770067676182304e-06, "loss": 0.0889, "step": 5878 }, { "epoch": 4.122720897615708, "grad_norm": 0.3002033531665802, "learning_rate": 3.7642034400079156e-06, "loss": 0.2477, "step": 5879 }, { "epoch": 4.123422159887798, "grad_norm": 0.30816519260406494, "learning_rate": 3.7583433968779864e-06, "loss": 0.2484, "step": 5880 }, { "epoch": 4.1241234221598875, "grad_norm": 0.24901530146598816, "learning_rate": 3.752487547949582e-06, "loss": 0.0896, "step": 5881 }, { "epoch": 4.124824684431977, "grad_norm": 0.3244262635707855, "learning_rate": 3.746635894378958e-06, "loss": 0.2602, "step": 5882 }, { "epoch": 4.125525946704068, "grad_norm": 0.2522665560245514, "learning_rate": 3.7407884373215266e-06, "loss": 0.088, "step": 5883 }, { "epoch": 4.126227208976157, "grad_norm": 0.2420736700296402, "learning_rate": 3.7349451779318924e-06, "loss": 0.08, "step": 5884 }, { "epoch": 4.126928471248247, "grad_norm": 0.25346246361732483, "learning_rate": 3.7291061173637952e-06, "loss": 0.0742, "step": 5885 }, { "epoch": 4.127629733520337, "grad_norm": 0.25338229537010193, "learning_rate": 3.7232712567701862e-06, "loss": 0.0834, "step": 5886 }, { "epoch": 4.128330995792426, "grad_norm": 0.29692599177360535, "learning_rate": 3.717440597303154e-06, "loss": 0.2561, "step": 5887 }, { "epoch": 4.129032258064516, "grad_norm": 0.24966026842594147, "learning_rate": 3.7116141401139863e-06, "loss": 0.0822, "step": 5888 }, { "epoch": 4.129733520336606, "grad_norm": 0.2511516213417053, "learning_rate": 3.7057918863531134e-06, "loss": 0.0829, "step": 5889 }, { "epoch": 4.130434782608695, "grad_norm": 0.2497526854276657, "learning_rate": 3.699973837170162e-06, "loss": 0.0867, "step": 5890 }, { "epoch": 4.131136044880785, "grad_norm": 0.3021191656589508, "learning_rate": 3.694159993713911e-06, "loss": 0.2485, "step": 5891 }, { "epoch": 4.131837307152876, "grad_norm": 0.24953016638755798, "learning_rate": 3.6883503571323116e-06, "loss": 0.0888, "step": 5892 }, { "epoch": 4.132538569424965, "grad_norm": 0.28925609588623047, "learning_rate": 3.682544928572482e-06, "loss": 0.2495, "step": 5893 }, { "epoch": 4.133239831697055, "grad_norm": 0.2644340693950653, "learning_rate": 3.676743709180727e-06, "loss": 0.0764, "step": 5894 }, { "epoch": 4.1339410939691446, "grad_norm": 0.2658447325229645, "learning_rate": 3.6709467001024925e-06, "loss": 0.0773, "step": 5895 }, { "epoch": 4.134642356241234, "grad_norm": 0.2490805983543396, "learning_rate": 3.665153902482421e-06, "loss": 0.0891, "step": 5896 }, { "epoch": 4.135343618513324, "grad_norm": 5.890732288360596, "learning_rate": 3.6593653174643027e-06, "loss": 0.6274, "step": 5897 }, { "epoch": 4.1360448807854135, "grad_norm": 0.2614748179912567, "learning_rate": 3.6535809461911124e-06, "loss": 0.0751, "step": 5898 }, { "epoch": 4.136746143057503, "grad_norm": 0.33268433809280396, "learning_rate": 3.64780078980497e-06, "loss": 0.2444, "step": 5899 }, { "epoch": 4.137447405329593, "grad_norm": 0.25172853469848633, "learning_rate": 3.642024849447187e-06, "loss": 0.0806, "step": 5900 }, { "epoch": 4.138148667601683, "grad_norm": 0.24851657450199127, "learning_rate": 3.6362531262582267e-06, "loss": 0.0886, "step": 5901 }, { "epoch": 4.138849929873773, "grad_norm": 0.24859291315078735, "learning_rate": 3.6304856213777357e-06, "loss": 0.0807, "step": 5902 }, { "epoch": 4.139551192145863, "grad_norm": 0.2475435882806778, "learning_rate": 3.6247223359445053e-06, "loss": 0.0883, "step": 5903 }, { "epoch": 4.140252454417952, "grad_norm": 0.25150784850120544, "learning_rate": 3.6189632710965176e-06, "loss": 0.0817, "step": 5904 }, { "epoch": 4.140953716690042, "grad_norm": 0.2559283971786499, "learning_rate": 3.613208427970907e-06, "loss": 0.0745, "step": 5905 }, { "epoch": 4.141654978962132, "grad_norm": 0.247567817568779, "learning_rate": 3.6074578077039734e-06, "loss": 0.0806, "step": 5906 }, { "epoch": 4.142356241234221, "grad_norm": 0.24891328811645508, "learning_rate": 3.60171141143118e-06, "loss": 0.0888, "step": 5907 }, { "epoch": 4.143057503506311, "grad_norm": 0.2474217563867569, "learning_rate": 3.5959692402871782e-06, "loss": 0.0807, "step": 5908 }, { "epoch": 4.143758765778401, "grad_norm": 0.24892878532409668, "learning_rate": 3.5902312954057577e-06, "loss": 0.088, "step": 5909 }, { "epoch": 4.144460028050491, "grad_norm": 0.24796482920646667, "learning_rate": 3.584497577919893e-06, "loss": 0.0879, "step": 5910 }, { "epoch": 4.145161290322581, "grad_norm": 0.2487564980983734, "learning_rate": 3.578768088961709e-06, "loss": 0.0809, "step": 5911 }, { "epoch": 4.145862552594671, "grad_norm": 3.792659282684326, "learning_rate": 3.5730428296625125e-06, "loss": 0.6268, "step": 5912 }, { "epoch": 4.14656381486676, "grad_norm": 0.2874276340007782, "learning_rate": 3.5673218011527593e-06, "loss": 0.2496, "step": 5913 }, { "epoch": 4.14726507713885, "grad_norm": 0.2978585660457611, "learning_rate": 3.5616050045620795e-06, "loss": 0.2524, "step": 5914 }, { "epoch": 4.14796633941094, "grad_norm": 0.3069474995136261, "learning_rate": 3.5558924410192566e-06, "loss": 0.2464, "step": 5915 }, { "epoch": 4.148667601683029, "grad_norm": 0.24819837510585785, "learning_rate": 3.5501841116522577e-06, "loss": 0.0877, "step": 5916 }, { "epoch": 4.149368863955119, "grad_norm": 0.24551431834697723, "learning_rate": 3.5444800175881894e-06, "loss": 0.0806, "step": 5917 }, { "epoch": 4.150070126227209, "grad_norm": 0.25561073422431946, "learning_rate": 3.5387801599533475e-06, "loss": 0.0828, "step": 5918 }, { "epoch": 4.150771388499299, "grad_norm": 0.2609238624572754, "learning_rate": 3.5330845398731745e-06, "loss": 0.0742, "step": 5919 }, { "epoch": 4.151472650771389, "grad_norm": 0.30486226081848145, "learning_rate": 3.5273931584722724e-06, "loss": 0.2581, "step": 5920 }, { "epoch": 4.1521739130434785, "grad_norm": 0.25384432077407837, "learning_rate": 3.5217060168744293e-06, "loss": 0.073, "step": 5921 }, { "epoch": 4.152875175315568, "grad_norm": 0.24935638904571533, "learning_rate": 3.516023116202563e-06, "loss": 0.0874, "step": 5922 }, { "epoch": 4.153576437587658, "grad_norm": 0.3002772629261017, "learning_rate": 3.51034445757879e-06, "loss": 0.2544, "step": 5923 }, { "epoch": 4.1542776998597475, "grad_norm": 0.8076561689376831, "learning_rate": 3.504670042124361e-06, "loss": 0.4239, "step": 5924 }, { "epoch": 4.154978962131837, "grad_norm": 3.6259467601776123, "learning_rate": 3.498999870959696e-06, "loss": 0.4487, "step": 5925 }, { "epoch": 4.155680224403927, "grad_norm": 0.2508493959903717, "learning_rate": 3.493333945204391e-06, "loss": 0.0866, "step": 5926 }, { "epoch": 4.1563814866760165, "grad_norm": 3.648942232131958, "learning_rate": 3.487672265977185e-06, "loss": 0.4786, "step": 5927 }, { "epoch": 4.157082748948106, "grad_norm": 0.2497093379497528, "learning_rate": 3.482014834395983e-06, "loss": 0.0821, "step": 5928 }, { "epoch": 4.157784011220197, "grad_norm": 0.24763254821300507, "learning_rate": 3.476361651577867e-06, "loss": 0.0884, "step": 5929 }, { "epoch": 4.158485273492286, "grad_norm": 0.29862886667251587, "learning_rate": 3.470712718639052e-06, "loss": 0.2552, "step": 5930 }, { "epoch": 4.159186535764376, "grad_norm": 0.2590291202068329, "learning_rate": 3.4650680366949433e-06, "loss": 0.0748, "step": 5931 }, { "epoch": 4.159887798036466, "grad_norm": 0.8064046502113342, "learning_rate": 3.4594276068600885e-06, "loss": 0.4218, "step": 5932 }, { "epoch": 4.160589060308555, "grad_norm": 0.24757632613182068, "learning_rate": 3.4537914302481962e-06, "loss": 0.0881, "step": 5933 }, { "epoch": 4.161290322580645, "grad_norm": 0.2510097622871399, "learning_rate": 3.44815950797214e-06, "loss": 0.0807, "step": 5934 }, { "epoch": 4.161991584852735, "grad_norm": 0.24634169042110443, "learning_rate": 3.442531841143956e-06, "loss": 0.0811, "step": 5935 }, { "epoch": 4.162692847124824, "grad_norm": 0.4487155079841614, "learning_rate": 3.436908430874833e-06, "loss": 0.2695, "step": 5936 }, { "epoch": 4.163394109396915, "grad_norm": 0.2526090443134308, "learning_rate": 3.4312892782751294e-06, "loss": 0.0811, "step": 5937 }, { "epoch": 4.164095371669005, "grad_norm": 0.24850957095623016, "learning_rate": 3.425674384454347e-06, "loss": 0.0819, "step": 5938 }, { "epoch": 4.164796633941094, "grad_norm": 0.24836300313472748, "learning_rate": 3.420063750521174e-06, "loss": 0.0814, "step": 5939 }, { "epoch": 4.165497896213184, "grad_norm": 0.27393627166748047, "learning_rate": 3.4144573775834134e-06, "loss": 0.0744, "step": 5940 }, { "epoch": 4.166199158485274, "grad_norm": 0.2525957226753235, "learning_rate": 3.4088552667480735e-06, "loss": 0.0859, "step": 5941 }, { "epoch": 4.166900420757363, "grad_norm": 0.24793075025081635, "learning_rate": 3.403257419121289e-06, "loss": 0.0884, "step": 5942 }, { "epoch": 4.167601683029453, "grad_norm": 0.250102162361145, "learning_rate": 3.3976638358083764e-06, "loss": 0.0826, "step": 5943 }, { "epoch": 4.1683029453015426, "grad_norm": 0.3106345534324646, "learning_rate": 3.3920745179137845e-06, "loss": 0.2545, "step": 5944 }, { "epoch": 4.169004207573632, "grad_norm": 0.2443302720785141, "learning_rate": 3.3864894665411454e-06, "loss": 0.0809, "step": 5945 }, { "epoch": 4.169705469845722, "grad_norm": 0.23910538852214813, "learning_rate": 3.380908682793235e-06, "loss": 0.0801, "step": 5946 }, { "epoch": 4.170406732117812, "grad_norm": 0.24919860064983368, "learning_rate": 3.375332167771983e-06, "loss": 0.0735, "step": 5947 }, { "epoch": 4.171107994389902, "grad_norm": 0.2512182295322418, "learning_rate": 3.3697599225784803e-06, "loss": 0.0883, "step": 5948 }, { "epoch": 4.171809256661992, "grad_norm": 0.3018202483654022, "learning_rate": 3.364191948312989e-06, "loss": 0.2558, "step": 5949 }, { "epoch": 4.172510518934081, "grad_norm": 0.31503576040267944, "learning_rate": 3.358628246074899e-06, "loss": 0.2492, "step": 5950 }, { "epoch": 4.173211781206171, "grad_norm": 0.31838706135749817, "learning_rate": 3.3530688169627873e-06, "loss": 0.2511, "step": 5951 }, { "epoch": 4.173913043478261, "grad_norm": 0.24688784778118134, "learning_rate": 3.3475136620743614e-06, "loss": 0.0879, "step": 5952 }, { "epoch": 4.17461430575035, "grad_norm": 0.2533946931362152, "learning_rate": 3.3419627825065107e-06, "loss": 0.0812, "step": 5953 }, { "epoch": 4.17531556802244, "grad_norm": 0.24618199467658997, "learning_rate": 3.336416179355245e-06, "loss": 0.0878, "step": 5954 }, { "epoch": 4.17601683029453, "grad_norm": 0.3148205578327179, "learning_rate": 3.3308738537157687e-06, "loss": 0.2499, "step": 5955 }, { "epoch": 4.17671809256662, "grad_norm": 0.3244711458683014, "learning_rate": 3.3253358066824115e-06, "loss": 0.2384, "step": 5956 }, { "epoch": 4.17741935483871, "grad_norm": 0.2480463683605194, "learning_rate": 3.3198020393486797e-06, "loss": 0.0878, "step": 5957 }, { "epoch": 4.1781206171108, "grad_norm": 0.2489292174577713, "learning_rate": 3.3142725528072153e-06, "loss": 0.0863, "step": 5958 }, { "epoch": 4.178821879382889, "grad_norm": 0.2485436052083969, "learning_rate": 3.3087473481498376e-06, "loss": 0.0874, "step": 5959 }, { "epoch": 4.179523141654979, "grad_norm": 0.3239317238330841, "learning_rate": 3.303226426467498e-06, "loss": 0.2449, "step": 5960 }, { "epoch": 4.180224403927069, "grad_norm": 0.4583340585231781, "learning_rate": 3.297709788850317e-06, "loss": 0.2669, "step": 5961 }, { "epoch": 4.180925666199158, "grad_norm": 0.25026005506515503, "learning_rate": 3.292197436387556e-06, "loss": 0.0865, "step": 5962 }, { "epoch": 4.181626928471248, "grad_norm": 0.25016841292381287, "learning_rate": 3.2866893701676503e-06, "loss": 0.0811, "step": 5963 }, { "epoch": 4.182328190743338, "grad_norm": 0.24390393495559692, "learning_rate": 3.281185591278163e-06, "loss": 0.0807, "step": 5964 }, { "epoch": 4.183029453015428, "grad_norm": 0.9222164154052734, "learning_rate": 3.2756861008058415e-06, "loss": 0.4276, "step": 5965 }, { "epoch": 4.183730715287518, "grad_norm": 0.3034336566925049, "learning_rate": 3.270190899836553e-06, "loss": 0.2563, "step": 5966 }, { "epoch": 4.1844319775596075, "grad_norm": 0.42723846435546875, "learning_rate": 3.2646999894553472e-06, "loss": 0.2639, "step": 5967 }, { "epoch": 4.185133239831697, "grad_norm": 0.24713203310966492, "learning_rate": 3.259213370746411e-06, "loss": 0.0871, "step": 5968 }, { "epoch": 4.185834502103787, "grad_norm": 0.31520864367485046, "learning_rate": 3.2537310447930763e-06, "loss": 0.2486, "step": 5969 }, { "epoch": 4.1865357643758765, "grad_norm": 0.23675750195980072, "learning_rate": 3.2482530126778514e-06, "loss": 0.0797, "step": 5970 }, { "epoch": 4.187237026647966, "grad_norm": 0.31575673818588257, "learning_rate": 3.2427792754823742e-06, "loss": 0.2495, "step": 5971 }, { "epoch": 4.187938288920056, "grad_norm": 0.4598257541656494, "learning_rate": 3.2373098342874527e-06, "loss": 0.2625, "step": 5972 }, { "epoch": 4.1886395511921455, "grad_norm": 3.5431504249572754, "learning_rate": 3.2318446901730304e-06, "loss": 0.4594, "step": 5973 }, { "epoch": 4.189340813464236, "grad_norm": 3.542818784713745, "learning_rate": 3.226383844218209e-06, "loss": 0.444, "step": 5974 }, { "epoch": 4.190042075736326, "grad_norm": 0.9176009893417358, "learning_rate": 3.2209272975012387e-06, "loss": 0.4307, "step": 5975 }, { "epoch": 4.190743338008415, "grad_norm": 0.24541908502578735, "learning_rate": 3.2154750510995367e-06, "loss": 0.0814, "step": 5976 }, { "epoch": 4.191444600280505, "grad_norm": 3.6178035736083984, "learning_rate": 3.2100271060896415e-06, "loss": 0.7542, "step": 5977 }, { "epoch": 4.192145862552595, "grad_norm": 0.2910712659358978, "learning_rate": 3.2045834635472756e-06, "loss": 0.2547, "step": 5978 }, { "epoch": 4.192847124824684, "grad_norm": 0.251441091299057, "learning_rate": 3.199144124547282e-06, "loss": 0.0826, "step": 5979 }, { "epoch": 4.193548387096774, "grad_norm": 0.2505572736263275, "learning_rate": 3.1937090901636795e-06, "loss": 0.0857, "step": 5980 }, { "epoch": 4.194249649368864, "grad_norm": 0.2594067454338074, "learning_rate": 3.188278361469621e-06, "loss": 0.0831, "step": 5981 }, { "epoch": 4.194950911640953, "grad_norm": 0.26762205362319946, "learning_rate": 3.1828519395374097e-06, "loss": 0.0772, "step": 5982 }, { "epoch": 4.195652173913044, "grad_norm": 0.2523270845413208, "learning_rate": 3.1774298254384966e-06, "loss": 0.0824, "step": 5983 }, { "epoch": 4.196353436185134, "grad_norm": 0.2980082631111145, "learning_rate": 3.172012020243503e-06, "loss": 0.2544, "step": 5984 }, { "epoch": 4.197054698457223, "grad_norm": 0.24729079008102417, "learning_rate": 3.166598525022166e-06, "loss": 0.0893, "step": 5985 }, { "epoch": 4.197755960729313, "grad_norm": 0.2465604692697525, "learning_rate": 3.161189340843407e-06, "loss": 0.0815, "step": 5986 }, { "epoch": 4.198457223001403, "grad_norm": 0.8046618103981018, "learning_rate": 3.1557844687752638e-06, "loss": 0.4095, "step": 5987 }, { "epoch": 4.199158485273492, "grad_norm": 0.2478058785200119, "learning_rate": 3.1503839098849545e-06, "loss": 0.0882, "step": 5988 }, { "epoch": 4.199859747545582, "grad_norm": 0.30591529607772827, "learning_rate": 3.1449876652388104e-06, "loss": 0.2495, "step": 5989 }, { "epoch": 4.2005610098176716, "grad_norm": 1.312147855758667, "learning_rate": 3.13959573590234e-06, "loss": 0.5901, "step": 5990 }, { "epoch": 4.201262272089761, "grad_norm": 0.264967679977417, "learning_rate": 3.1342081229401825e-06, "loss": 0.0771, "step": 5991 }, { "epoch": 4.201963534361852, "grad_norm": 0.29831433296203613, "learning_rate": 3.128824827416138e-06, "loss": 0.2474, "step": 5992 }, { "epoch": 4.202664796633941, "grad_norm": 0.3108043074607849, "learning_rate": 3.123445850393142e-06, "loss": 0.2498, "step": 5993 }, { "epoch": 4.203366058906031, "grad_norm": 0.2941657602787018, "learning_rate": 3.118071192933289e-06, "loss": 0.257, "step": 5994 }, { "epoch": 4.204067321178121, "grad_norm": 0.2536158263683319, "learning_rate": 3.112700856097811e-06, "loss": 0.0837, "step": 5995 }, { "epoch": 4.20476858345021, "grad_norm": 0.24727113544940948, "learning_rate": 3.1073348409470905e-06, "loss": 0.0888, "step": 5996 }, { "epoch": 4.2054698457223, "grad_norm": 0.24946151673793793, "learning_rate": 3.1019731485406515e-06, "loss": 0.0882, "step": 5997 }, { "epoch": 4.20617110799439, "grad_norm": 0.24832071363925934, "learning_rate": 3.0966157799371758e-06, "loss": 0.0881, "step": 5998 }, { "epoch": 4.206872370266479, "grad_norm": 0.2624160945415497, "learning_rate": 3.0912627361944808e-06, "loss": 0.0849, "step": 5999 }, { "epoch": 4.207573632538569, "grad_norm": 0.30426353216171265, "learning_rate": 3.08591401836954e-06, "loss": 0.2479, "step": 6000 }, { "epoch": 4.208274894810659, "grad_norm": 0.2949659526348114, "learning_rate": 3.0805696275184594e-06, "loss": 0.2504, "step": 6001 }, { "epoch": 4.208976157082749, "grad_norm": 0.2652573585510254, "learning_rate": 3.075229564696511e-06, "loss": 0.0782, "step": 6002 }, { "epoch": 4.209677419354839, "grad_norm": 0.24819552898406982, "learning_rate": 3.069893830958079e-06, "loss": 0.0891, "step": 6003 }, { "epoch": 4.210378681626929, "grad_norm": 0.7998207211494446, "learning_rate": 3.0645624273567292e-06, "loss": 0.4219, "step": 6004 }, { "epoch": 4.211079943899018, "grad_norm": 0.26104968786239624, "learning_rate": 3.0592353549451473e-06, "loss": 0.0841, "step": 6005 }, { "epoch": 4.211781206171108, "grad_norm": 3.039684295654297, "learning_rate": 3.053912614775184e-06, "loss": 0.4001, "step": 6006 }, { "epoch": 4.212482468443198, "grad_norm": 0.2501271367073059, "learning_rate": 3.048594207897809e-06, "loss": 0.082, "step": 6007 }, { "epoch": 4.213183730715287, "grad_norm": 0.29163122177124023, "learning_rate": 3.0432801353631664e-06, "loss": 0.2566, "step": 6008 }, { "epoch": 4.213884992987377, "grad_norm": 0.3145267963409424, "learning_rate": 3.0379703982205186e-06, "loss": 0.2501, "step": 6009 }, { "epoch": 4.214586255259467, "grad_norm": 0.30040407180786133, "learning_rate": 3.0326649975182865e-06, "loss": 0.2473, "step": 6010 }, { "epoch": 4.215287517531557, "grad_norm": 0.2545495629310608, "learning_rate": 3.027363934304023e-06, "loss": 0.0834, "step": 6011 }, { "epoch": 4.215988779803647, "grad_norm": 0.2893745005130768, "learning_rate": 3.0220672096244405e-06, "loss": 0.2527, "step": 6012 }, { "epoch": 4.2166900420757365, "grad_norm": 0.2546980679035187, "learning_rate": 3.0167748245253813e-06, "loss": 0.0833, "step": 6013 }, { "epoch": 4.217391304347826, "grad_norm": 0.24932225048542023, "learning_rate": 3.011486780051845e-06, "loss": 0.0828, "step": 6014 }, { "epoch": 4.218092566619916, "grad_norm": 0.24783575534820557, "learning_rate": 3.0062030772479517e-06, "loss": 0.089, "step": 6015 }, { "epoch": 4.2187938288920055, "grad_norm": 0.24586524069309235, "learning_rate": 3.0009237171569894e-06, "loss": 0.0887, "step": 6016 }, { "epoch": 4.219495091164095, "grad_norm": 0.8180702328681946, "learning_rate": 2.9956487008213715e-06, "loss": 0.415, "step": 6017 }, { "epoch": 4.220196353436185, "grad_norm": 0.24869385361671448, "learning_rate": 2.9903780292826532e-06, "loss": 0.0889, "step": 6018 }, { "epoch": 4.2208976157082745, "grad_norm": 0.8155221939086914, "learning_rate": 2.98511170358155e-06, "loss": 0.4213, "step": 6019 }, { "epoch": 4.221598877980365, "grad_norm": 0.29073402285575867, "learning_rate": 2.9798497247578965e-06, "loss": 0.2524, "step": 6020 }, { "epoch": 4.222300140252455, "grad_norm": 3.0879108905792236, "learning_rate": 2.9745920938506895e-06, "loss": 0.4127, "step": 6021 }, { "epoch": 4.223001402524544, "grad_norm": 0.2469778060913086, "learning_rate": 2.969338811898048e-06, "loss": 0.089, "step": 6022 }, { "epoch": 4.223702664796634, "grad_norm": 0.30015406012535095, "learning_rate": 2.9640898799372487e-06, "loss": 0.2469, "step": 6023 }, { "epoch": 4.224403927068724, "grad_norm": 0.24767890572547913, "learning_rate": 2.958845299004692e-06, "loss": 0.0891, "step": 6024 }, { "epoch": 4.225105189340813, "grad_norm": 0.2541695833206177, "learning_rate": 2.95360507013594e-06, "loss": 0.0837, "step": 6025 }, { "epoch": 4.225806451612903, "grad_norm": 0.30895256996154785, "learning_rate": 2.9483691943656746e-06, "loss": 0.2409, "step": 6026 }, { "epoch": 4.226507713884993, "grad_norm": 0.31033793091773987, "learning_rate": 2.9431376727277416e-06, "loss": 0.247, "step": 6027 }, { "epoch": 4.227208976157082, "grad_norm": 0.24837663769721985, "learning_rate": 2.937910506255101e-06, "loss": 0.0889, "step": 6028 }, { "epoch": 4.227910238429173, "grad_norm": 0.24470308423042297, "learning_rate": 2.932687695979877e-06, "loss": 0.0815, "step": 6029 }, { "epoch": 4.228611500701263, "grad_norm": 0.2910500466823578, "learning_rate": 2.927469242933317e-06, "loss": 0.2538, "step": 6030 }, { "epoch": 4.229312762973352, "grad_norm": 0.24182873964309692, "learning_rate": 2.9222551481458136e-06, "loss": 0.0816, "step": 6031 }, { "epoch": 4.230014025245442, "grad_norm": 0.25667500495910645, "learning_rate": 2.9170454126468932e-06, "loss": 0.0836, "step": 6032 }, { "epoch": 4.230715287517532, "grad_norm": 0.3225513696670532, "learning_rate": 2.9118400374652384e-06, "loss": 0.241, "step": 6033 }, { "epoch": 4.231416549789621, "grad_norm": 0.285481333732605, "learning_rate": 2.906639023628649e-06, "loss": 0.252, "step": 6034 }, { "epoch": 4.232117812061711, "grad_norm": 0.2707556486129761, "learning_rate": 2.9014423721640845e-06, "loss": 0.0759, "step": 6035 }, { "epoch": 4.232819074333801, "grad_norm": 2.9735546112060547, "learning_rate": 2.8962500840976183e-06, "loss": 0.4118, "step": 6036 }, { "epoch": 4.23352033660589, "grad_norm": 0.2542475163936615, "learning_rate": 2.8910621604544998e-06, "loss": 0.0837, "step": 6037 }, { "epoch": 4.234221598877981, "grad_norm": 0.801137387752533, "learning_rate": 2.885878602259065e-06, "loss": 0.4027, "step": 6038 }, { "epoch": 4.23492286115007, "grad_norm": 0.2619597911834717, "learning_rate": 2.8806994105348373e-06, "loss": 0.0857, "step": 6039 }, { "epoch": 4.23562412342216, "grad_norm": 0.2538188099861145, "learning_rate": 2.8755245863044426e-06, "loss": 0.0836, "step": 6040 }, { "epoch": 4.23632538569425, "grad_norm": 0.24954839050769806, "learning_rate": 2.8703541305896725e-06, "loss": 0.0889, "step": 6041 }, { "epoch": 4.237026647966339, "grad_norm": 0.255811482667923, "learning_rate": 2.8651880444114322e-06, "loss": 0.0824, "step": 6042 }, { "epoch": 4.237727910238429, "grad_norm": 0.24831026792526245, "learning_rate": 2.8600263287897807e-06, "loss": 0.0895, "step": 6043 }, { "epoch": 4.238429172510519, "grad_norm": 0.268655389547348, "learning_rate": 2.854868984743905e-06, "loss": 0.0774, "step": 6044 }, { "epoch": 4.239130434782608, "grad_norm": 0.2490920126438141, "learning_rate": 2.8497160132921336e-06, "loss": 0.0892, "step": 6045 }, { "epoch": 4.239831697054698, "grad_norm": 0.24695143103599548, "learning_rate": 2.8445674154519208e-06, "loss": 0.0889, "step": 6046 }, { "epoch": 4.240532959326789, "grad_norm": 0.3166784942150116, "learning_rate": 2.839423192239879e-06, "loss": 0.2414, "step": 6047 }, { "epoch": 4.241234221598878, "grad_norm": 0.29516929388046265, "learning_rate": 2.8342833446717317e-06, "loss": 0.2504, "step": 6048 }, { "epoch": 4.241935483870968, "grad_norm": 0.306128591299057, "learning_rate": 2.829147873762361e-06, "loss": 0.2487, "step": 6049 }, { "epoch": 4.242636746143058, "grad_norm": 0.24880942702293396, "learning_rate": 2.824016780525765e-06, "loss": 0.0885, "step": 6050 }, { "epoch": 4.243338008415147, "grad_norm": 0.3169955015182495, "learning_rate": 2.8188900659751023e-06, "loss": 0.239, "step": 6051 }, { "epoch": 4.244039270687237, "grad_norm": 0.24815909564495087, "learning_rate": 2.8137677311226296e-06, "loss": 0.0893, "step": 6052 }, { "epoch": 4.244740532959327, "grad_norm": 0.2892846465110779, "learning_rate": 2.8086497769797772e-06, "loss": 0.2467, "step": 6053 }, { "epoch": 4.245441795231416, "grad_norm": 0.3251498341560364, "learning_rate": 2.8035362045570834e-06, "loss": 0.2418, "step": 6054 }, { "epoch": 4.246143057503506, "grad_norm": 0.3131673336029053, "learning_rate": 2.798427014864244e-06, "loss": 0.2535, "step": 6055 }, { "epoch": 4.246844319775596, "grad_norm": 0.2970285415649414, "learning_rate": 2.7933222089100654e-06, "loss": 0.2478, "step": 6056 }, { "epoch": 4.247545582047686, "grad_norm": 0.2988693416118622, "learning_rate": 2.7882217877025107e-06, "loss": 0.2477, "step": 6057 }, { "epoch": 4.248246844319776, "grad_norm": 0.2506411671638489, "learning_rate": 2.7831257522486583e-06, "loss": 0.0892, "step": 6058 }, { "epoch": 4.2489481065918655, "grad_norm": 0.2963600158691406, "learning_rate": 2.778034103554736e-06, "loss": 0.2562, "step": 6059 }, { "epoch": 4.249649368863955, "grad_norm": 1.3287103176116943, "learning_rate": 2.772946842626087e-06, "loss": 0.5996, "step": 6060 }, { "epoch": 4.250350631136045, "grad_norm": 0.25341928005218506, "learning_rate": 2.767863970467216e-06, "loss": 0.0838, "step": 6061 }, { "epoch": 4.2510518934081345, "grad_norm": 0.3187142312526703, "learning_rate": 2.762785488081729e-06, "loss": 0.2461, "step": 6062 }, { "epoch": 4.251753155680224, "grad_norm": 0.25635024905204773, "learning_rate": 2.757711396472393e-06, "loss": 0.0832, "step": 6063 }, { "epoch": 4.252454417952314, "grad_norm": 0.2512172758579254, "learning_rate": 2.7526416966410923e-06, "loss": 0.0894, "step": 6064 }, { "epoch": 4.253155680224404, "grad_norm": 0.25963953137397766, "learning_rate": 2.7475763895888433e-06, "loss": 0.076, "step": 6065 }, { "epoch": 4.253856942496494, "grad_norm": 0.30338579416275024, "learning_rate": 2.742515476315807e-06, "loss": 0.2485, "step": 6066 }, { "epoch": 4.254558204768584, "grad_norm": 0.3131161034107208, "learning_rate": 2.7374589578212596e-06, "loss": 0.2581, "step": 6067 }, { "epoch": 4.255259467040673, "grad_norm": 0.3016390800476074, "learning_rate": 2.7324068351036317e-06, "loss": 0.2506, "step": 6068 }, { "epoch": 4.255960729312763, "grad_norm": 0.3161974847316742, "learning_rate": 2.7273591091604684e-06, "loss": 0.2422, "step": 6069 }, { "epoch": 4.256661991584853, "grad_norm": 0.253461629152298, "learning_rate": 2.7223157809884436e-06, "loss": 0.082, "step": 6070 }, { "epoch": 4.257363253856942, "grad_norm": 0.24886532127857208, "learning_rate": 2.717276851583386e-06, "loss": 0.0885, "step": 6071 }, { "epoch": 4.258064516129032, "grad_norm": 0.2622552216053009, "learning_rate": 2.712242321940231e-06, "loss": 0.0778, "step": 6072 }, { "epoch": 4.258765778401122, "grad_norm": 0.3796968460083008, "learning_rate": 2.707212193053055e-06, "loss": 0.2559, "step": 6073 }, { "epoch": 4.259467040673211, "grad_norm": 0.24993984401226044, "learning_rate": 2.7021864659150714e-06, "loss": 0.0819, "step": 6074 }, { "epoch": 4.260168302945302, "grad_norm": 0.29259738326072693, "learning_rate": 2.6971651415186093e-06, "loss": 0.2543, "step": 6075 }, { "epoch": 4.260869565217392, "grad_norm": 0.3036783039569855, "learning_rate": 2.692148220855148e-06, "loss": 0.2491, "step": 6076 }, { "epoch": 4.261570827489481, "grad_norm": 0.2730666399002075, "learning_rate": 2.6871357049152805e-06, "loss": 0.0769, "step": 6077 }, { "epoch": 4.262272089761571, "grad_norm": 0.2506813704967499, "learning_rate": 2.6821275946887463e-06, "loss": 0.0892, "step": 6078 }, { "epoch": 4.262973352033661, "grad_norm": 0.2606774866580963, "learning_rate": 2.67712389116439e-06, "loss": 0.0752, "step": 6079 }, { "epoch": 4.26367461430575, "grad_norm": 0.25004833936691284, "learning_rate": 2.672124595330214e-06, "loss": 0.0887, "step": 6080 }, { "epoch": 4.26437587657784, "grad_norm": 0.25239425897598267, "learning_rate": 2.667129708173327e-06, "loss": 0.0898, "step": 6081 }, { "epoch": 4.26507713884993, "grad_norm": 0.38535547256469727, "learning_rate": 2.6621392306799884e-06, "loss": 0.2612, "step": 6082 }, { "epoch": 4.265778401122019, "grad_norm": 0.3187364637851715, "learning_rate": 2.657153163835571e-06, "loss": 0.2427, "step": 6083 }, { "epoch": 4.26647966339411, "grad_norm": 0.8006865978240967, "learning_rate": 2.6521715086245875e-06, "loss": 0.4156, "step": 6084 }, { "epoch": 4.267180925666199, "grad_norm": 0.2562813460826874, "learning_rate": 2.647194266030667e-06, "loss": 0.0834, "step": 6085 }, { "epoch": 4.267882187938289, "grad_norm": 0.2602197527885437, "learning_rate": 2.642221437036582e-06, "loss": 0.0761, "step": 6086 }, { "epoch": 4.268583450210379, "grad_norm": 0.24701790511608124, "learning_rate": 2.637253022624217e-06, "loss": 0.0887, "step": 6087 }, { "epoch": 4.269284712482468, "grad_norm": 0.24806024134159088, "learning_rate": 2.6322890237746005e-06, "loss": 0.0897, "step": 6088 }, { "epoch": 4.269985974754558, "grad_norm": 0.25073006749153137, "learning_rate": 2.6273294414678796e-06, "loss": 0.0894, "step": 6089 }, { "epoch": 4.270687237026648, "grad_norm": 0.25115203857421875, "learning_rate": 2.622374276683337e-06, "loss": 0.0885, "step": 6090 }, { "epoch": 4.271388499298737, "grad_norm": 0.2903360426425934, "learning_rate": 2.617423530399371e-06, "loss": 0.2528, "step": 6091 }, { "epoch": 4.272089761570827, "grad_norm": 0.2526794970035553, "learning_rate": 2.6124772035935304e-06, "loss": 0.0873, "step": 6092 }, { "epoch": 4.272791023842918, "grad_norm": 0.30102822184562683, "learning_rate": 2.6075352972424526e-06, "loss": 0.2453, "step": 6093 }, { "epoch": 4.273492286115007, "grad_norm": 0.27452462911605835, "learning_rate": 2.602597812321947e-06, "loss": 0.0768, "step": 6094 }, { "epoch": 4.274193548387097, "grad_norm": 0.25933757424354553, "learning_rate": 2.5976647498069107e-06, "loss": 0.084, "step": 6095 }, { "epoch": 4.274894810659187, "grad_norm": 0.2985166013240814, "learning_rate": 2.5927361106713982e-06, "loss": 0.2565, "step": 6096 }, { "epoch": 4.275596072931276, "grad_norm": 0.2502393126487732, "learning_rate": 2.58781189588857e-06, "loss": 0.0891, "step": 6097 }, { "epoch": 4.276297335203366, "grad_norm": 0.3144022524356842, "learning_rate": 2.5828921064307288e-06, "loss": 0.2424, "step": 6098 }, { "epoch": 4.276998597475456, "grad_norm": 0.24550673365592957, "learning_rate": 2.577976743269292e-06, "loss": 0.0816, "step": 6099 }, { "epoch": 4.277699859747545, "grad_norm": 0.2432558834552765, "learning_rate": 2.5730658073748025e-06, "loss": 0.0812, "step": 6100 }, { "epoch": 4.278401122019635, "grad_norm": 0.3186744749546051, "learning_rate": 2.568159299716932e-06, "loss": 0.2587, "step": 6101 }, { "epoch": 4.2791023842917255, "grad_norm": 0.2923533320426941, "learning_rate": 2.5632572212644897e-06, "loss": 0.2537, "step": 6102 }, { "epoch": 4.279803646563815, "grad_norm": 0.2486860156059265, "learning_rate": 2.558359572985386e-06, "loss": 0.0892, "step": 6103 }, { "epoch": 4.280504908835905, "grad_norm": 0.3243277668952942, "learning_rate": 2.5534663558466817e-06, "loss": 0.2444, "step": 6104 }, { "epoch": 4.2812061711079945, "grad_norm": 4.9777655601501465, "learning_rate": 2.5485775708145393e-06, "loss": 0.6994, "step": 6105 }, { "epoch": 4.281907433380084, "grad_norm": 0.284822940826416, "learning_rate": 2.5436932188542705e-06, "loss": 0.2509, "step": 6106 }, { "epoch": 4.282608695652174, "grad_norm": 0.29969555139541626, "learning_rate": 2.53881330093029e-06, "loss": 0.2477, "step": 6107 }, { "epoch": 4.2833099579242635, "grad_norm": 0.3041766583919525, "learning_rate": 2.5339378180061518e-06, "loss": 0.2475, "step": 6108 }, { "epoch": 4.284011220196353, "grad_norm": 0.2953599691390991, "learning_rate": 2.529066771044519e-06, "loss": 0.2559, "step": 6109 }, { "epoch": 4.284712482468443, "grad_norm": 0.2487362176179886, "learning_rate": 2.524200161007198e-06, "loss": 0.0893, "step": 6110 }, { "epoch": 4.2854137447405325, "grad_norm": 0.2605412006378174, "learning_rate": 2.5193379888551e-06, "loss": 0.0834, "step": 6111 }, { "epoch": 4.286115007012623, "grad_norm": 0.25851744413375854, "learning_rate": 2.514480255548282e-06, "loss": 0.0837, "step": 6112 }, { "epoch": 4.286816269284713, "grad_norm": 0.24910366535186768, "learning_rate": 2.5096269620459022e-06, "loss": 0.0888, "step": 6113 }, { "epoch": 4.287517531556802, "grad_norm": 0.2484053373336792, "learning_rate": 2.504778109306252e-06, "loss": 0.0883, "step": 6114 }, { "epoch": 4.288218793828892, "grad_norm": 0.2478773295879364, "learning_rate": 2.4999336982867495e-06, "loss": 0.0887, "step": 6115 }, { "epoch": 4.288920056100982, "grad_norm": 3.461773633956909, "learning_rate": 2.4950937299439285e-06, "loss": 0.6059, "step": 6116 }, { "epoch": 4.289621318373071, "grad_norm": 0.24990756809711456, "learning_rate": 2.4902582052334533e-06, "loss": 0.0821, "step": 6117 }, { "epoch": 4.290322580645161, "grad_norm": 0.2727130949497223, "learning_rate": 2.485427125110104e-06, "loss": 0.079, "step": 6118 }, { "epoch": 4.291023842917251, "grad_norm": 0.25224509835243225, "learning_rate": 2.480600490527782e-06, "loss": 0.0894, "step": 6119 }, { "epoch": 4.291725105189341, "grad_norm": 0.2514670789241791, "learning_rate": 2.475778302439524e-06, "loss": 0.0885, "step": 6120 }, { "epoch": 4.292426367461431, "grad_norm": 0.26719337701797485, "learning_rate": 2.4709605617974723e-06, "loss": 0.0769, "step": 6121 }, { "epoch": 4.293127629733521, "grad_norm": 0.28893184661865234, "learning_rate": 2.466147269552893e-06, "loss": 0.2532, "step": 6122 }, { "epoch": 4.29382889200561, "grad_norm": 0.247762531042099, "learning_rate": 2.461338426656193e-06, "loss": 0.0892, "step": 6123 }, { "epoch": 4.2945301542777, "grad_norm": 0.3004249334335327, "learning_rate": 2.4565340340568703e-06, "loss": 0.2574, "step": 6124 }, { "epoch": 4.29523141654979, "grad_norm": 0.25976184010505676, "learning_rate": 2.4517340927035754e-06, "loss": 0.0762, "step": 6125 }, { "epoch": 4.295932678821879, "grad_norm": 0.2586849629878998, "learning_rate": 2.446938603544055e-06, "loss": 0.0819, "step": 6126 }, { "epoch": 4.296633941093969, "grad_norm": 0.3058205246925354, "learning_rate": 2.442147567525199e-06, "loss": 0.2489, "step": 6127 }, { "epoch": 4.297335203366059, "grad_norm": 0.4246414303779602, "learning_rate": 2.4373609855929874e-06, "loss": 0.2627, "step": 6128 }, { "epoch": 4.298036465638148, "grad_norm": 0.249914288520813, "learning_rate": 2.4325788586925523e-06, "loss": 0.0887, "step": 6129 }, { "epoch": 4.298737727910239, "grad_norm": 0.30084115266799927, "learning_rate": 2.427801187768125e-06, "loss": 0.2546, "step": 6130 }, { "epoch": 4.2994389901823284, "grad_norm": 0.4166683256626129, "learning_rate": 2.423027973763073e-06, "loss": 0.2591, "step": 6131 }, { "epoch": 4.300140252454418, "grad_norm": 0.30790117383003235, "learning_rate": 2.418259217619867e-06, "loss": 0.2493, "step": 6132 }, { "epoch": 4.300841514726508, "grad_norm": 0.2503269612789154, "learning_rate": 2.4134949202801155e-06, "loss": 0.0894, "step": 6133 }, { "epoch": 4.301542776998597, "grad_norm": 0.24886909127235413, "learning_rate": 2.408735082684532e-06, "loss": 0.0886, "step": 6134 }, { "epoch": 4.302244039270687, "grad_norm": 0.3304554224014282, "learning_rate": 2.4039797057729546e-06, "loss": 0.2413, "step": 6135 }, { "epoch": 4.302945301542777, "grad_norm": 0.2498704493045807, "learning_rate": 2.3992287904843367e-06, "loss": 0.0883, "step": 6136 }, { "epoch": 4.303646563814866, "grad_norm": 0.26206645369529724, "learning_rate": 2.394482337756762e-06, "loss": 0.0724, "step": 6137 }, { "epoch": 4.304347826086957, "grad_norm": 0.2961658239364624, "learning_rate": 2.3897403485274195e-06, "loss": 0.2529, "step": 6138 }, { "epoch": 4.305049088359047, "grad_norm": 0.29172974824905396, "learning_rate": 2.3850028237326295e-06, "loss": 0.2528, "step": 6139 }, { "epoch": 4.305750350631136, "grad_norm": 0.2514079809188843, "learning_rate": 2.380269764307819e-06, "loss": 0.088, "step": 6140 }, { "epoch": 4.306451612903226, "grad_norm": 0.3090280294418335, "learning_rate": 2.3755411711875482e-06, "loss": 0.2506, "step": 6141 }, { "epoch": 4.307152875175316, "grad_norm": 0.24681220948696136, "learning_rate": 2.3708170453054677e-06, "loss": 0.0893, "step": 6142 }, { "epoch": 4.307854137447405, "grad_norm": 0.25295963883399963, "learning_rate": 2.366097387594382e-06, "loss": 0.0749, "step": 6143 }, { "epoch": 4.308555399719495, "grad_norm": 0.2545658051967621, "learning_rate": 2.3613821989861814e-06, "loss": 0.0841, "step": 6144 }, { "epoch": 4.309256661991585, "grad_norm": 0.3052094876766205, "learning_rate": 2.3566714804119012e-06, "loss": 0.2555, "step": 6145 }, { "epoch": 4.309957924263674, "grad_norm": 0.25374943017959595, "learning_rate": 2.35196523280167e-06, "loss": 0.0833, "step": 6146 }, { "epoch": 4.310659186535764, "grad_norm": 0.2490466684103012, "learning_rate": 2.3472634570847557e-06, "loss": 0.0891, "step": 6147 }, { "epoch": 4.3113604488078545, "grad_norm": 0.2934637665748596, "learning_rate": 2.3425661541895237e-06, "loss": 0.254, "step": 6148 }, { "epoch": 4.312061711079944, "grad_norm": 0.7969584465026855, "learning_rate": 2.3378733250434657e-06, "loss": 0.417, "step": 6149 }, { "epoch": 4.312762973352034, "grad_norm": 0.29860591888427734, "learning_rate": 2.333184970573188e-06, "loss": 0.2545, "step": 6150 }, { "epoch": 4.3134642356241235, "grad_norm": 0.7991604804992676, "learning_rate": 2.3285010917044185e-06, "loss": 0.412, "step": 6151 }, { "epoch": 4.314165497896213, "grad_norm": 0.2477491945028305, "learning_rate": 2.323821689361988e-06, "loss": 0.089, "step": 6152 }, { "epoch": 4.314866760168303, "grad_norm": 0.25033554434776306, "learning_rate": 2.319146764469865e-06, "loss": 0.089, "step": 6153 }, { "epoch": 4.3155680224403925, "grad_norm": 0.25269562005996704, "learning_rate": 2.3144763179511113e-06, "loss": 0.0892, "step": 6154 }, { "epoch": 4.316269284712482, "grad_norm": 0.320545494556427, "learning_rate": 2.3098103507279227e-06, "loss": 0.244, "step": 6155 }, { "epoch": 4.316970546984572, "grad_norm": 0.2524207532405853, "learning_rate": 2.3051488637216014e-06, "loss": 0.0838, "step": 6156 }, { "epoch": 4.317671809256662, "grad_norm": 2.92364764213562, "learning_rate": 2.30049185785256e-06, "loss": 0.3994, "step": 6157 }, { "epoch": 4.318373071528752, "grad_norm": 0.26306891441345215, "learning_rate": 2.2958393340403313e-06, "loss": 0.0755, "step": 6158 }, { "epoch": 4.319074333800842, "grad_norm": 0.32044288516044617, "learning_rate": 2.2911912932035707e-06, "loss": 0.2452, "step": 6159 }, { "epoch": 4.319775596072931, "grad_norm": 0.25044557452201843, "learning_rate": 2.286547736260036e-06, "loss": 0.09, "step": 6160 }, { "epoch": 4.320476858345021, "grad_norm": 0.29052677750587463, "learning_rate": 2.2819086641266143e-06, "loss": 0.2533, "step": 6161 }, { "epoch": 4.321178120617111, "grad_norm": 1.3062512874603271, "learning_rate": 2.2772740777192907e-06, "loss": 0.5832, "step": 6162 }, { "epoch": 4.3218793828892, "grad_norm": 0.2517637610435486, "learning_rate": 2.272643977953173e-06, "loss": 0.0882, "step": 6163 }, { "epoch": 4.32258064516129, "grad_norm": 0.27183234691619873, "learning_rate": 2.268018365742486e-06, "loss": 0.0781, "step": 6164 }, { "epoch": 4.32328190743338, "grad_norm": 0.37730672955513, "learning_rate": 2.2633972420005562e-06, "loss": 0.2496, "step": 6165 }, { "epoch": 4.32398316970547, "grad_norm": 0.2596323490142822, "learning_rate": 2.258780607639846e-06, "loss": 0.0768, "step": 6166 }, { "epoch": 4.32468443197756, "grad_norm": 0.2525579333305359, "learning_rate": 2.2541684635719095e-06, "loss": 0.0894, "step": 6167 }, { "epoch": 4.32538569424965, "grad_norm": 0.24973350763320923, "learning_rate": 2.2495608107074186e-06, "loss": 0.0892, "step": 6168 }, { "epoch": 4.326086956521739, "grad_norm": 0.2717740535736084, "learning_rate": 2.2449576499561724e-06, "loss": 0.0795, "step": 6169 }, { "epoch": 4.326788218793829, "grad_norm": 0.2675091326236725, "learning_rate": 2.240358982227067e-06, "loss": 0.0775, "step": 6170 }, { "epoch": 4.327489481065919, "grad_norm": 0.8049453496932983, "learning_rate": 2.2357648084281117e-06, "loss": 0.4225, "step": 6171 }, { "epoch": 4.328190743338008, "grad_norm": 0.2507096827030182, "learning_rate": 2.231175129466445e-06, "loss": 0.0899, "step": 6172 }, { "epoch": 4.328892005610098, "grad_norm": 0.30314335227012634, "learning_rate": 2.2265899462483002e-06, "loss": 0.2466, "step": 6173 }, { "epoch": 4.329593267882188, "grad_norm": 0.24955686926841736, "learning_rate": 2.2220092596790336e-06, "loss": 0.0894, "step": 6174 }, { "epoch": 4.330294530154278, "grad_norm": 0.8099368214607239, "learning_rate": 2.2174330706631023e-06, "loss": 0.4163, "step": 6175 }, { "epoch": 4.330995792426368, "grad_norm": 0.2518582344055176, "learning_rate": 2.2128613801040975e-06, "loss": 0.0894, "step": 6176 }, { "epoch": 4.3316970546984574, "grad_norm": 0.7851727604866028, "learning_rate": 2.2082941889046886e-06, "loss": 0.4091, "step": 6177 }, { "epoch": 4.332398316970547, "grad_norm": 0.3152585029602051, "learning_rate": 2.2037314979666878e-06, "loss": 0.0656, "step": 6178 }, { "epoch": 4.333099579242637, "grad_norm": 0.2934049069881439, "learning_rate": 2.1991733081909994e-06, "loss": 0.2526, "step": 6179 }, { "epoch": 4.333800841514726, "grad_norm": 0.25206246972084045, "learning_rate": 2.194619620477653e-06, "loss": 0.0896, "step": 6180 }, { "epoch": 4.334502103786816, "grad_norm": 0.2910260558128357, "learning_rate": 2.1900704357257717e-06, "loss": 0.2544, "step": 6181 }, { "epoch": 4.335203366058906, "grad_norm": 0.320509135723114, "learning_rate": 2.1855257548336144e-06, "loss": 0.2395, "step": 6182 }, { "epoch": 4.335904628330995, "grad_norm": 0.2538139224052429, "learning_rate": 2.1809855786985243e-06, "loss": 0.0831, "step": 6183 }, { "epoch": 4.336605890603085, "grad_norm": 0.24971863627433777, "learning_rate": 2.1764499082169737e-06, "loss": 0.0895, "step": 6184 }, { "epoch": 4.337307152875176, "grad_norm": 0.29270923137664795, "learning_rate": 2.171918744284529e-06, "loss": 0.256, "step": 6185 }, { "epoch": 4.338008415147265, "grad_norm": 0.26015177369117737, "learning_rate": 2.1673920877958887e-06, "loss": 0.0847, "step": 6186 }, { "epoch": 4.338709677419355, "grad_norm": 0.2553452253341675, "learning_rate": 2.1628699396448377e-06, "loss": 0.0831, "step": 6187 }, { "epoch": 4.339410939691445, "grad_norm": 0.2498752474784851, "learning_rate": 2.158352300724292e-06, "loss": 0.0888, "step": 6188 }, { "epoch": 4.340112201963534, "grad_norm": 0.25097617506980896, "learning_rate": 2.153839171926264e-06, "loss": 0.0887, "step": 6189 }, { "epoch": 4.340813464235624, "grad_norm": 0.28563982248306274, "learning_rate": 2.1493305541418733e-06, "loss": 0.2515, "step": 6190 }, { "epoch": 4.341514726507714, "grad_norm": 0.30484092235565186, "learning_rate": 2.1448264482613584e-06, "loss": 0.2506, "step": 6191 }, { "epoch": 4.342215988779803, "grad_norm": 0.3077840209007263, "learning_rate": 2.140326855174063e-06, "loss": 0.2482, "step": 6192 }, { "epoch": 4.342917251051894, "grad_norm": 0.7971906661987305, "learning_rate": 2.1358317757684348e-06, "loss": 0.4186, "step": 6193 }, { "epoch": 4.3436185133239835, "grad_norm": 0.342014342546463, "learning_rate": 2.1313412109320446e-06, "loss": 0.234, "step": 6194 }, { "epoch": 4.344319775596073, "grad_norm": 0.2942405939102173, "learning_rate": 2.1268551615515495e-06, "loss": 0.255, "step": 6195 }, { "epoch": 4.345021037868163, "grad_norm": 0.25116896629333496, "learning_rate": 2.122373628512747e-06, "loss": 0.0888, "step": 6196 }, { "epoch": 4.3457223001402525, "grad_norm": 0.25250616669654846, "learning_rate": 2.1178966127004984e-06, "loss": 0.0887, "step": 6197 }, { "epoch": 4.346423562412342, "grad_norm": 0.2501662075519562, "learning_rate": 2.1134241149988187e-06, "loss": 0.0898, "step": 6198 }, { "epoch": 4.347124824684432, "grad_norm": 0.2532263994216919, "learning_rate": 2.1089561362907963e-06, "loss": 0.0897, "step": 6199 }, { "epoch": 4.3478260869565215, "grad_norm": 0.28664103150367737, "learning_rate": 2.1044926774586526e-06, "loss": 0.2543, "step": 6200 }, { "epoch": 4.348527349228611, "grad_norm": 0.7892490029335022, "learning_rate": 2.1000337393836944e-06, "loss": 0.4078, "step": 6201 }, { "epoch": 4.349228611500701, "grad_norm": 0.25999659299850464, "learning_rate": 2.095579322946356e-06, "loss": 0.0843, "step": 6202 }, { "epoch": 4.349929873772791, "grad_norm": 0.2980816066265106, "learning_rate": 2.0911294290261675e-06, "loss": 0.2461, "step": 6203 }, { "epoch": 4.350631136044881, "grad_norm": 0.2514423429965973, "learning_rate": 2.086684058501762e-06, "loss": 0.0891, "step": 6204 }, { "epoch": 4.351332398316971, "grad_norm": 0.28809046745300293, "learning_rate": 2.0822432122508922e-06, "loss": 0.2547, "step": 6205 }, { "epoch": 4.35203366058906, "grad_norm": 0.288139671087265, "learning_rate": 2.0778068911504105e-06, "loss": 0.2534, "step": 6206 }, { "epoch": 4.35273492286115, "grad_norm": 0.2576656937599182, "learning_rate": 2.0733750960762688e-06, "loss": 0.0837, "step": 6207 }, { "epoch": 4.35343618513324, "grad_norm": 3.879814386367798, "learning_rate": 2.0689478279035417e-06, "loss": 0.4818, "step": 6208 }, { "epoch": 4.354137447405329, "grad_norm": 0.32065826654434204, "learning_rate": 2.0645250875063943e-06, "loss": 0.2363, "step": 6209 }, { "epoch": 4.354838709677419, "grad_norm": 0.2531592845916748, "learning_rate": 2.060106875758111e-06, "loss": 0.0886, "step": 6210 }, { "epoch": 4.355539971949509, "grad_norm": 0.272353857755661, "learning_rate": 2.055693193531069e-06, "loss": 0.0781, "step": 6211 }, { "epoch": 4.356241234221599, "grad_norm": 0.30224311351776123, "learning_rate": 2.0512840416967576e-06, "loss": 0.2484, "step": 6212 }, { "epoch": 4.356942496493689, "grad_norm": 0.30895718932151794, "learning_rate": 2.04687942112578e-06, "loss": 0.2477, "step": 6213 }, { "epoch": 4.357643758765779, "grad_norm": 3.555805206298828, "learning_rate": 2.0424793326878216e-06, "loss": 0.4685, "step": 6214 }, { "epoch": 4.358345021037868, "grad_norm": 0.276447057723999, "learning_rate": 2.038083777251704e-06, "loss": 0.0691, "step": 6215 }, { "epoch": 4.359046283309958, "grad_norm": 0.27553659677505493, "learning_rate": 2.0336927556853235e-06, "loss": 0.0775, "step": 6216 }, { "epoch": 4.359747545582048, "grad_norm": 0.29008349776268005, "learning_rate": 2.0293062688557045e-06, "loss": 0.2499, "step": 6217 }, { "epoch": 4.360448807854137, "grad_norm": 0.2801239490509033, "learning_rate": 2.0249243176289566e-06, "loss": 0.2485, "step": 6218 }, { "epoch": 4.361150070126227, "grad_norm": 0.29493623971939087, "learning_rate": 2.02054690287031e-06, "loss": 0.2542, "step": 6219 }, { "epoch": 4.361851332398317, "grad_norm": 0.2504994571208954, "learning_rate": 2.0161740254440896e-06, "loss": 0.0882, "step": 6220 }, { "epoch": 4.362552594670407, "grad_norm": 0.8016871809959412, "learning_rate": 2.0118056862137357e-06, "loss": 0.4082, "step": 6221 }, { "epoch": 4.363253856942497, "grad_norm": 0.32755154371261597, "learning_rate": 2.007441886041772e-06, "loss": 0.2436, "step": 6222 }, { "epoch": 4.3639551192145865, "grad_norm": 0.24995768070220947, "learning_rate": 2.0030826257898484e-06, "loss": 0.082, "step": 6223 }, { "epoch": 4.364656381486676, "grad_norm": 0.24681755900382996, "learning_rate": 1.9987279063187066e-06, "loss": 0.0743, "step": 6224 }, { "epoch": 4.365357643758766, "grad_norm": 0.25503042340278625, "learning_rate": 1.994377728488189e-06, "loss": 0.0822, "step": 6225 }, { "epoch": 4.3660589060308554, "grad_norm": 0.2543928325176239, "learning_rate": 1.990032093157243e-06, "loss": 0.0885, "step": 6226 }, { "epoch": 4.366760168302945, "grad_norm": 0.2704312205314636, "learning_rate": 1.9856910011839342e-06, "loss": 0.0777, "step": 6227 }, { "epoch": 4.367461430575035, "grad_norm": 0.2551566958427429, "learning_rate": 1.9813544534254046e-06, "loss": 0.0841, "step": 6228 }, { "epoch": 4.368162692847124, "grad_norm": 0.3050813376903534, "learning_rate": 1.9770224507379226e-06, "loss": 0.2483, "step": 6229 }, { "epoch": 4.368863955119215, "grad_norm": 0.2519625425338745, "learning_rate": 1.972694993976845e-06, "loss": 0.0895, "step": 6230 }, { "epoch": 4.369565217391305, "grad_norm": 0.2523195743560791, "learning_rate": 1.9683720839966445e-06, "loss": 0.0884, "step": 6231 }, { "epoch": 4.370266479663394, "grad_norm": 0.2723191976547241, "learning_rate": 1.9640537216508712e-06, "loss": 0.0784, "step": 6232 }, { "epoch": 4.370967741935484, "grad_norm": 0.2537282109260559, "learning_rate": 1.9597399077922076e-06, "loss": 0.0751, "step": 6233 }, { "epoch": 4.371669004207574, "grad_norm": 0.29120227694511414, "learning_rate": 1.9554306432724103e-06, "loss": 0.2511, "step": 6234 }, { "epoch": 4.372370266479663, "grad_norm": 0.28703829646110535, "learning_rate": 1.951125928942363e-06, "loss": 0.2509, "step": 6235 }, { "epoch": 4.373071528751753, "grad_norm": 0.278201162815094, "learning_rate": 1.9468257656520327e-06, "loss": 0.0771, "step": 6236 }, { "epoch": 4.373772791023843, "grad_norm": 0.25105324387550354, "learning_rate": 1.9425301542504964e-06, "loss": 0.0896, "step": 6237 }, { "epoch": 4.374474053295932, "grad_norm": 0.25086256861686707, "learning_rate": 1.9382390955859337e-06, "loss": 0.0893, "step": 6238 }, { "epoch": 4.375175315568022, "grad_norm": 0.3006114959716797, "learning_rate": 1.9339525905056137e-06, "loss": 0.2559, "step": 6239 }, { "epoch": 4.3758765778401125, "grad_norm": 0.25104570388793945, "learning_rate": 1.9296706398559126e-06, "loss": 0.0899, "step": 6240 }, { "epoch": 4.376577840112202, "grad_norm": 0.4048531949520111, "learning_rate": 1.9253932444823182e-06, "loss": 0.2583, "step": 6241 }, { "epoch": 4.377279102384292, "grad_norm": 0.4009511470794678, "learning_rate": 1.9211204052294014e-06, "loss": 0.247, "step": 6242 }, { "epoch": 4.3779803646563815, "grad_norm": 0.24725262820720673, "learning_rate": 1.9168521229408516e-06, "loss": 0.0823, "step": 6243 }, { "epoch": 4.378681626928471, "grad_norm": 0.29262420535087585, "learning_rate": 1.9125883984594363e-06, "loss": 0.255, "step": 6244 }, { "epoch": 4.379382889200561, "grad_norm": 0.3211482763290405, "learning_rate": 1.9083292326270485e-06, "loss": 0.2416, "step": 6245 }, { "epoch": 4.3800841514726505, "grad_norm": 0.24870656430721283, "learning_rate": 1.904074626284652e-06, "loss": 0.0816, "step": 6246 }, { "epoch": 4.38078541374474, "grad_norm": 0.26549917459487915, "learning_rate": 1.8998245802723385e-06, "loss": 0.077, "step": 6247 }, { "epoch": 4.381486676016831, "grad_norm": 0.2519749402999878, "learning_rate": 1.8955790954292812e-06, "loss": 0.0893, "step": 6248 }, { "epoch": 4.38218793828892, "grad_norm": 0.31911343336105347, "learning_rate": 1.8913381725937622e-06, "loss": 0.25, "step": 6249 }, { "epoch": 4.38288920056101, "grad_norm": 0.2540830969810486, "learning_rate": 1.8871018126031536e-06, "loss": 0.0881, "step": 6250 }, { "epoch": 4.3835904628331, "grad_norm": 0.26085758209228516, "learning_rate": 1.882870016293939e-06, "loss": 0.0737, "step": 6251 }, { "epoch": 4.384291725105189, "grad_norm": 0.32037869095802307, "learning_rate": 1.8786427845016886e-06, "loss": 0.2362, "step": 6252 }, { "epoch": 4.384992987377279, "grad_norm": 0.2914690673351288, "learning_rate": 1.8744201180610738e-06, "loss": 0.2553, "step": 6253 }, { "epoch": 4.385694249649369, "grad_norm": 0.2553255558013916, "learning_rate": 1.8702020178058776e-06, "loss": 0.0892, "step": 6254 }, { "epoch": 4.386395511921458, "grad_norm": 0.3018356263637543, "learning_rate": 1.8659884845689645e-06, "loss": 0.2454, "step": 6255 }, { "epoch": 4.387096774193548, "grad_norm": 0.2544955611228943, "learning_rate": 1.8617795191822995e-06, "loss": 0.0831, "step": 6256 }, { "epoch": 4.387798036465638, "grad_norm": 0.2516225576400757, "learning_rate": 1.8575751224769594e-06, "loss": 0.0898, "step": 6257 }, { "epoch": 4.388499298737728, "grad_norm": 0.7978813052177429, "learning_rate": 1.8533752952830996e-06, "loss": 0.4192, "step": 6258 }, { "epoch": 4.389200561009818, "grad_norm": 0.28658491373062134, "learning_rate": 1.8491800384299957e-06, "loss": 0.2533, "step": 6259 }, { "epoch": 4.389901823281908, "grad_norm": 0.2506749629974365, "learning_rate": 1.844989352745999e-06, "loss": 0.0888, "step": 6260 }, { "epoch": 4.390603085553997, "grad_norm": 0.2524193823337555, "learning_rate": 1.8408032390585666e-06, "loss": 0.0892, "step": 6261 }, { "epoch": 4.391304347826087, "grad_norm": 0.25496697425842285, "learning_rate": 1.836621698194263e-06, "loss": 0.0836, "step": 6262 }, { "epoch": 4.392005610098177, "grad_norm": 0.3073676526546478, "learning_rate": 1.8324447309787274e-06, "loss": 0.2493, "step": 6263 }, { "epoch": 4.392706872370266, "grad_norm": 0.33081576228141785, "learning_rate": 1.8282723382367224e-06, "loss": 0.2428, "step": 6264 }, { "epoch": 4.393408134642356, "grad_norm": 0.4081215560436249, "learning_rate": 1.8241045207920888e-06, "loss": 0.2609, "step": 6265 }, { "epoch": 4.3941093969144465, "grad_norm": 0.2447034865617752, "learning_rate": 1.8199412794677655e-06, "loss": 0.0796, "step": 6266 }, { "epoch": 4.394810659186536, "grad_norm": 0.2549850642681122, "learning_rate": 1.8157826150857926e-06, "loss": 0.0843, "step": 6267 }, { "epoch": 4.395511921458626, "grad_norm": 0.25212597846984863, "learning_rate": 1.8116285284673128e-06, "loss": 0.0825, "step": 6268 }, { "epoch": 4.3962131837307155, "grad_norm": 0.2519287168979645, "learning_rate": 1.8074790204325481e-06, "loss": 0.0829, "step": 6269 }, { "epoch": 4.396914446002805, "grad_norm": 0.3180766701698303, "learning_rate": 1.8033340918008346e-06, "loss": 0.2399, "step": 6270 }, { "epoch": 4.397615708274895, "grad_norm": 0.3104209303855896, "learning_rate": 1.799193743390587e-06, "loss": 0.2514, "step": 6271 }, { "epoch": 4.3983169705469845, "grad_norm": 0.24822624027729034, "learning_rate": 1.795057976019332e-06, "loss": 0.0889, "step": 6272 }, { "epoch": 4.399018232819074, "grad_norm": 0.29733777046203613, "learning_rate": 1.7909267905036804e-06, "loss": 0.2466, "step": 6273 }, { "epoch": 4.399719495091164, "grad_norm": 5.952130317687988, "learning_rate": 1.7868001876593433e-06, "loss": 0.925, "step": 6274 }, { "epoch": 4.400420757363253, "grad_norm": 0.2691532373428345, "learning_rate": 1.782678168301119e-06, "loss": 0.0772, "step": 6275 }, { "epoch": 4.401122019635344, "grad_norm": 0.295486718416214, "learning_rate": 1.7785607332429178e-06, "loss": 0.254, "step": 6276 }, { "epoch": 4.401823281907434, "grad_norm": 0.2964518070220947, "learning_rate": 1.7744478832977252e-06, "loss": 0.2551, "step": 6277 }, { "epoch": 4.402524544179523, "grad_norm": 0.2665087580680847, "learning_rate": 1.770339619277636e-06, "loss": 0.0783, "step": 6278 }, { "epoch": 4.403225806451613, "grad_norm": 0.2522292137145996, "learning_rate": 1.7662359419938296e-06, "loss": 0.0886, "step": 6279 }, { "epoch": 4.403927068723703, "grad_norm": 0.2521476745605469, "learning_rate": 1.7621368522565962e-06, "loss": 0.0896, "step": 6280 }, { "epoch": 4.404628330995792, "grad_norm": 0.8000521063804626, "learning_rate": 1.7580423508752885e-06, "loss": 0.4149, "step": 6281 }, { "epoch": 4.405329593267882, "grad_norm": 0.346130907535553, "learning_rate": 1.7539524386583878e-06, "loss": 0.2363, "step": 6282 }, { "epoch": 4.406030855539972, "grad_norm": 0.25401049852371216, "learning_rate": 1.749867116413448e-06, "loss": 0.0833, "step": 6283 }, { "epoch": 4.406732117812061, "grad_norm": 0.8906673789024353, "learning_rate": 1.7457863849471262e-06, "loss": 0.4298, "step": 6284 }, { "epoch": 4.407433380084152, "grad_norm": 0.25144198536872864, "learning_rate": 1.7417102450651646e-06, "loss": 0.0902, "step": 6285 }, { "epoch": 4.4081346423562415, "grad_norm": 0.2621580958366394, "learning_rate": 1.7376386975724107e-06, "loss": 0.081, "step": 6286 }, { "epoch": 4.408835904628331, "grad_norm": 0.25617918372154236, "learning_rate": 1.7335717432727966e-06, "loss": 0.0749, "step": 6287 }, { "epoch": 4.409537166900421, "grad_norm": 0.30518004298210144, "learning_rate": 1.7295093829693465e-06, "loss": 0.2558, "step": 6288 }, { "epoch": 4.4102384291725105, "grad_norm": 0.2684614658355713, "learning_rate": 1.72545161746418e-06, "loss": 0.0681, "step": 6289 }, { "epoch": 4.4109396914446, "grad_norm": 0.25679725408554077, "learning_rate": 1.7213984475585144e-06, "loss": 0.082, "step": 6290 }, { "epoch": 4.41164095371669, "grad_norm": 0.31496453285217285, "learning_rate": 1.7173498740526483e-06, "loss": 0.245, "step": 6291 }, { "epoch": 4.4123422159887795, "grad_norm": 0.3979145288467407, "learning_rate": 1.7133058977459898e-06, "loss": 0.2468, "step": 6292 }, { "epoch": 4.413043478260869, "grad_norm": 0.2633626461029053, "learning_rate": 1.7092665194370166e-06, "loss": 0.0758, "step": 6293 }, { "epoch": 4.41374474053296, "grad_norm": 0.24956762790679932, "learning_rate": 1.705231739923327e-06, "loss": 0.0809, "step": 6294 }, { "epoch": 4.414446002805049, "grad_norm": 0.298572301864624, "learning_rate": 1.7012015600015751e-06, "loss": 0.2559, "step": 6295 }, { "epoch": 4.415147265077139, "grad_norm": 0.26999497413635254, "learning_rate": 1.6971759804675441e-06, "loss": 0.0773, "step": 6296 }, { "epoch": 4.415848527349229, "grad_norm": 0.2545521855354309, "learning_rate": 1.6931550021160786e-06, "loss": 0.0877, "step": 6297 }, { "epoch": 4.416549789621318, "grad_norm": 4.551060676574707, "learning_rate": 1.6891386257411378e-06, "loss": 0.532, "step": 6298 }, { "epoch": 4.417251051893408, "grad_norm": 0.30890288949012756, "learning_rate": 1.6851268521357544e-06, "loss": 0.2489, "step": 6299 }, { "epoch": 4.417952314165498, "grad_norm": 0.29152175784111023, "learning_rate": 1.6811196820920694e-06, "loss": 0.0699, "step": 6300 }, { "epoch": 4.418653576437587, "grad_norm": 0.25721538066864014, "learning_rate": 1.6771171164012972e-06, "loss": 0.0836, "step": 6301 }, { "epoch": 4.419354838709677, "grad_norm": 0.27276861667633057, "learning_rate": 1.673119155853753e-06, "loss": 0.0762, "step": 6302 }, { "epoch": 4.420056100981768, "grad_norm": 0.2516676187515259, "learning_rate": 1.6691258012388382e-06, "loss": 0.0829, "step": 6303 }, { "epoch": 4.420757363253857, "grad_norm": 0.25063982605934143, "learning_rate": 1.665137053345056e-06, "loss": 0.0884, "step": 6304 }, { "epoch": 4.421458625525947, "grad_norm": 0.2822166383266449, "learning_rate": 1.6611529129599818e-06, "loss": 0.0703, "step": 6305 }, { "epoch": 4.422159887798037, "grad_norm": 0.2659904956817627, "learning_rate": 1.6571733808702999e-06, "loss": 0.0765, "step": 6306 }, { "epoch": 4.422861150070126, "grad_norm": 0.912237286567688, "learning_rate": 1.6531984578617655e-06, "loss": 0.426, "step": 6307 }, { "epoch": 4.423562412342216, "grad_norm": 0.25806713104248047, "learning_rate": 1.6492281447192447e-06, "loss": 0.0824, "step": 6308 }, { "epoch": 4.424263674614306, "grad_norm": 2.8572537899017334, "learning_rate": 1.6452624422266805e-06, "loss": 0.3895, "step": 6309 }, { "epoch": 4.424964936886395, "grad_norm": 0.2550782859325409, "learning_rate": 1.6413013511670988e-06, "loss": 0.0823, "step": 6310 }, { "epoch": 4.425666199158485, "grad_norm": 3.213682174682617, "learning_rate": 1.6373448723226326e-06, "loss": 0.5687, "step": 6311 }, { "epoch": 4.426367461430575, "grad_norm": 0.2662953734397888, "learning_rate": 1.6333930064744902e-06, "loss": 0.0772, "step": 6312 }, { "epoch": 4.427068723702665, "grad_norm": 0.4248972237110138, "learning_rate": 1.6294457544029806e-06, "loss": 0.2615, "step": 6313 }, { "epoch": 4.427769985974755, "grad_norm": 0.24915719032287598, "learning_rate": 1.6255031168874917e-06, "loss": 0.0894, "step": 6314 }, { "epoch": 4.4284712482468445, "grad_norm": 0.2566870450973511, "learning_rate": 1.6215650947065037e-06, "loss": 0.0835, "step": 6315 }, { "epoch": 4.429172510518934, "grad_norm": 0.25292107462882996, "learning_rate": 1.617631688637583e-06, "loss": 0.0824, "step": 6316 }, { "epoch": 4.429873772791024, "grad_norm": 0.2499745786190033, "learning_rate": 1.613702899457395e-06, "loss": 0.0893, "step": 6317 }, { "epoch": 4.4305750350631135, "grad_norm": 0.31826847791671753, "learning_rate": 1.609778727941677e-06, "loss": 0.2495, "step": 6318 }, { "epoch": 4.431276297335203, "grad_norm": 0.25308820605278015, "learning_rate": 1.6058591748652735e-06, "loss": 0.0899, "step": 6319 }, { "epoch": 4.431977559607293, "grad_norm": 0.2550888955593109, "learning_rate": 1.6019442410020957e-06, "loss": 0.0818, "step": 6320 }, { "epoch": 4.432678821879383, "grad_norm": 0.25168469548225403, "learning_rate": 1.598033927125167e-06, "loss": 0.089, "step": 6321 }, { "epoch": 4.433380084151473, "grad_norm": 0.286304771900177, "learning_rate": 1.59412823400657e-06, "loss": 0.2504, "step": 6322 }, { "epoch": 4.434081346423563, "grad_norm": 0.2913591265678406, "learning_rate": 1.5902271624175041e-06, "loss": 0.2517, "step": 6323 }, { "epoch": 4.434782608695652, "grad_norm": 0.30940431356430054, "learning_rate": 1.586330713128234e-06, "loss": 0.2506, "step": 6324 }, { "epoch": 4.435483870967742, "grad_norm": 0.2868114411830902, "learning_rate": 1.5824388869081247e-06, "loss": 0.2521, "step": 6325 }, { "epoch": 4.436185133239832, "grad_norm": 0.29645398259162903, "learning_rate": 1.5785516845256226e-06, "loss": 0.2548, "step": 6326 }, { "epoch": 4.436886395511921, "grad_norm": 0.2915540635585785, "learning_rate": 1.574669106748264e-06, "loss": 0.2462, "step": 6327 }, { "epoch": 4.437587657784011, "grad_norm": 0.24885424971580505, "learning_rate": 1.5707911543426684e-06, "loss": 0.0816, "step": 6328 }, { "epoch": 4.438288920056101, "grad_norm": 0.31860777735710144, "learning_rate": 1.5669178280745462e-06, "loss": 0.2385, "step": 6329 }, { "epoch": 4.43899018232819, "grad_norm": 0.2619536519050598, "learning_rate": 1.563049128708688e-06, "loss": 0.0833, "step": 6330 }, { "epoch": 4.439691444600281, "grad_norm": 0.30918753147125244, "learning_rate": 1.55918505700898e-06, "loss": 0.2507, "step": 6331 }, { "epoch": 4.4403927068723705, "grad_norm": 0.29823562502861023, "learning_rate": 1.555325613738387e-06, "loss": 0.2464, "step": 6332 }, { "epoch": 4.44109396914446, "grad_norm": 0.2530180811882019, "learning_rate": 1.5514707996589662e-06, "loss": 0.0889, "step": 6333 }, { "epoch": 4.44179523141655, "grad_norm": 0.25167015194892883, "learning_rate": 1.54762061553185e-06, "loss": 0.0884, "step": 6334 }, { "epoch": 4.4424964936886395, "grad_norm": 0.2978036105632782, "learning_rate": 1.5437750621172782e-06, "loss": 0.2492, "step": 6335 }, { "epoch": 4.443197755960729, "grad_norm": 0.2591835558414459, "learning_rate": 1.5399341401745426e-06, "loss": 0.0835, "step": 6336 }, { "epoch": 4.443899018232819, "grad_norm": 0.26313284039497375, "learning_rate": 1.5360978504620538e-06, "loss": 0.0848, "step": 6337 }, { "epoch": 4.4446002805049085, "grad_norm": 0.24971501529216766, "learning_rate": 1.5322661937372863e-06, "loss": 0.0827, "step": 6338 }, { "epoch": 4.445301542776998, "grad_norm": 0.25185030698776245, "learning_rate": 1.528439170756815e-06, "loss": 0.0898, "step": 6339 }, { "epoch": 4.446002805049089, "grad_norm": 0.807957649230957, "learning_rate": 1.5246167822762808e-06, "loss": 0.4228, "step": 6340 }, { "epoch": 4.446704067321178, "grad_norm": 0.2984069287776947, "learning_rate": 1.5207990290504348e-06, "loss": 0.2578, "step": 6341 }, { "epoch": 4.447405329593268, "grad_norm": 0.4186892807483673, "learning_rate": 1.5169859118330909e-06, "loss": 0.2539, "step": 6342 }, { "epoch": 4.448106591865358, "grad_norm": 0.24499911069869995, "learning_rate": 1.5131774313771551e-06, "loss": 0.082, "step": 6343 }, { "epoch": 4.448807854137447, "grad_norm": 0.25156763195991516, "learning_rate": 1.5093735884346177e-06, "loss": 0.0897, "step": 6344 }, { "epoch": 4.449509116409537, "grad_norm": 0.2535279095172882, "learning_rate": 1.5055743837565611e-06, "loss": 0.0744, "step": 6345 }, { "epoch": 4.450210378681627, "grad_norm": 0.4219643771648407, "learning_rate": 1.5017798180931353e-06, "loss": 0.2651, "step": 6346 }, { "epoch": 4.450911640953716, "grad_norm": 0.2528790831565857, "learning_rate": 1.497989892193591e-06, "loss": 0.0823, "step": 6347 }, { "epoch": 4.451612903225806, "grad_norm": 3.7707674503326416, "learning_rate": 1.4942046068062487e-06, "loss": 0.4865, "step": 6348 }, { "epoch": 4.452314165497897, "grad_norm": 0.30639833211898804, "learning_rate": 1.4904239626785277e-06, "loss": 0.2577, "step": 6349 }, { "epoch": 4.453015427769986, "grad_norm": 0.2518167197704315, "learning_rate": 1.4866479605569195e-06, "loss": 0.0888, "step": 6350 }, { "epoch": 4.453716690042076, "grad_norm": 0.31561529636383057, "learning_rate": 1.4828766011870026e-06, "loss": 0.2517, "step": 6351 }, { "epoch": 4.454417952314166, "grad_norm": 0.25370386242866516, "learning_rate": 1.4791098853134315e-06, "loss": 0.0837, "step": 6352 }, { "epoch": 4.455119214586255, "grad_norm": 0.29303303360939026, "learning_rate": 1.4753478136799614e-06, "loss": 0.2526, "step": 6353 }, { "epoch": 4.455820476858345, "grad_norm": 0.9110117554664612, "learning_rate": 1.4715903870294117e-06, "loss": 0.4212, "step": 6354 }, { "epoch": 4.456521739130435, "grad_norm": 0.2649446725845337, "learning_rate": 1.4678376061036976e-06, "loss": 0.0748, "step": 6355 }, { "epoch": 4.457223001402524, "grad_norm": 0.32219138741493225, "learning_rate": 1.4640894716438119e-06, "loss": 0.2481, "step": 6356 }, { "epoch": 4.457924263674614, "grad_norm": 0.3127251863479614, "learning_rate": 1.4603459843898242e-06, "loss": 0.249, "step": 6357 }, { "epoch": 4.4586255259467045, "grad_norm": 0.25074583292007446, "learning_rate": 1.4566071450808988e-06, "loss": 0.0897, "step": 6358 }, { "epoch": 4.459326788218794, "grad_norm": 0.3991132080554962, "learning_rate": 1.4528729544552726e-06, "loss": 0.2493, "step": 6359 }, { "epoch": 4.460028050490884, "grad_norm": 0.388883501291275, "learning_rate": 1.4491434132502729e-06, "loss": 0.258, "step": 6360 }, { "epoch": 4.4607293127629735, "grad_norm": 0.34038808941841125, "learning_rate": 1.4454185222022965e-06, "loss": 0.2327, "step": 6361 }, { "epoch": 4.461430575035063, "grad_norm": 0.2697981297969818, "learning_rate": 1.4416982820468356e-06, "loss": 0.0766, "step": 6362 }, { "epoch": 4.462131837307153, "grad_norm": 0.9372533559799194, "learning_rate": 1.4379826935184581e-06, "loss": 0.4156, "step": 6363 }, { "epoch": 4.4628330995792425, "grad_norm": 0.2547260820865631, "learning_rate": 1.4342717573508107e-06, "loss": 0.0838, "step": 6364 }, { "epoch": 4.463534361851332, "grad_norm": 0.25169703364372253, "learning_rate": 1.430565474276621e-06, "loss": 0.0905, "step": 6365 }, { "epoch": 4.464235624123422, "grad_norm": 0.2609158456325531, "learning_rate": 1.4268638450277067e-06, "loss": 0.0847, "step": 6366 }, { "epoch": 4.4649368863955115, "grad_norm": 0.24949954450130463, "learning_rate": 1.4231668703349576e-06, "loss": 0.0823, "step": 6367 }, { "epoch": 4.465638148667602, "grad_norm": 0.8075298070907593, "learning_rate": 1.4194745509283514e-06, "loss": 0.4233, "step": 6368 }, { "epoch": 4.466339410939692, "grad_norm": 0.320097953081131, "learning_rate": 1.415786887536935e-06, "loss": 0.2413, "step": 6369 }, { "epoch": 4.467040673211781, "grad_norm": 0.8301578164100647, "learning_rate": 1.412103880888857e-06, "loss": 0.4042, "step": 6370 }, { "epoch": 4.467741935483871, "grad_norm": 0.25212493538856506, "learning_rate": 1.4084255317113187e-06, "loss": 0.0748, "step": 6371 }, { "epoch": 4.468443197755961, "grad_norm": 0.3201468884944916, "learning_rate": 1.4047518407306283e-06, "loss": 0.241, "step": 6372 }, { "epoch": 4.46914446002805, "grad_norm": 0.2579905688762665, "learning_rate": 1.4010828086721523e-06, "loss": 0.0822, "step": 6373 }, { "epoch": 4.46984572230014, "grad_norm": 0.25557708740234375, "learning_rate": 1.397418436260356e-06, "loss": 0.0829, "step": 6374 }, { "epoch": 4.47054698457223, "grad_norm": 0.2551831603050232, "learning_rate": 1.3937587242187688e-06, "loss": 0.0893, "step": 6375 }, { "epoch": 4.47124824684432, "grad_norm": 0.25532159209251404, "learning_rate": 1.3901036732700128e-06, "loss": 0.0888, "step": 6376 }, { "epoch": 4.47194950911641, "grad_norm": 0.2506493330001831, "learning_rate": 1.3864532841357831e-06, "loss": 0.0896, "step": 6377 }, { "epoch": 4.4726507713884995, "grad_norm": 0.43422195315361023, "learning_rate": 1.3828075575368555e-06, "loss": 0.2612, "step": 6378 }, { "epoch": 4.473352033660589, "grad_norm": 0.8626829385757446, "learning_rate": 1.3791664941930793e-06, "loss": 0.4168, "step": 6379 }, { "epoch": 4.474053295932679, "grad_norm": 0.25070422887802124, "learning_rate": 1.3755300948233962e-06, "loss": 0.0834, "step": 6380 }, { "epoch": 4.4747545582047685, "grad_norm": 0.2552199959754944, "learning_rate": 1.3718983601458152e-06, "loss": 0.0838, "step": 6381 }, { "epoch": 4.475455820476858, "grad_norm": 0.2521866261959076, "learning_rate": 1.3682712908774293e-06, "loss": 0.0896, "step": 6382 }, { "epoch": 4.476157082748948, "grad_norm": 0.26168838143348694, "learning_rate": 1.36464888773441e-06, "loss": 0.0755, "step": 6383 }, { "epoch": 4.4768583450210375, "grad_norm": 0.2514285445213318, "learning_rate": 1.3610311514320128e-06, "loss": 0.0893, "step": 6384 }, { "epoch": 4.477559607293127, "grad_norm": 0.2917121648788452, "learning_rate": 1.3574180826845528e-06, "loss": 0.2488, "step": 6385 }, { "epoch": 4.478260869565218, "grad_norm": 0.2941823899745941, "learning_rate": 1.3538096822054453e-06, "loss": 0.2545, "step": 6386 }, { "epoch": 4.478962131837307, "grad_norm": 0.267029345035553, "learning_rate": 1.350205950707173e-06, "loss": 0.0769, "step": 6387 }, { "epoch": 4.479663394109397, "grad_norm": 3.630648612976074, "learning_rate": 1.3466068889013027e-06, "loss": 0.4575, "step": 6388 }, { "epoch": 4.480364656381487, "grad_norm": 0.2528753876686096, "learning_rate": 1.3430124974984692e-06, "loss": 0.0833, "step": 6389 }, { "epoch": 4.481065918653576, "grad_norm": 0.293435662984848, "learning_rate": 1.3394227772083961e-06, "loss": 0.2553, "step": 6390 }, { "epoch": 4.481767180925666, "grad_norm": 3.451003074645996, "learning_rate": 1.3358377287398804e-06, "loss": 0.4477, "step": 6391 }, { "epoch": 4.482468443197756, "grad_norm": 3.800833225250244, "learning_rate": 1.332257352800792e-06, "loss": 0.6134, "step": 6392 }, { "epoch": 4.483169705469845, "grad_norm": 0.2978748679161072, "learning_rate": 1.328681650098082e-06, "loss": 0.2547, "step": 6393 }, { "epoch": 4.483870967741936, "grad_norm": 0.3041779100894928, "learning_rate": 1.3251106213377829e-06, "loss": 0.2471, "step": 6394 }, { "epoch": 4.484572230014026, "grad_norm": 0.2558680772781372, "learning_rate": 1.3215442672249973e-06, "loss": 0.0833, "step": 6395 }, { "epoch": 4.485273492286115, "grad_norm": 0.2469528764486313, "learning_rate": 1.317982588463909e-06, "loss": 0.0804, "step": 6396 }, { "epoch": 4.485974754558205, "grad_norm": 0.2523874044418335, "learning_rate": 1.3144255857577781e-06, "loss": 0.0896, "step": 6397 }, { "epoch": 4.486676016830295, "grad_norm": 0.2619272470474243, "learning_rate": 1.3108732598089424e-06, "loss": 0.0754, "step": 6398 }, { "epoch": 4.487377279102384, "grad_norm": 0.2518288791179657, "learning_rate": 1.307325611318813e-06, "loss": 0.0897, "step": 6399 }, { "epoch": 4.488078541374474, "grad_norm": 0.8685265779495239, "learning_rate": 1.30378264098788e-06, "loss": 0.4241, "step": 6400 }, { "epoch": 4.488779803646564, "grad_norm": 0.26916834712028503, "learning_rate": 1.300244349515703e-06, "loss": 0.0774, "step": 6401 }, { "epoch": 4.489481065918653, "grad_norm": 0.24706411361694336, "learning_rate": 1.2967107376009342e-06, "loss": 0.0819, "step": 6402 }, { "epoch": 4.490182328190743, "grad_norm": 0.26053181290626526, "learning_rate": 1.2931818059412793e-06, "loss": 0.0835, "step": 6403 }, { "epoch": 4.4908835904628335, "grad_norm": 0.25453194975852966, "learning_rate": 1.289657555233545e-06, "loss": 0.0836, "step": 6404 }, { "epoch": 4.491584852734923, "grad_norm": 0.29741156101226807, "learning_rate": 1.286137986173594e-06, "loss": 0.2477, "step": 6405 }, { "epoch": 4.492286115007013, "grad_norm": 0.2489626407623291, "learning_rate": 1.2826230994563676e-06, "loss": 0.0891, "step": 6406 }, { "epoch": 4.4929873772791025, "grad_norm": 0.30395832657814026, "learning_rate": 1.2791128957758913e-06, "loss": 0.2495, "step": 6407 }, { "epoch": 4.493688639551192, "grad_norm": 0.2566397190093994, "learning_rate": 1.275607375825258e-06, "loss": 0.0837, "step": 6408 }, { "epoch": 4.494389901823282, "grad_norm": 0.24939721822738647, "learning_rate": 1.2721065402966441e-06, "loss": 0.0829, "step": 6409 }, { "epoch": 4.4950911640953715, "grad_norm": 0.25165170431137085, "learning_rate": 1.2686103898812917e-06, "loss": 0.0895, "step": 6410 }, { "epoch": 4.495792426367461, "grad_norm": 0.25726011395454407, "learning_rate": 1.2651189252695177e-06, "loss": 0.0834, "step": 6411 }, { "epoch": 4.496493688639551, "grad_norm": 0.8064595460891724, "learning_rate": 1.2616321471507265e-06, "loss": 0.4128, "step": 6412 }, { "epoch": 4.497194950911641, "grad_norm": 0.25789710879325867, "learning_rate": 1.2581500562133837e-06, "loss": 0.0845, "step": 6413 }, { "epoch": 4.497896213183731, "grad_norm": 0.2529245615005493, "learning_rate": 1.2546726531450282e-06, "loss": 0.0902, "step": 6414 }, { "epoch": 4.498597475455821, "grad_norm": 0.2756042182445526, "learning_rate": 1.2511999386322914e-06, "loss": 0.0796, "step": 6415 }, { "epoch": 4.49929873772791, "grad_norm": 0.25934720039367676, "learning_rate": 1.2477319133608557e-06, "loss": 0.076, "step": 6416 }, { "epoch": 4.5, "grad_norm": 0.3011397421360016, "learning_rate": 1.244268578015495e-06, "loss": 0.2488, "step": 6417 }, { "epoch": 4.50070126227209, "grad_norm": 0.30280596017837524, "learning_rate": 1.2408099332800488e-06, "loss": 0.2485, "step": 6418 }, { "epoch": 4.501402524544179, "grad_norm": 0.7973014116287231, "learning_rate": 1.2373559798374374e-06, "loss": 0.4106, "step": 6419 }, { "epoch": 4.502103786816269, "grad_norm": 0.30186888575553894, "learning_rate": 1.23390671836964e-06, "loss": 0.0709, "step": 6420 }, { "epoch": 4.502805049088359, "grad_norm": 0.29378145933151245, "learning_rate": 1.2304621495577262e-06, "loss": 0.2487, "step": 6421 }, { "epoch": 4.503506311360448, "grad_norm": 1.3212326765060425, "learning_rate": 1.2270222740818293e-06, "loss": 0.5798, "step": 6422 }, { "epoch": 4.504207573632539, "grad_norm": 0.252092182636261, "learning_rate": 1.2235870926211619e-06, "loss": 0.0894, "step": 6423 }, { "epoch": 4.5049088359046285, "grad_norm": 0.25171157717704773, "learning_rate": 1.2201566058540004e-06, "loss": 0.0891, "step": 6424 }, { "epoch": 4.505610098176718, "grad_norm": 0.25169607996940613, "learning_rate": 1.2167308144577088e-06, "loss": 0.0902, "step": 6425 }, { "epoch": 4.506311360448808, "grad_norm": 0.25542503595352173, "learning_rate": 1.2133097191087072e-06, "loss": 0.0895, "step": 6426 }, { "epoch": 4.5070126227208975, "grad_norm": 3.2061607837677, "learning_rate": 1.2098933204825053e-06, "loss": 0.4171, "step": 6427 }, { "epoch": 4.507713884992987, "grad_norm": 0.8000690937042236, "learning_rate": 1.206481619253666e-06, "loss": 0.4051, "step": 6428 }, { "epoch": 4.508415147265077, "grad_norm": 0.25210851430892944, "learning_rate": 1.2030746160958422e-06, "loss": 0.0893, "step": 6429 }, { "epoch": 4.5091164095371665, "grad_norm": 0.25239673256874084, "learning_rate": 1.1996723116817516e-06, "loss": 0.0896, "step": 6430 }, { "epoch": 4.509817671809257, "grad_norm": 1.295607089996338, "learning_rate": 1.1962747066831897e-06, "loss": 0.5794, "step": 6431 }, { "epoch": 4.510518934081347, "grad_norm": 0.30206212401390076, "learning_rate": 1.1928818017710092e-06, "loss": 0.2479, "step": 6432 }, { "epoch": 4.511220196353436, "grad_norm": 0.2667599022388458, "learning_rate": 1.1894935976151572e-06, "loss": 0.0842, "step": 6433 }, { "epoch": 4.511921458625526, "grad_norm": 0.2719915509223938, "learning_rate": 1.1861100948846293e-06, "loss": 0.0784, "step": 6434 }, { "epoch": 4.512622720897616, "grad_norm": 0.29588690400123596, "learning_rate": 1.1827312942475106e-06, "loss": 0.2444, "step": 6435 }, { "epoch": 4.513323983169705, "grad_norm": 0.2700355350971222, "learning_rate": 1.1793571963709448e-06, "loss": 0.0832, "step": 6436 }, { "epoch": 4.514025245441795, "grad_norm": 0.26551422476768494, "learning_rate": 1.1759878019211606e-06, "loss": 0.0828, "step": 6437 }, { "epoch": 4.514726507713885, "grad_norm": 0.29026174545288086, "learning_rate": 1.1726231115634451e-06, "loss": 0.2559, "step": 6438 }, { "epoch": 4.515427769985974, "grad_norm": 0.299826055765152, "learning_rate": 1.1692631259621722e-06, "loss": 0.2477, "step": 6439 }, { "epoch": 4.516129032258064, "grad_norm": 0.2562020421028137, "learning_rate": 1.1659078457807644e-06, "loss": 0.0837, "step": 6440 }, { "epoch": 4.516830294530155, "grad_norm": 0.2614131271839142, "learning_rate": 1.162557271681733e-06, "loss": 0.0845, "step": 6441 }, { "epoch": 4.517531556802244, "grad_norm": 0.25357601046562195, "learning_rate": 1.1592114043266516e-06, "loss": 0.0839, "step": 6442 }, { "epoch": 4.518232819074334, "grad_norm": 0.25314903259277344, "learning_rate": 1.1558702443761727e-06, "loss": 0.0894, "step": 6443 }, { "epoch": 4.518934081346424, "grad_norm": 0.30371829867362976, "learning_rate": 1.152533792490007e-06, "loss": 0.2497, "step": 6444 }, { "epoch": 4.519635343618513, "grad_norm": 0.25391289591789246, "learning_rate": 1.1492020493269529e-06, "loss": 0.0898, "step": 6445 }, { "epoch": 4.520336605890603, "grad_norm": 0.25279882550239563, "learning_rate": 1.1458750155448616e-06, "loss": 0.0901, "step": 6446 }, { "epoch": 4.521037868162693, "grad_norm": 0.2563851475715637, "learning_rate": 1.1425526918006607e-06, "loss": 0.0897, "step": 6447 }, { "epoch": 4.521739130434782, "grad_norm": 0.34350234270095825, "learning_rate": 1.139235078750353e-06, "loss": 0.234, "step": 6448 }, { "epoch": 4.522440392706873, "grad_norm": 0.2557099461555481, "learning_rate": 1.1359221770490037e-06, "loss": 0.0896, "step": 6449 }, { "epoch": 4.5231416549789625, "grad_norm": 0.24999785423278809, "learning_rate": 1.1326139873507502e-06, "loss": 0.0823, "step": 6450 }, { "epoch": 4.523842917251052, "grad_norm": 0.27493956685066223, "learning_rate": 1.1293105103088063e-06, "loss": 0.0784, "step": 6451 }, { "epoch": 4.524544179523142, "grad_norm": 0.252937912940979, "learning_rate": 1.1260117465754394e-06, "loss": 0.0896, "step": 6452 }, { "epoch": 4.5252454417952315, "grad_norm": 2.916613817214966, "learning_rate": 1.1227176968020058e-06, "loss": 0.4047, "step": 6453 }, { "epoch": 4.525946704067321, "grad_norm": 0.3887341022491455, "learning_rate": 1.1194283616389162e-06, "loss": 0.2514, "step": 6454 }, { "epoch": 4.526647966339411, "grad_norm": 0.25371941924095154, "learning_rate": 1.1161437417356563e-06, "loss": 0.0824, "step": 6455 }, { "epoch": 4.5273492286115005, "grad_norm": 0.26083171367645264, "learning_rate": 1.1128638377407796e-06, "loss": 0.085, "step": 6456 }, { "epoch": 4.52805049088359, "grad_norm": 0.2818686068058014, "learning_rate": 1.1095886503019072e-06, "loss": 0.2491, "step": 6457 }, { "epoch": 4.52875175315568, "grad_norm": 0.2634180188179016, "learning_rate": 1.1063181800657352e-06, "loss": 0.0752, "step": 6458 }, { "epoch": 4.52945301542777, "grad_norm": 0.2537623345851898, "learning_rate": 1.1030524276780223e-06, "loss": 0.0894, "step": 6459 }, { "epoch": 4.53015427769986, "grad_norm": 0.2694953382015228, "learning_rate": 1.0997913937835947e-06, "loss": 0.0783, "step": 6460 }, { "epoch": 4.53085553997195, "grad_norm": 0.252672016620636, "learning_rate": 1.0965350790263451e-06, "loss": 0.0752, "step": 6461 }, { "epoch": 4.531556802244039, "grad_norm": 0.2531993091106415, "learning_rate": 1.0932834840492484e-06, "loss": 0.0899, "step": 6462 }, { "epoch": 4.532258064516129, "grad_norm": 0.2530447542667389, "learning_rate": 1.0900366094943298e-06, "loss": 0.0834, "step": 6463 }, { "epoch": 4.532959326788219, "grad_norm": 0.2505335211753845, "learning_rate": 1.0867944560026933e-06, "loss": 0.0897, "step": 6464 }, { "epoch": 4.533660589060308, "grad_norm": 0.25192803144454956, "learning_rate": 1.0835570242145071e-06, "loss": 0.0904, "step": 6465 }, { "epoch": 4.534361851332398, "grad_norm": 0.35435962677001953, "learning_rate": 1.0803243147690074e-06, "loss": 0.2389, "step": 6466 }, { "epoch": 4.5350631136044885, "grad_norm": 0.2911701202392578, "learning_rate": 1.0770963283045027e-06, "loss": 0.2563, "step": 6467 }, { "epoch": 4.535764375876578, "grad_norm": 0.3090967833995819, "learning_rate": 1.0738730654583583e-06, "loss": 0.2517, "step": 6468 }, { "epoch": 4.536465638148668, "grad_norm": 0.34411531686782837, "learning_rate": 1.0706545268670098e-06, "loss": 0.2391, "step": 6469 }, { "epoch": 4.5371669004207575, "grad_norm": 0.26482874155044556, "learning_rate": 1.0674407131659737e-06, "loss": 0.0758, "step": 6470 }, { "epoch": 4.537868162692847, "grad_norm": 0.7858099341392517, "learning_rate": 1.0642316249898144e-06, "loss": 0.4127, "step": 6471 }, { "epoch": 4.538569424964937, "grad_norm": 0.2845078110694885, "learning_rate": 1.061027262972178e-06, "loss": 0.252, "step": 6472 }, { "epoch": 4.5392706872370265, "grad_norm": 0.2563697099685669, "learning_rate": 1.0578276277457666e-06, "loss": 0.0836, "step": 6473 }, { "epoch": 4.539971949509116, "grad_norm": 0.2988446056842804, "learning_rate": 1.054632719942361e-06, "loss": 0.2472, "step": 6474 }, { "epoch": 4.540673211781206, "grad_norm": 0.25140467286109924, "learning_rate": 1.0514425401927874e-06, "loss": 0.0824, "step": 6475 }, { "epoch": 4.5413744740532955, "grad_norm": 0.258938193321228, "learning_rate": 1.0482570891269667e-06, "loss": 0.0835, "step": 6476 }, { "epoch": 4.542075736325386, "grad_norm": 0.2531615197658539, "learning_rate": 1.0450763673738622e-06, "loss": 0.0833, "step": 6477 }, { "epoch": 4.542776998597476, "grad_norm": 4.413488864898682, "learning_rate": 1.0419003755615165e-06, "loss": 0.532, "step": 6478 }, { "epoch": 4.543478260869565, "grad_norm": 0.25200697779655457, "learning_rate": 1.038729114317033e-06, "loss": 0.0893, "step": 6479 }, { "epoch": 4.544179523141655, "grad_norm": 0.3129206597805023, "learning_rate": 1.035562584266589e-06, "loss": 0.2578, "step": 6480 }, { "epoch": 4.544880785413745, "grad_norm": 0.293989360332489, "learning_rate": 1.0324007860354146e-06, "loss": 0.2553, "step": 6481 }, { "epoch": 4.545582047685834, "grad_norm": 0.37084105610847473, "learning_rate": 1.0292437202478133e-06, "loss": 0.2579, "step": 6482 }, { "epoch": 4.546283309957924, "grad_norm": 0.25810736417770386, "learning_rate": 1.0260913875271533e-06, "loss": 0.084, "step": 6483 }, { "epoch": 4.546984572230014, "grad_norm": 0.25548428297042847, "learning_rate": 1.0229437884958698e-06, "loss": 0.0844, "step": 6484 }, { "epoch": 4.547685834502104, "grad_norm": 0.2855333983898163, "learning_rate": 1.0198009237754574e-06, "loss": 0.0694, "step": 6485 }, { "epoch": 4.548387096774194, "grad_norm": 0.26557356119155884, "learning_rate": 1.0166627939864865e-06, "loss": 0.0868, "step": 6486 }, { "epoch": 4.549088359046284, "grad_norm": 0.2518807649612427, "learning_rate": 1.0135293997485805e-06, "loss": 0.0894, "step": 6487 }, { "epoch": 4.549789621318373, "grad_norm": 0.2529963552951813, "learning_rate": 1.0104007416804422e-06, "loss": 0.0826, "step": 6488 }, { "epoch": 4.550490883590463, "grad_norm": 0.2838487923145294, "learning_rate": 1.0072768203998161e-06, "loss": 0.2519, "step": 6489 }, { "epoch": 4.551192145862553, "grad_norm": 0.2632748484611511, "learning_rate": 1.0041576365235395e-06, "loss": 0.0771, "step": 6490 }, { "epoch": 4.551893408134642, "grad_norm": 0.25999927520751953, "learning_rate": 1.0010431906674889e-06, "loss": 0.0817, "step": 6491 }, { "epoch": 4.552594670406732, "grad_norm": 0.2532215714454651, "learning_rate": 9.97933483446628e-07, "loss": 0.0898, "step": 6492 }, { "epoch": 4.553295932678822, "grad_norm": 0.3051576614379883, "learning_rate": 9.948285154749659e-07, "loss": 0.2437, "step": 6493 }, { "epoch": 4.553997194950911, "grad_norm": 0.2565939724445343, "learning_rate": 9.917282873655925e-07, "loss": 0.0886, "step": 6494 }, { "epoch": 4.554698457223001, "grad_norm": 0.26578328013420105, "learning_rate": 9.886327997306432e-07, "loss": 0.0781, "step": 6495 }, { "epoch": 4.5553997194950915, "grad_norm": 0.2555679380893707, "learning_rate": 9.855420531813319e-07, "loss": 0.0757, "step": 6496 }, { "epoch": 4.556100981767181, "grad_norm": 0.2532484829425812, "learning_rate": 9.824560483279315e-07, "loss": 0.0887, "step": 6497 }, { "epoch": 4.556802244039271, "grad_norm": 0.2514137625694275, "learning_rate": 9.793747857797826e-07, "loss": 0.09, "step": 6498 }, { "epoch": 4.5575035063113605, "grad_norm": 0.32197698950767517, "learning_rate": 9.762982661452786e-07, "loss": 0.2401, "step": 6499 }, { "epoch": 4.55820476858345, "grad_norm": 3.970062255859375, "learning_rate": 9.732264900318866e-07, "loss": 0.6561, "step": 6500 }, { "epoch": 4.55890603085554, "grad_norm": 0.3020341694355011, "learning_rate": 9.701594580461355e-07, "loss": 0.2465, "step": 6501 }, { "epoch": 4.5596072931276295, "grad_norm": 0.24322931468486786, "learning_rate": 9.670971707936156e-07, "loss": 0.0821, "step": 6502 }, { "epoch": 4.560308555399719, "grad_norm": 0.30539196729660034, "learning_rate": 9.640396288789822e-07, "loss": 0.2472, "step": 6503 }, { "epoch": 4.56100981767181, "grad_norm": 0.8019788265228271, "learning_rate": 9.60986832905947e-07, "loss": 0.4192, "step": 6504 }, { "epoch": 4.561711079943899, "grad_norm": 0.2533547878265381, "learning_rate": 9.579387834772969e-07, "loss": 0.083, "step": 6505 }, { "epoch": 4.562412342215989, "grad_norm": 0.2567501962184906, "learning_rate": 9.548954811948673e-07, "loss": 0.089, "step": 6506 }, { "epoch": 4.563113604488079, "grad_norm": 0.3032354414463043, "learning_rate": 9.518569266595695e-07, "loss": 0.2492, "step": 6507 }, { "epoch": 4.563814866760168, "grad_norm": 0.8128066062927246, "learning_rate": 9.488231204713676e-07, "loss": 0.4237, "step": 6508 }, { "epoch": 4.564516129032258, "grad_norm": 0.2562670111656189, "learning_rate": 9.457940632292939e-07, "loss": 0.0822, "step": 6509 }, { "epoch": 4.565217391304348, "grad_norm": 0.29032203555107117, "learning_rate": 9.427697555314363e-07, "loss": 0.2525, "step": 6510 }, { "epoch": 4.565918653576437, "grad_norm": 0.33134007453918457, "learning_rate": 9.39750197974959e-07, "loss": 0.244, "step": 6511 }, { "epoch": 4.566619915848527, "grad_norm": 0.3129400312900543, "learning_rate": 9.367353911560655e-07, "loss": 0.2418, "step": 6512 }, { "epoch": 4.567321178120617, "grad_norm": 0.25096002221107483, "learning_rate": 9.337253356700493e-07, "loss": 0.0898, "step": 6513 }, { "epoch": 4.568022440392707, "grad_norm": 0.8788493871688843, "learning_rate": 9.307200321112375e-07, "loss": 0.4294, "step": 6514 }, { "epoch": 4.568723702664797, "grad_norm": 0.2503940463066101, "learning_rate": 9.277194810730444e-07, "loss": 0.0898, "step": 6515 }, { "epoch": 4.5694249649368865, "grad_norm": 0.32190650701522827, "learning_rate": 9.247236831479266e-07, "loss": 0.2413, "step": 6516 }, { "epoch": 4.570126227208976, "grad_norm": 0.26574116945266724, "learning_rate": 9.217326389274139e-07, "loss": 0.0768, "step": 6517 }, { "epoch": 4.570827489481066, "grad_norm": 0.2682577669620514, "learning_rate": 9.187463490020864e-07, "loss": 0.0786, "step": 6518 }, { "epoch": 4.5715287517531555, "grad_norm": 0.2512907087802887, "learning_rate": 9.157648139615977e-07, "loss": 0.0901, "step": 6519 }, { "epoch": 4.572230014025245, "grad_norm": 0.2514925003051758, "learning_rate": 9.127880343946543e-07, "loss": 0.0893, "step": 6520 }, { "epoch": 4.572931276297335, "grad_norm": 0.25810176134109497, "learning_rate": 9.09816010889028e-07, "loss": 0.0743, "step": 6521 }, { "epoch": 4.573632538569425, "grad_norm": 0.2692939043045044, "learning_rate": 9.068487440315493e-07, "loss": 0.0782, "step": 6522 }, { "epoch": 4.574333800841515, "grad_norm": 0.25070682168006897, "learning_rate": 9.038862344081161e-07, "loss": 0.0898, "step": 6523 }, { "epoch": 4.575035063113605, "grad_norm": 0.25211483240127563, "learning_rate": 9.009284826036691e-07, "loss": 0.0905, "step": 6524 }, { "epoch": 4.575736325385694, "grad_norm": 0.2574620842933655, "learning_rate": 8.979754892022296e-07, "loss": 0.0892, "step": 6525 }, { "epoch": 4.576437587657784, "grad_norm": 0.2867315411567688, "learning_rate": 8.950272547868676e-07, "loss": 0.2531, "step": 6526 }, { "epoch": 4.577138849929874, "grad_norm": 0.3954198956489563, "learning_rate": 8.920837799397203e-07, "loss": 0.2575, "step": 6527 }, { "epoch": 4.577840112201963, "grad_norm": 0.2894914150238037, "learning_rate": 8.891450652419808e-07, "loss": 0.254, "step": 6528 }, { "epoch": 4.578541374474053, "grad_norm": 0.25435781478881836, "learning_rate": 8.862111112739019e-07, "loss": 0.0899, "step": 6529 }, { "epoch": 4.579242636746143, "grad_norm": 0.25238704681396484, "learning_rate": 8.832819186148006e-07, "loss": 0.0894, "step": 6530 }, { "epoch": 4.579943899018232, "grad_norm": 0.2626813054084778, "learning_rate": 8.803574878430504e-07, "loss": 0.0848, "step": 6531 }, { "epoch": 4.580645161290323, "grad_norm": 0.2486402988433838, "learning_rate": 8.77437819536081e-07, "loss": 0.0822, "step": 6532 }, { "epoch": 4.581346423562413, "grad_norm": 0.2507103681564331, "learning_rate": 8.745229142703898e-07, "loss": 0.0894, "step": 6533 }, { "epoch": 4.582047685834502, "grad_norm": 0.2907272279262543, "learning_rate": 8.716127726215273e-07, "loss": 0.072, "step": 6534 }, { "epoch": 4.582748948106592, "grad_norm": 0.25341588258743286, "learning_rate": 8.687073951641089e-07, "loss": 0.0888, "step": 6535 }, { "epoch": 4.583450210378682, "grad_norm": 0.2949685752391815, "learning_rate": 8.658067824718036e-07, "loss": 0.252, "step": 6536 }, { "epoch": 4.584151472650771, "grad_norm": 0.2530330717563629, "learning_rate": 8.629109351173504e-07, "loss": 0.0895, "step": 6537 }, { "epoch": 4.584852734922861, "grad_norm": 0.26022353768348694, "learning_rate": 8.600198536725279e-07, "loss": 0.0851, "step": 6538 }, { "epoch": 4.585553997194951, "grad_norm": 0.25104284286499023, "learning_rate": 8.571335387081908e-07, "loss": 0.0891, "step": 6539 }, { "epoch": 4.586255259467041, "grad_norm": 0.2804504930973053, "learning_rate": 8.542519907942442e-07, "loss": 0.2514, "step": 6540 }, { "epoch": 4.586956521739131, "grad_norm": 0.30528753995895386, "learning_rate": 8.51375210499658e-07, "loss": 0.247, "step": 6541 }, { "epoch": 4.5876577840112205, "grad_norm": 0.2519146800041199, "learning_rate": 8.485031983924557e-07, "loss": 0.0902, "step": 6542 }, { "epoch": 4.58835904628331, "grad_norm": 0.27544930577278137, "learning_rate": 8.456359550397224e-07, "loss": 0.0768, "step": 6543 }, { "epoch": 4.5890603085554, "grad_norm": 0.2501532733440399, "learning_rate": 8.427734810075999e-07, "loss": 0.089, "step": 6544 }, { "epoch": 4.5897615708274895, "grad_norm": 0.25980231165885925, "learning_rate": 8.399157768612831e-07, "loss": 0.085, "step": 6545 }, { "epoch": 4.590462833099579, "grad_norm": 0.39320817589759827, "learning_rate": 8.370628431650401e-07, "loss": 0.2632, "step": 6546 }, { "epoch": 4.591164095371669, "grad_norm": 0.29968196153640747, "learning_rate": 8.342146804821815e-07, "loss": 0.2449, "step": 6547 }, { "epoch": 4.5918653576437585, "grad_norm": 0.30345770716667175, "learning_rate": 8.313712893750824e-07, "loss": 0.2462, "step": 6548 }, { "epoch": 4.592566619915848, "grad_norm": 0.2890704870223999, "learning_rate": 8.285326704051772e-07, "loss": 0.0708, "step": 6549 }, { "epoch": 4.593267882187938, "grad_norm": 0.4019879102706909, "learning_rate": 8.256988241329533e-07, "loss": 0.2541, "step": 6550 }, { "epoch": 4.593969144460028, "grad_norm": 0.25054872035980225, "learning_rate": 8.228697511179634e-07, "loss": 0.0829, "step": 6551 }, { "epoch": 4.594670406732118, "grad_norm": 0.28540557622909546, "learning_rate": 8.200454519188078e-07, "loss": 0.2511, "step": 6552 }, { "epoch": 4.595371669004208, "grad_norm": 0.3878970146179199, "learning_rate": 8.172259270931488e-07, "loss": 0.2525, "step": 6553 }, { "epoch": 4.596072931276297, "grad_norm": 0.24796010553836823, "learning_rate": 8.144111771977103e-07, "loss": 0.0828, "step": 6554 }, { "epoch": 4.596774193548387, "grad_norm": 0.2969844341278076, "learning_rate": 8.116012027882675e-07, "loss": 0.2575, "step": 6555 }, { "epoch": 4.597475455820477, "grad_norm": 0.2508508861064911, "learning_rate": 8.087960044196541e-07, "loss": 0.0823, "step": 6556 }, { "epoch": 4.598176718092566, "grad_norm": 0.3022007942199707, "learning_rate": 8.05995582645766e-07, "loss": 0.2489, "step": 6557 }, { "epoch": 4.598877980364656, "grad_norm": 0.29305756092071533, "learning_rate": 8.031999380195443e-07, "loss": 0.2536, "step": 6558 }, { "epoch": 4.599579242636747, "grad_norm": 0.2942676544189453, "learning_rate": 8.004090710929973e-07, "loss": 0.2431, "step": 6559 }, { "epoch": 4.600280504908836, "grad_norm": 0.2847282886505127, "learning_rate": 7.976229824171872e-07, "loss": 0.2516, "step": 6560 }, { "epoch": 4.600981767180926, "grad_norm": 0.25178980827331543, "learning_rate": 7.948416725422292e-07, "loss": 0.0889, "step": 6561 }, { "epoch": 4.6016830294530155, "grad_norm": 0.30885928869247437, "learning_rate": 7.920651420173008e-07, "loss": 0.2472, "step": 6562 }, { "epoch": 4.602384291725105, "grad_norm": 0.25071677565574646, "learning_rate": 7.892933913906303e-07, "loss": 0.0889, "step": 6563 }, { "epoch": 4.603085553997195, "grad_norm": 0.26939627528190613, "learning_rate": 7.865264212095075e-07, "loss": 0.0778, "step": 6564 }, { "epoch": 4.6037868162692845, "grad_norm": 5.843258857727051, "learning_rate": 7.837642320202732e-07, "loss": 0.6275, "step": 6565 }, { "epoch": 4.604488078541374, "grad_norm": 0.819117546081543, "learning_rate": 7.810068243683244e-07, "loss": 0.4234, "step": 6566 }, { "epoch": 4.605189340813464, "grad_norm": 0.38563859462738037, "learning_rate": 7.782541987981201e-07, "loss": 0.243, "step": 6567 }, { "epoch": 4.6058906030855535, "grad_norm": 0.2500062584877014, "learning_rate": 7.7550635585317e-07, "loss": 0.09, "step": 6568 }, { "epoch": 4.606591865357644, "grad_norm": 0.25144344568252563, "learning_rate": 7.727632960760373e-07, "loss": 0.0903, "step": 6569 }, { "epoch": 4.607293127629734, "grad_norm": 0.25398969650268555, "learning_rate": 7.700250200083469e-07, "loss": 0.0823, "step": 6570 }, { "epoch": 4.607994389901823, "grad_norm": 2.978900194168091, "learning_rate": 7.672915281907778e-07, "loss": 0.6947, "step": 6571 }, { "epoch": 4.608695652173913, "grad_norm": 0.2522296607494354, "learning_rate": 7.64562821163059e-07, "loss": 0.0903, "step": 6572 }, { "epoch": 4.609396914446003, "grad_norm": 0.7952918410301208, "learning_rate": 7.618388994639763e-07, "loss": 0.4203, "step": 6573 }, { "epoch": 4.610098176718092, "grad_norm": 0.2503267228603363, "learning_rate": 7.591197636313801e-07, "loss": 0.0818, "step": 6574 }, { "epoch": 4.610799438990182, "grad_norm": 0.2970280647277832, "learning_rate": 7.564054142021604e-07, "loss": 0.2497, "step": 6575 }, { "epoch": 4.611500701262272, "grad_norm": 0.2909744679927826, "learning_rate": 7.536958517122799e-07, "loss": 0.2529, "step": 6576 }, { "epoch": 4.612201963534362, "grad_norm": 0.2534960210323334, "learning_rate": 7.509910766967332e-07, "loss": 0.0824, "step": 6577 }, { "epoch": 4.612903225806452, "grad_norm": 2.8605053424835205, "learning_rate": 7.482910896895984e-07, "loss": 0.3881, "step": 6578 }, { "epoch": 4.613604488078542, "grad_norm": 0.25169140100479126, "learning_rate": 7.455958912239797e-07, "loss": 0.0898, "step": 6579 }, { "epoch": 4.614305750350631, "grad_norm": 0.2532404661178589, "learning_rate": 7.429054818320541e-07, "loss": 0.0893, "step": 6580 }, { "epoch": 4.615007012622721, "grad_norm": 0.8300971984863281, "learning_rate": 7.40219862045044e-07, "loss": 0.4068, "step": 6581 }, { "epoch": 4.615708274894811, "grad_norm": 0.7916992902755737, "learning_rate": 7.375390323932363e-07, "loss": 0.4083, "step": 6582 }, { "epoch": 4.6164095371669, "grad_norm": 0.28167805075645447, "learning_rate": 7.348629934059575e-07, "loss": 0.0701, "step": 6583 }, { "epoch": 4.61711079943899, "grad_norm": 0.3067367970943451, "learning_rate": 7.321917456116018e-07, "loss": 0.2572, "step": 6584 }, { "epoch": 4.61781206171108, "grad_norm": 0.25506553053855896, "learning_rate": 7.295252895376109e-07, "loss": 0.0899, "step": 6585 }, { "epoch": 4.618513323983169, "grad_norm": 0.28942880034446716, "learning_rate": 7.268636257104805e-07, "loss": 0.2522, "step": 6586 }, { "epoch": 4.61921458625526, "grad_norm": 0.2542858421802521, "learning_rate": 7.242067546557568e-07, "loss": 0.0891, "step": 6587 }, { "epoch": 4.6199158485273495, "grad_norm": 0.3014567196369171, "learning_rate": 7.215546768980508e-07, "loss": 0.2527, "step": 6588 }, { "epoch": 4.620617110799439, "grad_norm": 0.25852733850479126, "learning_rate": 7.189073929610129e-07, "loss": 0.084, "step": 6589 }, { "epoch": 4.621318373071529, "grad_norm": 0.24570831656455994, "learning_rate": 7.162649033673585e-07, "loss": 0.0819, "step": 6590 }, { "epoch": 4.6220196353436185, "grad_norm": 0.2540488839149475, "learning_rate": 7.136272086388507e-07, "loss": 0.0826, "step": 6591 }, { "epoch": 4.622720897615708, "grad_norm": 0.2971332371234894, "learning_rate": 7.109943092963062e-07, "loss": 0.0708, "step": 6592 }, { "epoch": 4.623422159887798, "grad_norm": 0.3020614683628082, "learning_rate": 7.083662058595953e-07, "loss": 0.2488, "step": 6593 }, { "epoch": 4.6241234221598875, "grad_norm": 0.2520442306995392, "learning_rate": 7.057428988476417e-07, "loss": 0.0897, "step": 6594 }, { "epoch": 4.624824684431978, "grad_norm": 0.25725802779197693, "learning_rate": 7.031243887784228e-07, "loss": 0.0751, "step": 6595 }, { "epoch": 4.625525946704068, "grad_norm": 0.2970391511917114, "learning_rate": 7.005106761689667e-07, "loss": 0.2537, "step": 6596 }, { "epoch": 4.626227208976157, "grad_norm": 0.2596827745437622, "learning_rate": 6.979017615353551e-07, "loss": 0.0837, "step": 6597 }, { "epoch": 4.626928471248247, "grad_norm": 0.2504991292953491, "learning_rate": 6.952976453927257e-07, "loss": 0.0898, "step": 6598 }, { "epoch": 4.627629733520337, "grad_norm": 0.2550729513168335, "learning_rate": 6.926983282552618e-07, "loss": 0.0827, "step": 6599 }, { "epoch": 4.628330995792426, "grad_norm": 0.24979077279567719, "learning_rate": 6.901038106362029e-07, "loss": 0.0898, "step": 6600 }, { "epoch": 4.629032258064516, "grad_norm": 0.25957024097442627, "learning_rate": 6.875140930478474e-07, "loss": 0.0832, "step": 6601 }, { "epoch": 4.629733520336606, "grad_norm": 0.2582792639732361, "learning_rate": 6.849291760015281e-07, "loss": 0.0837, "step": 6602 }, { "epoch": 4.630434782608695, "grad_norm": 0.30305004119873047, "learning_rate": 6.823490600076532e-07, "loss": 0.2558, "step": 6603 }, { "epoch": 4.631136044880785, "grad_norm": 0.2529078722000122, "learning_rate": 6.797737455756653e-07, "loss": 0.0894, "step": 6604 }, { "epoch": 4.631837307152876, "grad_norm": 0.28262194991111755, "learning_rate": 6.772032332140633e-07, "loss": 0.0698, "step": 6605 }, { "epoch": 4.632538569424965, "grad_norm": 3.5795915126800537, "learning_rate": 6.746375234304048e-07, "loss": 0.4651, "step": 6606 }, { "epoch": 4.633239831697055, "grad_norm": 3.3967020511627197, "learning_rate": 6.720766167312876e-07, "loss": 0.593, "step": 6607 }, { "epoch": 4.6339410939691446, "grad_norm": 0.24696789681911469, "learning_rate": 6.695205136223681e-07, "loss": 0.0809, "step": 6608 }, { "epoch": 4.634642356241234, "grad_norm": 0.25014832615852356, "learning_rate": 6.669692146083567e-07, "loss": 0.09, "step": 6609 }, { "epoch": 4.635343618513324, "grad_norm": 0.31623977422714233, "learning_rate": 6.644227201930059e-07, "loss": 0.2436, "step": 6610 }, { "epoch": 4.6360448807854135, "grad_norm": 3.0106427669525146, "learning_rate": 6.618810308791329e-07, "loss": 0.411, "step": 6611 }, { "epoch": 4.636746143057503, "grad_norm": 0.2492159605026245, "learning_rate": 6.593441471685946e-07, "loss": 0.0826, "step": 6612 }, { "epoch": 4.637447405329594, "grad_norm": 0.3464606702327728, "learning_rate": 6.56812069562307e-07, "loss": 0.2305, "step": 6613 }, { "epoch": 4.638148667601683, "grad_norm": 0.288472056388855, "learning_rate": 6.54284798560223e-07, "loss": 0.2534, "step": 6614 }, { "epoch": 4.638849929873773, "grad_norm": 0.29354745149612427, "learning_rate": 6.517623346613654e-07, "loss": 0.2419, "step": 6615 }, { "epoch": 4.639551192145863, "grad_norm": 0.2508925497531891, "learning_rate": 6.492446783637945e-07, "loss": 0.0896, "step": 6616 }, { "epoch": 4.640252454417952, "grad_norm": 0.24747160077095032, "learning_rate": 6.467318301646319e-07, "loss": 0.0819, "step": 6617 }, { "epoch": 4.640953716690042, "grad_norm": 0.2543914318084717, "learning_rate": 6.442237905600363e-07, "loss": 0.0897, "step": 6618 }, { "epoch": 4.641654978962132, "grad_norm": 0.2720724642276764, "learning_rate": 6.417205600452336e-07, "loss": 0.0752, "step": 6619 }, { "epoch": 4.642356241234221, "grad_norm": 0.3073001205921173, "learning_rate": 6.392221391144815e-07, "loss": 0.2505, "step": 6620 }, { "epoch": 4.643057503506311, "grad_norm": 0.29072800278663635, "learning_rate": 6.367285282611018e-07, "loss": 0.2541, "step": 6621 }, { "epoch": 4.643758765778401, "grad_norm": 0.25366276502609253, "learning_rate": 6.342397279774621e-07, "loss": 0.0903, "step": 6622 }, { "epoch": 4.64446002805049, "grad_norm": 0.38471174240112305, "learning_rate": 6.317557387549799e-07, "loss": 0.2493, "step": 6623 }, { "epoch": 4.645161290322581, "grad_norm": 0.24797788262367249, "learning_rate": 6.292765610841216e-07, "loss": 0.0823, "step": 6624 }, { "epoch": 4.645862552594671, "grad_norm": 0.2465478926897049, "learning_rate": 6.268021954544096e-07, "loss": 0.0821, "step": 6625 }, { "epoch": 4.64656381486676, "grad_norm": 0.25438347458839417, "learning_rate": 6.243326423544083e-07, "loss": 0.089, "step": 6626 }, { "epoch": 4.64726507713885, "grad_norm": 0.25189855694770813, "learning_rate": 6.218679022717388e-07, "loss": 0.0902, "step": 6627 }, { "epoch": 4.64796633941094, "grad_norm": 0.3005567789077759, "learning_rate": 6.194079756930593e-07, "loss": 0.2478, "step": 6628 }, { "epoch": 4.648667601683029, "grad_norm": 0.30884459614753723, "learning_rate": 6.169528631040949e-07, "loss": 0.2495, "step": 6629 }, { "epoch": 4.649368863955119, "grad_norm": 0.37237557768821716, "learning_rate": 6.145025649896081e-07, "loss": 0.2457, "step": 6630 }, { "epoch": 4.650070126227209, "grad_norm": 0.253480464220047, "learning_rate": 6.12057081833417e-07, "loss": 0.0903, "step": 6631 }, { "epoch": 4.650771388499299, "grad_norm": 0.2510205805301666, "learning_rate": 6.09616414118383e-07, "loss": 0.0898, "step": 6632 }, { "epoch": 4.651472650771389, "grad_norm": 0.25410154461860657, "learning_rate": 6.071805623264259e-07, "loss": 0.0895, "step": 6633 }, { "epoch": 4.6521739130434785, "grad_norm": 3.6499860286712646, "learning_rate": 6.047495269385029e-07, "loss": 0.4569, "step": 6634 }, { "epoch": 4.652875175315568, "grad_norm": 0.30148711800575256, "learning_rate": 6.023233084346296e-07, "loss": 0.2538, "step": 6635 }, { "epoch": 4.653576437587658, "grad_norm": 0.28478437662124634, "learning_rate": 5.99901907293865e-07, "loss": 0.2527, "step": 6636 }, { "epoch": 4.6542776998597475, "grad_norm": 0.2600894570350647, "learning_rate": 5.974853239943179e-07, "loss": 0.085, "step": 6637 }, { "epoch": 4.654978962131837, "grad_norm": 0.2534273564815521, "learning_rate": 5.950735590131484e-07, "loss": 0.0895, "step": 6638 }, { "epoch": 4.655680224403927, "grad_norm": 0.8157161474227905, "learning_rate": 5.926666128265673e-07, "loss": 0.404, "step": 6639 }, { "epoch": 4.6563814866760165, "grad_norm": 0.24808385968208313, "learning_rate": 5.90264485909825e-07, "loss": 0.0824, "step": 6640 }, { "epoch": 4.657082748948106, "grad_norm": 0.2526392936706543, "learning_rate": 5.878671787372281e-07, "loss": 0.0897, "step": 6641 }, { "epoch": 4.657784011220197, "grad_norm": 0.2692171335220337, "learning_rate": 5.854746917821258e-07, "loss": 0.085, "step": 6642 }, { "epoch": 4.658485273492286, "grad_norm": 0.29353591799736023, "learning_rate": 5.830870255169263e-07, "loss": 0.2522, "step": 6643 }, { "epoch": 4.659186535764376, "grad_norm": 0.2586229741573334, "learning_rate": 5.807041804130664e-07, "loss": 0.0756, "step": 6644 }, { "epoch": 4.659887798036466, "grad_norm": 0.2546698749065399, "learning_rate": 5.783261569410559e-07, "loss": 0.0831, "step": 6645 }, { "epoch": 4.660589060308555, "grad_norm": 0.8110001087188721, "learning_rate": 5.759529555704273e-07, "loss": 0.3978, "step": 6646 }, { "epoch": 4.661290322580645, "grad_norm": 0.3004505932331085, "learning_rate": 5.735845767697839e-07, "loss": 0.2447, "step": 6647 }, { "epoch": 4.661991584852735, "grad_norm": 1.2885218858718872, "learning_rate": 5.712210210067626e-07, "loss": 0.5803, "step": 6648 }, { "epoch": 4.662692847124824, "grad_norm": 0.3714926242828369, "learning_rate": 5.688622887480482e-07, "loss": 0.2567, "step": 6649 }, { "epoch": 4.663394109396915, "grad_norm": 0.8312938213348389, "learning_rate": 5.665083804593824e-07, "loss": 0.4077, "step": 6650 }, { "epoch": 4.664095371669005, "grad_norm": 0.3131903409957886, "learning_rate": 5.641592966055403e-07, "loss": 0.2504, "step": 6651 }, { "epoch": 4.664796633941094, "grad_norm": 0.25147172808647156, "learning_rate": 5.618150376503595e-07, "loss": 0.0903, "step": 6652 }, { "epoch": 4.665497896213184, "grad_norm": 0.253920316696167, "learning_rate": 5.59475604056714e-07, "loss": 0.0889, "step": 6653 }, { "epoch": 4.666199158485274, "grad_norm": 0.31064078211784363, "learning_rate": 5.571409962865343e-07, "loss": 0.2543, "step": 6654 }, { "epoch": 4.666900420757363, "grad_norm": 0.25196388363838196, "learning_rate": 5.548112148007878e-07, "loss": 0.0903, "step": 6655 }, { "epoch": 4.667601683029453, "grad_norm": 0.25243282318115234, "learning_rate": 5.524862600594954e-07, "loss": 0.0891, "step": 6656 }, { "epoch": 4.6683029453015426, "grad_norm": 0.2875657081604004, "learning_rate": 5.501661325217205e-07, "loss": 0.2548, "step": 6657 }, { "epoch": 4.669004207573632, "grad_norm": 0.260322630405426, "learning_rate": 5.478508326455828e-07, "loss": 0.0844, "step": 6658 }, { "epoch": 4.669705469845722, "grad_norm": 0.25393614172935486, "learning_rate": 5.455403608882331e-07, "loss": 0.0893, "step": 6659 }, { "epoch": 4.670406732117812, "grad_norm": 0.24938374757766724, "learning_rate": 5.432347177058872e-07, "loss": 0.0814, "step": 6660 }, { "epoch": 4.671107994389902, "grad_norm": 0.2794840633869171, "learning_rate": 5.409339035537919e-07, "loss": 0.2511, "step": 6661 }, { "epoch": 4.671809256661992, "grad_norm": 0.25505781173706055, "learning_rate": 5.386379188862534e-07, "loss": 0.0829, "step": 6662 }, { "epoch": 4.672510518934081, "grad_norm": 0.27720046043395996, "learning_rate": 5.363467641566117e-07, "loss": 0.079, "step": 6663 }, { "epoch": 4.673211781206171, "grad_norm": 0.27086251974105835, "learning_rate": 5.340604398172605e-07, "loss": 0.0777, "step": 6664 }, { "epoch": 4.673913043478261, "grad_norm": 0.2921179234981537, "learning_rate": 5.317789463196388e-07, "loss": 0.2541, "step": 6665 }, { "epoch": 4.67461430575035, "grad_norm": 0.2574419379234314, "learning_rate": 5.295022841142333e-07, "loss": 0.0834, "step": 6666 }, { "epoch": 4.67531556802244, "grad_norm": 0.2597752809524536, "learning_rate": 5.272304536505707e-07, "loss": 0.0834, "step": 6667 }, { "epoch": 4.676016830294531, "grad_norm": 0.2512148320674896, "learning_rate": 5.249634553772337e-07, "loss": 0.0898, "step": 6668 }, { "epoch": 4.67671809256662, "grad_norm": 0.2566084563732147, "learning_rate": 5.22701289741842e-07, "loss": 0.0843, "step": 6669 }, { "epoch": 4.67741935483871, "grad_norm": 0.2911485433578491, "learning_rate": 5.204439571910636e-07, "loss": 0.256, "step": 6670 }, { "epoch": 4.6781206171108, "grad_norm": 0.2489541471004486, "learning_rate": 5.18191458170611e-07, "loss": 0.0891, "step": 6671 }, { "epoch": 4.678821879382889, "grad_norm": 0.3766954839229584, "learning_rate": 5.159437931252453e-07, "loss": 0.2574, "step": 6672 }, { "epoch": 4.679523141654979, "grad_norm": 0.31311070919036865, "learning_rate": 5.137009624987726e-07, "loss": 0.2406, "step": 6673 }, { "epoch": 4.680224403927069, "grad_norm": 0.3002491891384125, "learning_rate": 5.11462966734047e-07, "loss": 0.2467, "step": 6674 }, { "epoch": 4.680925666199158, "grad_norm": 0.2871038019657135, "learning_rate": 5.092298062729567e-07, "loss": 0.2532, "step": 6675 }, { "epoch": 4.681626928471248, "grad_norm": 0.25248944759368896, "learning_rate": 5.070014815564544e-07, "loss": 0.0897, "step": 6676 }, { "epoch": 4.682328190743338, "grad_norm": 0.25262773036956787, "learning_rate": 5.047779930245134e-07, "loss": 0.0895, "step": 6677 }, { "epoch": 4.683029453015427, "grad_norm": 0.24960914254188538, "learning_rate": 5.025593411161739e-07, "loss": 0.0899, "step": 6678 }, { "epoch": 4.683730715287518, "grad_norm": 0.25253933668136597, "learning_rate": 5.003455262695106e-07, "loss": 0.0894, "step": 6679 }, { "epoch": 4.6844319775596075, "grad_norm": 0.26716867089271545, "learning_rate": 4.981365489216433e-07, "loss": 0.0776, "step": 6680 }, { "epoch": 4.685133239831697, "grad_norm": 0.2940695285797119, "learning_rate": 4.959324095087398e-07, "loss": 0.2566, "step": 6681 }, { "epoch": 4.685834502103787, "grad_norm": 0.3321552574634552, "learning_rate": 4.937331084660129e-07, "loss": 0.2441, "step": 6682 }, { "epoch": 4.6865357643758765, "grad_norm": 0.2535405457019806, "learning_rate": 4.915386462277183e-07, "loss": 0.084, "step": 6683 }, { "epoch": 4.687237026647966, "grad_norm": 0.2538938820362091, "learning_rate": 4.893490232271508e-07, "loss": 0.0891, "step": 6684 }, { "epoch": 4.687938288920056, "grad_norm": 0.2557510733604431, "learning_rate": 4.871642398966592e-07, "loss": 0.0826, "step": 6685 }, { "epoch": 4.6886395511921455, "grad_norm": 0.24915549159049988, "learning_rate": 4.849842966676371e-07, "loss": 0.0817, "step": 6686 }, { "epoch": 4.689340813464236, "grad_norm": 0.38232171535491943, "learning_rate": 4.82809193970507e-07, "loss": 0.257, "step": 6687 }, { "epoch": 4.690042075736326, "grad_norm": 1.3262311220169067, "learning_rate": 4.806389322347588e-07, "loss": 0.5853, "step": 6688 }, { "epoch": 4.690743338008415, "grad_norm": 0.2515103816986084, "learning_rate": 4.784735118889077e-07, "loss": 0.0831, "step": 6689 }, { "epoch": 4.691444600280505, "grad_norm": 0.2499104142189026, "learning_rate": 4.763129333605176e-07, "loss": 0.0897, "step": 6690 }, { "epoch": 4.692145862552595, "grad_norm": 0.2526598572731018, "learning_rate": 4.7415719707620544e-07, "loss": 0.0837, "step": 6691 }, { "epoch": 4.692847124824684, "grad_norm": 0.24951112270355225, "learning_rate": 4.7200630346161955e-07, "loss": 0.0823, "step": 6692 }, { "epoch": 4.693548387096774, "grad_norm": 0.29501208662986755, "learning_rate": 4.698602529414592e-07, "loss": 0.2447, "step": 6693 }, { "epoch": 4.694249649368864, "grad_norm": 0.2533400356769562, "learning_rate": 4.6771904593946593e-07, "loss": 0.0901, "step": 6694 }, { "epoch": 4.694950911640953, "grad_norm": 1.3370434045791626, "learning_rate": 4.655826828784238e-07, "loss": 0.5782, "step": 6695 }, { "epoch": 4.695652173913043, "grad_norm": 0.8045379519462585, "learning_rate": 4.634511641801648e-07, "loss": 0.4129, "step": 6696 }, { "epoch": 4.696353436185134, "grad_norm": 0.30013737082481384, "learning_rate": 4.613244902655578e-07, "loss": 0.2494, "step": 6697 }, { "epoch": 4.697054698457223, "grad_norm": 0.2520592212677002, "learning_rate": 4.5920266155451695e-07, "loss": 0.0897, "step": 6698 }, { "epoch": 4.697755960729313, "grad_norm": 0.2539771795272827, "learning_rate": 4.5708567846600435e-07, "loss": 0.0901, "step": 6699 }, { "epoch": 4.698457223001403, "grad_norm": 0.2537994682788849, "learning_rate": 4.549735414180162e-07, "loss": 0.0831, "step": 6700 }, { "epoch": 4.699158485273492, "grad_norm": 0.2533799707889557, "learning_rate": 4.5286625082760493e-07, "loss": 0.09, "step": 6701 }, { "epoch": 4.699859747545582, "grad_norm": 0.24888396263122559, "learning_rate": 4.507638071108544e-07, "loss": 0.0826, "step": 6702 }, { "epoch": 4.7005610098176716, "grad_norm": 0.26163798570632935, "learning_rate": 4.48666210682902e-07, "loss": 0.0769, "step": 6703 }, { "epoch": 4.701262272089761, "grad_norm": 0.2901971936225891, "learning_rate": 4.4657346195791095e-07, "loss": 0.2541, "step": 6704 }, { "epoch": 4.701963534361852, "grad_norm": 0.30036047101020813, "learning_rate": 4.4448556134910613e-07, "loss": 0.2496, "step": 6705 }, { "epoch": 4.702664796633941, "grad_norm": 0.2841147184371948, "learning_rate": 4.4240250926874115e-07, "loss": 0.2533, "step": 6706 }, { "epoch": 4.703366058906031, "grad_norm": 0.2653639614582062, "learning_rate": 4.4032430612812593e-07, "loss": 0.0755, "step": 6707 }, { "epoch": 4.704067321178121, "grad_norm": 0.7966226935386658, "learning_rate": 4.382509523375988e-07, "loss": 0.4179, "step": 6708 }, { "epoch": 4.70476858345021, "grad_norm": 0.2900155186653137, "learning_rate": 4.361824483065491e-07, "loss": 0.0694, "step": 6709 }, { "epoch": 4.7054698457223, "grad_norm": 0.2569934129714966, "learning_rate": 4.341187944434083e-07, "loss": 0.084, "step": 6710 }, { "epoch": 4.70617110799439, "grad_norm": 0.25152158737182617, "learning_rate": 4.320599911556478e-07, "loss": 0.0898, "step": 6711 }, { "epoch": 4.706872370266479, "grad_norm": 0.2506415545940399, "learning_rate": 4.3000603884977564e-07, "loss": 0.0815, "step": 6712 }, { "epoch": 4.707573632538569, "grad_norm": 0.24948182702064514, "learning_rate": 4.2795693793135914e-07, "loss": 0.0823, "step": 6713 }, { "epoch": 4.708274894810659, "grad_norm": 0.24772478640079498, "learning_rate": 4.259126888049858e-07, "loss": 0.082, "step": 6714 }, { "epoch": 4.708976157082749, "grad_norm": 0.25141844153404236, "learning_rate": 4.238732918743049e-07, "loss": 0.0897, "step": 6715 }, { "epoch": 4.709677419354839, "grad_norm": 0.29433920979499817, "learning_rate": 4.218387475419916e-07, "loss": 0.2546, "step": 6716 }, { "epoch": 4.710378681626929, "grad_norm": 0.2501557767391205, "learning_rate": 4.1980905620978006e-07, "loss": 0.0902, "step": 6717 }, { "epoch": 4.711079943899018, "grad_norm": 0.7898280024528503, "learning_rate": 4.1778421827842473e-07, "loss": 0.4159, "step": 6718 }, { "epoch": 4.711781206171108, "grad_norm": 0.817695677280426, "learning_rate": 4.157642341477419e-07, "loss": 0.4106, "step": 6719 }, { "epoch": 4.712482468443198, "grad_norm": 0.2577083110809326, "learning_rate": 4.137491042165764e-07, "loss": 0.0812, "step": 6720 }, { "epoch": 4.713183730715287, "grad_norm": 0.32066503167152405, "learning_rate": 4.1173882888282114e-07, "loss": 0.2391, "step": 6721 }, { "epoch": 4.713884992987377, "grad_norm": 0.26777538657188416, "learning_rate": 4.097334085434029e-07, "loss": 0.0765, "step": 6722 }, { "epoch": 4.7145862552594675, "grad_norm": 0.2706166207790375, "learning_rate": 4.077328435943051e-07, "loss": 0.079, "step": 6723 }, { "epoch": 4.715287517531557, "grad_norm": 0.255165696144104, "learning_rate": 4.057371344305394e-07, "loss": 0.0831, "step": 6724 }, { "epoch": 4.715988779803647, "grad_norm": 0.2894384562969208, "learning_rate": 4.0374628144615735e-07, "loss": 0.2535, "step": 6725 }, { "epoch": 4.7166900420757365, "grad_norm": 0.2494085133075714, "learning_rate": 4.0176028503425835e-07, "loss": 0.0809, "step": 6726 }, { "epoch": 4.717391304347826, "grad_norm": 0.27153468132019043, "learning_rate": 3.9977914558698427e-07, "loss": 0.0782, "step": 6727 }, { "epoch": 4.718092566619916, "grad_norm": 0.2519690990447998, "learning_rate": 3.978028634955111e-07, "loss": 0.0896, "step": 6728 }, { "epoch": 4.7187938288920055, "grad_norm": 0.8881912231445312, "learning_rate": 3.9583143915006004e-07, "loss": 0.4123, "step": 6729 }, { "epoch": 4.719495091164095, "grad_norm": 0.29536327719688416, "learning_rate": 3.9386487293988914e-07, "loss": 0.2415, "step": 6730 }, { "epoch": 4.720196353436185, "grad_norm": 0.30109500885009766, "learning_rate": 3.919031652533101e-07, "loss": 0.2509, "step": 6731 }, { "epoch": 4.7208976157082745, "grad_norm": 0.28669777512550354, "learning_rate": 3.899463164776546e-07, "loss": 0.2518, "step": 6732 }, { "epoch": 4.721598877980365, "grad_norm": 0.25198692083358765, "learning_rate": 3.8799432699931083e-07, "loss": 0.0898, "step": 6733 }, { "epoch": 4.722300140252455, "grad_norm": 0.25725191831588745, "learning_rate": 3.860471972036983e-07, "loss": 0.0762, "step": 6734 }, { "epoch": 4.723001402524544, "grad_norm": 3.6494486331939697, "learning_rate": 3.8410492747528727e-07, "loss": 0.4628, "step": 6735 }, { "epoch": 4.723702664796634, "grad_norm": 0.2912884056568146, "learning_rate": 3.8216751819757926e-07, "loss": 0.2565, "step": 6736 }, { "epoch": 4.724403927068724, "grad_norm": 0.29134470224380493, "learning_rate": 3.802349697531182e-07, "loss": 0.2499, "step": 6737 }, { "epoch": 4.725105189340813, "grad_norm": 0.2518364191055298, "learning_rate": 3.783072825234907e-07, "loss": 0.089, "step": 6738 }, { "epoch": 4.725806451612903, "grad_norm": 0.2511918246746063, "learning_rate": 3.763844568893199e-07, "loss": 0.0897, "step": 6739 }, { "epoch": 4.726507713884993, "grad_norm": 3.949904203414917, "learning_rate": 3.744664932302744e-07, "loss": 0.4669, "step": 6740 }, { "epoch": 4.727208976157083, "grad_norm": 0.26610103249549866, "learning_rate": 3.725533919250568e-07, "loss": 0.0764, "step": 6741 }, { "epoch": 4.727910238429173, "grad_norm": 0.794940710067749, "learning_rate": 3.7064515335140935e-07, "loss": 0.4177, "step": 6742 }, { "epoch": 4.728611500701263, "grad_norm": 0.30029502511024475, "learning_rate": 3.687417778861252e-07, "loss": 0.2475, "step": 6743 }, { "epoch": 4.729312762973352, "grad_norm": 0.2844734787940979, "learning_rate": 3.6684326590502025e-07, "loss": 0.2516, "step": 6744 }, { "epoch": 4.730014025245442, "grad_norm": 0.2527453899383545, "learning_rate": 3.64949617782967e-07, "loss": 0.0884, "step": 6745 }, { "epoch": 4.730715287517532, "grad_norm": 0.24757523834705353, "learning_rate": 3.6306083389386347e-07, "loss": 0.0824, "step": 6746 }, { "epoch": 4.731416549789621, "grad_norm": 0.2570001780986786, "learning_rate": 3.6117691461065584e-07, "loss": 0.0836, "step": 6747 }, { "epoch": 4.732117812061711, "grad_norm": 0.8574639558792114, "learning_rate": 3.5929786030532707e-07, "loss": 0.4157, "step": 6748 }, { "epoch": 4.732819074333801, "grad_norm": 0.28860366344451904, "learning_rate": 3.574236713488971e-07, "loss": 0.2529, "step": 6749 }, { "epoch": 4.73352033660589, "grad_norm": 0.25289642810821533, "learning_rate": 3.555543481114337e-07, "loss": 0.0901, "step": 6750 }, { "epoch": 4.73422159887798, "grad_norm": 0.32385867834091187, "learning_rate": 3.5368989096203065e-07, "loss": 0.2403, "step": 6751 }, { "epoch": 4.73492286115007, "grad_norm": 0.2531503438949585, "learning_rate": 3.51830300268835e-07, "loss": 0.0898, "step": 6752 }, { "epoch": 4.73562412342216, "grad_norm": 0.29371556639671326, "learning_rate": 3.499755763990198e-07, "loss": 0.073, "step": 6753 }, { "epoch": 4.73632538569425, "grad_norm": 0.8725511431694031, "learning_rate": 3.481257197188087e-07, "loss": 0.4207, "step": 6754 }, { "epoch": 4.737026647966339, "grad_norm": 0.300352543592453, "learning_rate": 3.46280730593454e-07, "loss": 0.2453, "step": 6755 }, { "epoch": 4.737727910238429, "grad_norm": 0.2893063724040985, "learning_rate": 3.4444060938725877e-07, "loss": 0.2533, "step": 6756 }, { "epoch": 4.738429172510519, "grad_norm": 0.30353349447250366, "learning_rate": 3.4260535646355176e-07, "loss": 0.2482, "step": 6757 }, { "epoch": 4.739130434782608, "grad_norm": 0.29969701170921326, "learning_rate": 3.4077497218470976e-07, "loss": 0.246, "step": 6758 }, { "epoch": 4.739831697054698, "grad_norm": 0.2647351324558258, "learning_rate": 3.3894945691214653e-07, "loss": 0.0847, "step": 6759 }, { "epoch": 4.740532959326789, "grad_norm": 0.25311607122421265, "learning_rate": 3.371288110063098e-07, "loss": 0.0902, "step": 6760 }, { "epoch": 4.741234221598878, "grad_norm": 0.2539476752281189, "learning_rate": 3.353130348266925e-07, "loss": 0.0824, "step": 6761 }, { "epoch": 4.741935483870968, "grad_norm": 0.40953385829925537, "learning_rate": 3.3350212873181905e-07, "loss": 0.2519, "step": 6762 }, { "epoch": 4.742636746143058, "grad_norm": 0.2506941556930542, "learning_rate": 3.31696093079259e-07, "loss": 0.0896, "step": 6763 }, { "epoch": 4.743338008415147, "grad_norm": 4.790666103363037, "learning_rate": 3.2989492822561584e-07, "loss": 0.537, "step": 6764 }, { "epoch": 4.744039270687237, "grad_norm": 0.2739790678024292, "learning_rate": 3.28098634526533e-07, "loss": 0.0784, "step": 6765 }, { "epoch": 4.744740532959327, "grad_norm": 0.2516102194786072, "learning_rate": 3.263072123366934e-07, "loss": 0.0834, "step": 6766 }, { "epoch": 4.745441795231416, "grad_norm": 0.7979763746261597, "learning_rate": 3.2452066200981125e-07, "loss": 0.4166, "step": 6767 }, { "epoch": 4.746143057503506, "grad_norm": 0.3154265880584717, "learning_rate": 3.2273898389865167e-07, "loss": 0.2386, "step": 6768 }, { "epoch": 4.746844319775596, "grad_norm": 0.24811384081840515, "learning_rate": 3.209621783550026e-07, "loss": 0.0738, "step": 6769 }, { "epoch": 4.747545582047686, "grad_norm": 0.250165730714798, "learning_rate": 3.1919024572970303e-07, "loss": 0.0895, "step": 6770 }, { "epoch": 4.748246844319776, "grad_norm": 3.2048487663269043, "learning_rate": 3.174231863726201e-07, "loss": 0.4059, "step": 6771 }, { "epoch": 4.7489481065918655, "grad_norm": 0.37413740158081055, "learning_rate": 3.156610006326666e-07, "loss": 0.2298, "step": 6772 }, { "epoch": 4.749649368863955, "grad_norm": 0.2514837980270386, "learning_rate": 3.139036888577862e-07, "loss": 0.0837, "step": 6773 }, { "epoch": 4.750350631136045, "grad_norm": 0.2697930335998535, "learning_rate": 3.121512513949626e-07, "loss": 0.075, "step": 6774 }, { "epoch": 4.7510518934081345, "grad_norm": 0.26032403111457825, "learning_rate": 3.10403688590219e-07, "loss": 0.0833, "step": 6775 }, { "epoch": 4.751753155680224, "grad_norm": 0.25107017159461975, "learning_rate": 3.0866100078861535e-07, "loss": 0.0899, "step": 6776 }, { "epoch": 4.752454417952314, "grad_norm": 0.25425854325294495, "learning_rate": 3.0692318833424605e-07, "loss": 0.0896, "step": 6777 }, { "epoch": 4.753155680224404, "grad_norm": 0.26015210151672363, "learning_rate": 3.051902515702476e-07, "loss": 0.0894, "step": 6778 }, { "epoch": 4.753856942496494, "grad_norm": 0.27789321541786194, "learning_rate": 3.0346219083878803e-07, "loss": 0.0774, "step": 6779 }, { "epoch": 4.754558204768584, "grad_norm": 0.25126388669013977, "learning_rate": 3.017390064810832e-07, "loss": 0.0897, "step": 6780 }, { "epoch": 4.755259467040673, "grad_norm": 0.3268929719924927, "learning_rate": 3.0002069883736936e-07, "loss": 0.2496, "step": 6781 }, { "epoch": 4.755960729312763, "grad_norm": 0.2526334226131439, "learning_rate": 2.983072682469362e-07, "loss": 0.0896, "step": 6782 }, { "epoch": 4.756661991584853, "grad_norm": 0.41889655590057373, "learning_rate": 2.9659871504809924e-07, "loss": 0.2623, "step": 6783 }, { "epoch": 4.757363253856942, "grad_norm": 0.25206872820854187, "learning_rate": 2.948950395782191e-07, "loss": 0.0901, "step": 6784 }, { "epoch": 4.758064516129032, "grad_norm": 0.2685116231441498, "learning_rate": 2.9319624217368504e-07, "loss": 0.0681, "step": 6785 }, { "epoch": 4.758765778401122, "grad_norm": 0.2518468499183655, "learning_rate": 2.9150232316993153e-07, "loss": 0.0902, "step": 6786 }, { "epoch": 4.759467040673211, "grad_norm": 0.25521695613861084, "learning_rate": 2.898132829014244e-07, "loss": 0.0898, "step": 6787 }, { "epoch": 4.760168302945302, "grad_norm": 0.2773083746433258, "learning_rate": 2.881291217016663e-07, "loss": 0.0784, "step": 6788 }, { "epoch": 4.760869565217392, "grad_norm": 0.253465861082077, "learning_rate": 2.864498399031995e-07, "loss": 0.0903, "step": 6789 }, { "epoch": 4.761570827489481, "grad_norm": 0.2651306986808777, "learning_rate": 2.847754378376005e-07, "loss": 0.0866, "step": 6790 }, { "epoch": 4.762272089761571, "grad_norm": 0.2650688588619232, "learning_rate": 2.8310591583547984e-07, "loss": 0.0774, "step": 6791 }, { "epoch": 4.762973352033661, "grad_norm": 0.250828355550766, "learning_rate": 2.8144127422649325e-07, "loss": 0.0897, "step": 6792 }, { "epoch": 4.76367461430575, "grad_norm": 0.2559043765068054, "learning_rate": 2.797815133393222e-07, "loss": 0.0893, "step": 6793 }, { "epoch": 4.76437587657784, "grad_norm": 0.25190600752830505, "learning_rate": 2.7812663350169357e-07, "loss": 0.0905, "step": 6794 }, { "epoch": 4.76507713884993, "grad_norm": 0.2544984817504883, "learning_rate": 2.7647663504036246e-07, "loss": 0.0838, "step": 6795 }, { "epoch": 4.76577840112202, "grad_norm": 0.251681387424469, "learning_rate": 2.748315182811267e-07, "loss": 0.0892, "step": 6796 }, { "epoch": 4.76647966339411, "grad_norm": 0.2544425427913666, "learning_rate": 2.731912835488154e-07, "loss": 0.0902, "step": 6797 }, { "epoch": 4.767180925666199, "grad_norm": 0.26525506377220154, "learning_rate": 2.715559311672944e-07, "loss": 0.0751, "step": 6798 }, { "epoch": 4.767882187938289, "grad_norm": 0.252417653799057, "learning_rate": 2.6992546145947205e-07, "loss": 0.0822, "step": 6799 }, { "epoch": 4.768583450210379, "grad_norm": 0.7890278100967407, "learning_rate": 2.682998747472826e-07, "loss": 0.4103, "step": 6800 }, { "epoch": 4.769284712482468, "grad_norm": 0.2584119141101837, "learning_rate": 2.666791713517025e-07, "loss": 0.0892, "step": 6801 }, { "epoch": 4.769985974754558, "grad_norm": 0.2650962471961975, "learning_rate": 2.650633515927398e-07, "loss": 0.0769, "step": 6802 }, { "epoch": 4.770687237026648, "grad_norm": 0.24359244108200073, "learning_rate": 2.634524157894475e-07, "loss": 0.0819, "step": 6803 }, { "epoch": 4.771388499298737, "grad_norm": 0.8590158820152283, "learning_rate": 2.618463642599017e-07, "loss": 0.4103, "step": 6804 }, { "epoch": 4.772089761570827, "grad_norm": 0.2907174825668335, "learning_rate": 2.6024519732122386e-07, "loss": 0.2523, "step": 6805 }, { "epoch": 4.772791023842917, "grad_norm": 0.29884639382362366, "learning_rate": 2.5864891528956106e-07, "loss": 0.2569, "step": 6806 }, { "epoch": 4.773492286115007, "grad_norm": 0.29360243678092957, "learning_rate": 2.570575184801111e-07, "loss": 0.2548, "step": 6807 }, { "epoch": 4.774193548387097, "grad_norm": 0.25104233622550964, "learning_rate": 2.554710072070893e-07, "loss": 0.0898, "step": 6808 }, { "epoch": 4.774894810659187, "grad_norm": 0.7886288166046143, "learning_rate": 2.5388938178376174e-07, "loss": 0.4168, "step": 6809 }, { "epoch": 4.775596072931276, "grad_norm": 0.25315386056900024, "learning_rate": 2.523126425224176e-07, "loss": 0.0904, "step": 6810 }, { "epoch": 4.776297335203366, "grad_norm": 0.28199052810668945, "learning_rate": 2.507407897343911e-07, "loss": 0.0778, "step": 6811 }, { "epoch": 4.776998597475456, "grad_norm": 0.25095388293266296, "learning_rate": 2.491738237300423e-07, "loss": 0.0896, "step": 6812 }, { "epoch": 4.777699859747545, "grad_norm": 0.2996211647987366, "learning_rate": 2.4761174481877646e-07, "loss": 0.2494, "step": 6813 }, { "epoch": 4.778401122019635, "grad_norm": 0.30432888865470886, "learning_rate": 2.460545533090275e-07, "loss": 0.2574, "step": 6814 }, { "epoch": 4.7791023842917255, "grad_norm": 0.2522801458835602, "learning_rate": 2.4450224950826604e-07, "loss": 0.0838, "step": 6815 }, { "epoch": 4.779803646563815, "grad_norm": 0.2724560797214508, "learning_rate": 2.4295483372299143e-07, "loss": 0.077, "step": 6816 }, { "epoch": 4.780504908835905, "grad_norm": 0.32694295048713684, "learning_rate": 2.4141230625875087e-07, "loss": 0.2384, "step": 6817 }, { "epoch": 4.7812061711079945, "grad_norm": 0.2544596791267395, "learning_rate": 2.3987466742011466e-07, "loss": 0.0834, "step": 6818 }, { "epoch": 4.781907433380084, "grad_norm": 0.25198906660079956, "learning_rate": 2.3834191751069545e-07, "loss": 0.0822, "step": 6819 }, { "epoch": 4.782608695652174, "grad_norm": 0.2541670799255371, "learning_rate": 2.3681405683313452e-07, "loss": 0.0901, "step": 6820 }, { "epoch": 4.7833099579242635, "grad_norm": 0.2755362093448639, "learning_rate": 2.3529108568911274e-07, "loss": 0.0754, "step": 6821 }, { "epoch": 4.784011220196353, "grad_norm": 0.8123235702514648, "learning_rate": 2.3377300437934236e-07, "loss": 0.414, "step": 6822 }, { "epoch": 4.784712482468443, "grad_norm": 0.8006017804145813, "learning_rate": 2.322598132035697e-07, "loss": 0.4168, "step": 6823 }, { "epoch": 4.7854137447405325, "grad_norm": 0.25460824370384216, "learning_rate": 2.3075151246057803e-07, "loss": 0.0904, "step": 6824 }, { "epoch": 4.786115007012623, "grad_norm": 0.3162444233894348, "learning_rate": 2.292481024481874e-07, "loss": 0.2526, "step": 6825 }, { "epoch": 4.786816269284713, "grad_norm": 0.2750289738178253, "learning_rate": 2.2774958346324092e-07, "loss": 0.0682, "step": 6826 }, { "epoch": 4.787517531556802, "grad_norm": 0.2560591697692871, "learning_rate": 2.262559558016325e-07, "loss": 0.0842, "step": 6827 }, { "epoch": 4.788218793828892, "grad_norm": 0.2536691427230835, "learning_rate": 2.247672197582762e-07, "loss": 0.0834, "step": 6828 }, { "epoch": 4.788920056100982, "grad_norm": 0.2510017454624176, "learning_rate": 2.2328337562712576e-07, "loss": 0.0889, "step": 6829 }, { "epoch": 4.789621318373071, "grad_norm": 0.25346097350120544, "learning_rate": 2.2180442370116905e-07, "loss": 0.0828, "step": 6830 }, { "epoch": 4.790322580645161, "grad_norm": 0.2547023892402649, "learning_rate": 2.2033036427242804e-07, "loss": 0.0832, "step": 6831 }, { "epoch": 4.791023842917251, "grad_norm": 0.29087480902671814, "learning_rate": 2.1886119763195878e-07, "loss": 0.2543, "step": 6832 }, { "epoch": 4.791725105189341, "grad_norm": 0.25240594148635864, "learning_rate": 2.1739692406985146e-07, "loss": 0.0895, "step": 6833 }, { "epoch": 4.792426367461431, "grad_norm": 0.25449028611183167, "learning_rate": 2.1593754387522757e-07, "loss": 0.0901, "step": 6834 }, { "epoch": 4.793127629733521, "grad_norm": 0.25198835134506226, "learning_rate": 2.144830573362483e-07, "loss": 0.0901, "step": 6835 }, { "epoch": 4.79382889200561, "grad_norm": 0.25278985500335693, "learning_rate": 2.1303346474009778e-07, "loss": 0.09, "step": 6836 }, { "epoch": 4.7945301542777, "grad_norm": 0.25507596135139465, "learning_rate": 2.1158876637300539e-07, "loss": 0.0841, "step": 6837 }, { "epoch": 4.79523141654979, "grad_norm": 0.28886881470680237, "learning_rate": 2.101489625202263e-07, "loss": 0.251, "step": 6838 }, { "epoch": 4.795932678821879, "grad_norm": 0.8066019415855408, "learning_rate": 2.0871405346605532e-07, "loss": 0.4095, "step": 6839 }, { "epoch": 4.796633941093969, "grad_norm": 0.29467013478279114, "learning_rate": 2.0728403949381582e-07, "loss": 0.2547, "step": 6840 }, { "epoch": 4.797335203366059, "grad_norm": 0.28767555952072144, "learning_rate": 2.05858920885868e-07, "loss": 0.2526, "step": 6841 }, { "epoch": 4.798036465638148, "grad_norm": 0.2528457045555115, "learning_rate": 2.044386979236035e-07, "loss": 0.0904, "step": 6842 }, { "epoch": 4.798737727910239, "grad_norm": 0.2654290199279785, "learning_rate": 2.0302337088744515e-07, "loss": 0.0861, "step": 6843 }, { "epoch": 4.7994389901823284, "grad_norm": 0.251956045627594, "learning_rate": 2.0161294005685284e-07, "loss": 0.0821, "step": 6844 }, { "epoch": 4.800140252454418, "grad_norm": 3.560446262359619, "learning_rate": 2.0020740571032037e-07, "loss": 0.4752, "step": 6845 }, { "epoch": 4.800841514726508, "grad_norm": 0.25921547412872314, "learning_rate": 1.988067681253758e-07, "loss": 0.0772, "step": 6846 }, { "epoch": 4.801542776998597, "grad_norm": 0.25120943784713745, "learning_rate": 1.9741102757856732e-07, "loss": 0.0902, "step": 6847 }, { "epoch": 4.802244039270687, "grad_norm": 0.250776469707489, "learning_rate": 1.9602018434549673e-07, "loss": 0.0901, "step": 6848 }, { "epoch": 4.802945301542777, "grad_norm": 0.249177023768425, "learning_rate": 1.94634238700786e-07, "loss": 0.089, "step": 6849 }, { "epoch": 4.803646563814866, "grad_norm": 0.2501850724220276, "learning_rate": 1.9325319091808847e-07, "loss": 0.0885, "step": 6850 }, { "epoch": 4.804347826086957, "grad_norm": 0.29677149653434753, "learning_rate": 1.918770412700943e-07, "loss": 0.0711, "step": 6851 }, { "epoch": 4.805049088359047, "grad_norm": 0.2563217580318451, "learning_rate": 1.9050579002853342e-07, "loss": 0.0847, "step": 6852 }, { "epoch": 4.805750350631136, "grad_norm": 0.24858775734901428, "learning_rate": 1.8913943746415307e-07, "loss": 0.082, "step": 6853 }, { "epoch": 4.806451612903226, "grad_norm": 0.2542248070240021, "learning_rate": 1.8777798384675138e-07, "loss": 0.0897, "step": 6854 }, { "epoch": 4.807152875175316, "grad_norm": 0.3193495273590088, "learning_rate": 1.864214294451383e-07, "loss": 0.2368, "step": 6855 }, { "epoch": 4.807854137447405, "grad_norm": 0.25379762053489685, "learning_rate": 1.850697745271801e-07, "loss": 0.0757, "step": 6856 }, { "epoch": 4.808555399719495, "grad_norm": 0.30132654309272766, "learning_rate": 1.83723019359755e-07, "loss": 0.2478, "step": 6857 }, { "epoch": 4.809256661991585, "grad_norm": 0.2569118142127991, "learning_rate": 1.823811642087836e-07, "loss": 0.0839, "step": 6858 }, { "epoch": 4.809957924263674, "grad_norm": 0.2507919669151306, "learning_rate": 1.8104420933921783e-07, "loss": 0.0901, "step": 6859 }, { "epoch": 4.810659186535764, "grad_norm": 0.25146958231925964, "learning_rate": 1.797121550150438e-07, "loss": 0.0901, "step": 6860 }, { "epoch": 4.8113604488078545, "grad_norm": 0.2509458363056183, "learning_rate": 1.783850014992733e-07, "loss": 0.0891, "step": 6861 }, { "epoch": 4.812061711079944, "grad_norm": 0.3076971471309662, "learning_rate": 1.770627490539606e-07, "loss": 0.2506, "step": 6862 }, { "epoch": 4.812762973352034, "grad_norm": 0.299591600894928, "learning_rate": 1.7574539794018574e-07, "loss": 0.2461, "step": 6863 }, { "epoch": 4.8134642356241235, "grad_norm": 0.858462393283844, "learning_rate": 1.7443294841805735e-07, "loss": 0.4234, "step": 6864 }, { "epoch": 4.814165497896213, "grad_norm": 0.29927635192871094, "learning_rate": 1.7312540074672646e-07, "loss": 0.2477, "step": 6865 }, { "epoch": 4.814866760168303, "grad_norm": 0.25374120473861694, "learning_rate": 1.7182275518436707e-07, "loss": 0.0885, "step": 6866 }, { "epoch": 4.8155680224403925, "grad_norm": 0.304004430770874, "learning_rate": 1.7052501198818737e-07, "loss": 0.2458, "step": 6867 }, { "epoch": 4.816269284712482, "grad_norm": 1.3112285137176514, "learning_rate": 1.6923217141443514e-07, "loss": 0.5764, "step": 6868 }, { "epoch": 4.816970546984573, "grad_norm": 0.2508065402507782, "learning_rate": 1.6794423371837564e-07, "loss": 0.0894, "step": 6869 }, { "epoch": 4.817671809256662, "grad_norm": 0.4031941890716553, "learning_rate": 1.6666119915432487e-07, "loss": 0.2567, "step": 6870 }, { "epoch": 4.818373071528752, "grad_norm": 0.3185243010520935, "learning_rate": 1.6538306797561077e-07, "loss": 0.2499, "step": 6871 }, { "epoch": 4.819074333800842, "grad_norm": 0.25231558084487915, "learning_rate": 1.6410984043460642e-07, "loss": 0.0898, "step": 6872 }, { "epoch": 4.819775596072931, "grad_norm": 0.25166603922843933, "learning_rate": 1.6284151678271076e-07, "loss": 0.0901, "step": 6873 }, { "epoch": 4.820476858345021, "grad_norm": 0.3857928216457367, "learning_rate": 1.6157809727035955e-07, "loss": 0.2501, "step": 6874 }, { "epoch": 4.821178120617111, "grad_norm": 0.25663450360298157, "learning_rate": 1.6031958214701714e-07, "loss": 0.0882, "step": 6875 }, { "epoch": 4.8218793828892, "grad_norm": 0.3081841766834259, "learning_rate": 1.5906597166117643e-07, "loss": 0.2476, "step": 6876 }, { "epoch": 4.82258064516129, "grad_norm": 0.25253039598464966, "learning_rate": 1.5781726606036717e-07, "loss": 0.0901, "step": 6877 }, { "epoch": 4.82328190743338, "grad_norm": 0.29653266072273254, "learning_rate": 1.565734655911477e-07, "loss": 0.2454, "step": 6878 }, { "epoch": 4.823983169705469, "grad_norm": 0.25944605469703674, "learning_rate": 1.5533457049910772e-07, "loss": 0.0832, "step": 6879 }, { "epoch": 4.82468443197756, "grad_norm": 0.25103268027305603, "learning_rate": 1.5410058102887093e-07, "loss": 0.0899, "step": 6880 }, { "epoch": 4.82538569424965, "grad_norm": 0.31651970744132996, "learning_rate": 1.5287149742408967e-07, "loss": 0.2504, "step": 6881 }, { "epoch": 4.826086956521739, "grad_norm": 0.2508023679256439, "learning_rate": 1.5164731992745041e-07, "loss": 0.0892, "step": 6882 }, { "epoch": 4.826788218793829, "grad_norm": 0.2950085699558258, "learning_rate": 1.5042804878066529e-07, "loss": 0.2569, "step": 6883 }, { "epoch": 4.827489481065919, "grad_norm": 0.2576238811016083, "learning_rate": 1.4921368422448344e-07, "loss": 0.083, "step": 6884 }, { "epoch": 4.828190743338008, "grad_norm": 0.2550368010997772, "learning_rate": 1.4800422649868528e-07, "loss": 0.0826, "step": 6885 }, { "epoch": 4.828892005610098, "grad_norm": 0.2960566580295563, "learning_rate": 1.46799675842077e-07, "loss": 0.2554, "step": 6886 }, { "epoch": 4.829593267882188, "grad_norm": 0.29636305570602417, "learning_rate": 1.456000324925044e-07, "loss": 0.2444, "step": 6887 }, { "epoch": 4.830294530154278, "grad_norm": 0.2947922646999359, "learning_rate": 1.444052966868309e-07, "loss": 0.2553, "step": 6888 }, { "epoch": 4.830995792426368, "grad_norm": 0.26872673630714417, "learning_rate": 1.432154686609649e-07, "loss": 0.0778, "step": 6889 }, { "epoch": 4.8316970546984574, "grad_norm": 0.24900256097316742, "learning_rate": 1.4203054864984078e-07, "loss": 0.0824, "step": 6890 }, { "epoch": 4.832398316970547, "grad_norm": 0.25881466269493103, "learning_rate": 1.4085053688742134e-07, "loss": 0.0822, "step": 6891 }, { "epoch": 4.833099579242637, "grad_norm": 0.2510226368904114, "learning_rate": 1.3967543360669798e-07, "loss": 0.0831, "step": 6892 }, { "epoch": 4.833800841514726, "grad_norm": 0.2615607976913452, "learning_rate": 1.385052390397018e-07, "loss": 0.076, "step": 6893 }, { "epoch": 4.834502103786816, "grad_norm": 0.29864129424095154, "learning_rate": 1.3733995341748962e-07, "loss": 0.2562, "step": 6894 }, { "epoch": 4.835203366058906, "grad_norm": 0.28349730372428894, "learning_rate": 1.3617957697014682e-07, "loss": 0.2535, "step": 6895 }, { "epoch": 4.835904628330995, "grad_norm": 0.2570153474807739, "learning_rate": 1.3502410992679293e-07, "loss": 0.0905, "step": 6896 }, { "epoch": 4.836605890603085, "grad_norm": 0.26681530475616455, "learning_rate": 1.3387355251557875e-07, "loss": 0.0764, "step": 6897 }, { "epoch": 4.837307152875176, "grad_norm": 0.28465428948402405, "learning_rate": 1.3272790496367816e-07, "loss": 0.2526, "step": 6898 }, { "epoch": 4.838008415147265, "grad_norm": 0.2876480519771576, "learning_rate": 1.3158716749730739e-07, "loss": 0.2528, "step": 6899 }, { "epoch": 4.838709677419355, "grad_norm": 0.29412198066711426, "learning_rate": 1.304513403417029e-07, "loss": 0.2531, "step": 6900 }, { "epoch": 4.839410939691445, "grad_norm": 4.334758281707764, "learning_rate": 1.2932042372114095e-07, "loss": 0.485, "step": 6901 }, { "epoch": 4.840112201963534, "grad_norm": 0.3121888041496277, "learning_rate": 1.2819441785891507e-07, "loss": 0.2395, "step": 6902 }, { "epoch": 4.840813464235624, "grad_norm": 0.25487542152404785, "learning_rate": 1.2707332297736407e-07, "loss": 0.0829, "step": 6903 }, { "epoch": 4.841514726507714, "grad_norm": 0.2612256705760956, "learning_rate": 1.2595713929784426e-07, "loss": 0.084, "step": 6904 }, { "epoch": 4.842215988779803, "grad_norm": 0.25951775908470154, "learning_rate": 1.248458670407543e-07, "loss": 0.0763, "step": 6905 }, { "epoch": 4.842917251051894, "grad_norm": 0.297167032957077, "learning_rate": 1.2373950642551312e-07, "loss": 0.2559, "step": 6906 }, { "epoch": 4.8436185133239835, "grad_norm": 0.2874661684036255, "learning_rate": 1.22638057670571e-07, "loss": 0.2538, "step": 6907 }, { "epoch": 4.844319775596073, "grad_norm": 0.25040343403816223, "learning_rate": 1.2154152099341509e-07, "loss": 0.0887, "step": 6908 }, { "epoch": 4.845021037868163, "grad_norm": 0.3056314289569855, "learning_rate": 1.2044989661056106e-07, "loss": 0.0621, "step": 6909 }, { "epoch": 4.8457223001402525, "grad_norm": 0.27431222796440125, "learning_rate": 1.1936318473754204e-07, "loss": 0.246, "step": 6910 }, { "epoch": 4.846423562412342, "grad_norm": 0.2565281093120575, "learning_rate": 1.1828138558894197e-07, "loss": 0.084, "step": 6911 }, { "epoch": 4.847124824684432, "grad_norm": 0.8132891058921814, "learning_rate": 1.1720449937835665e-07, "loss": 0.4041, "step": 6912 }, { "epoch": 4.8478260869565215, "grad_norm": 0.7888410687446594, "learning_rate": 1.161325263184243e-07, "loss": 0.4171, "step": 6913 }, { "epoch": 4.848527349228611, "grad_norm": 0.25442615151405334, "learning_rate": 1.1506546662080342e-07, "loss": 0.0903, "step": 6914 }, { "epoch": 4.849228611500701, "grad_norm": 0.2580665946006775, "learning_rate": 1.1400332049618933e-07, "loss": 0.0839, "step": 6915 }, { "epoch": 4.849929873772791, "grad_norm": 0.2919149100780487, "learning_rate": 1.129460881543004e-07, "loss": 0.2533, "step": 6916 }, { "epoch": 4.850631136044881, "grad_norm": 0.25609567761421204, "learning_rate": 1.1189376980389743e-07, "loss": 0.0742, "step": 6917 }, { "epoch": 4.851332398316971, "grad_norm": 0.25408557057380676, "learning_rate": 1.1084636565275308e-07, "loss": 0.0912, "step": 6918 }, { "epoch": 4.85203366058906, "grad_norm": 0.30071693658828735, "learning_rate": 1.0980387590768804e-07, "loss": 0.246, "step": 6919 }, { "epoch": 4.85273492286115, "grad_norm": 0.3911550045013428, "learning_rate": 1.0876630077453487e-07, "loss": 0.2584, "step": 6920 }, { "epoch": 4.85343618513324, "grad_norm": 0.2640300393104553, "learning_rate": 1.0773364045817135e-07, "loss": 0.0757, "step": 6921 }, { "epoch": 4.854137447405329, "grad_norm": 0.2538436949253082, "learning_rate": 1.0670589516249552e-07, "loss": 0.0751, "step": 6922 }, { "epoch": 4.854838709677419, "grad_norm": 0.2536250054836273, "learning_rate": 1.0568306509043668e-07, "loss": 0.082, "step": 6923 }, { "epoch": 4.85553997194951, "grad_norm": 0.38978826999664307, "learning_rate": 1.0466515044395553e-07, "loss": 0.2618, "step": 6924 }, { "epoch": 4.856241234221599, "grad_norm": 0.32347995042800903, "learning_rate": 1.0365215142404128e-07, "loss": 0.2398, "step": 6925 }, { "epoch": 4.856942496493689, "grad_norm": 0.2932126522064209, "learning_rate": 1.0264406823071171e-07, "loss": 0.2449, "step": 6926 }, { "epoch": 4.857643758765779, "grad_norm": 0.2556931972503662, "learning_rate": 1.0164090106301316e-07, "loss": 0.0898, "step": 6927 }, { "epoch": 4.858345021037868, "grad_norm": 0.2510680854320526, "learning_rate": 1.006426501190233e-07, "loss": 0.0898, "step": 6928 }, { "epoch": 4.859046283309958, "grad_norm": 0.2515217661857605, "learning_rate": 9.964931559585389e-08, "loss": 0.0899, "step": 6929 }, { "epoch": 4.859747545582048, "grad_norm": 0.251761257648468, "learning_rate": 9.866089768963138e-08, "loss": 0.0893, "step": 6930 }, { "epoch": 4.860448807854137, "grad_norm": 0.25076383352279663, "learning_rate": 9.767739659552744e-08, "loss": 0.0904, "step": 6931 }, { "epoch": 4.861150070126227, "grad_norm": 0.3034220337867737, "learning_rate": 9.669881250773117e-08, "loss": 0.2465, "step": 6932 }, { "epoch": 4.861851332398317, "grad_norm": 0.30168673396110535, "learning_rate": 9.572514561947133e-08, "loss": 0.254, "step": 6933 }, { "epoch": 4.862552594670406, "grad_norm": 0.2523575723171234, "learning_rate": 9.475639612299691e-08, "loss": 0.0902, "step": 6934 }, { "epoch": 4.863253856942497, "grad_norm": 0.2566322088241577, "learning_rate": 9.379256420958826e-08, "loss": 0.0841, "step": 6935 }, { "epoch": 4.8639551192145865, "grad_norm": 0.2510819733142853, "learning_rate": 9.283365006955702e-08, "loss": 0.0894, "step": 6936 }, { "epoch": 4.864656381486676, "grad_norm": 0.3227691054344177, "learning_rate": 9.18796538922434e-08, "loss": 0.2426, "step": 6937 }, { "epoch": 4.865357643758766, "grad_norm": 0.2527318000793457, "learning_rate": 9.09305758660134e-08, "loss": 0.0888, "step": 6938 }, { "epoch": 4.8660589060308554, "grad_norm": 3.324671506881714, "learning_rate": 8.998641617826431e-08, "loss": 0.4379, "step": 6939 }, { "epoch": 4.866760168302945, "grad_norm": 0.2526748478412628, "learning_rate": 8.904717501542204e-08, "loss": 0.0821, "step": 6940 }, { "epoch": 4.867461430575035, "grad_norm": 0.25316891074180603, "learning_rate": 8.811285256294377e-08, "loss": 0.0906, "step": 6941 }, { "epoch": 4.868162692847124, "grad_norm": 0.26093629002571106, "learning_rate": 8.71834490053125e-08, "loss": 0.0832, "step": 6942 }, { "epoch": 4.868863955119215, "grad_norm": 0.255289763212204, "learning_rate": 8.6258964526037e-08, "loss": 0.0885, "step": 6943 }, { "epoch": 4.869565217391305, "grad_norm": 0.268350213766098, "learning_rate": 8.533939930766288e-08, "loss": 0.0753, "step": 6944 }, { "epoch": 4.870266479663394, "grad_norm": 0.2651832103729248, "learning_rate": 8.442475353175606e-08, "loss": 0.077, "step": 6945 }, { "epoch": 4.870967741935484, "grad_norm": 0.2586659789085388, "learning_rate": 8.351502737891648e-08, "loss": 0.0828, "step": 6946 }, { "epoch": 4.871669004207574, "grad_norm": 0.869890570640564, "learning_rate": 8.26102210287727e-08, "loss": 0.4077, "step": 6947 }, { "epoch": 4.872370266479663, "grad_norm": 0.31139254570007324, "learning_rate": 8.171033465997901e-08, "loss": 0.2597, "step": 6948 }, { "epoch": 4.873071528751753, "grad_norm": 0.30135923624038696, "learning_rate": 8.081536845021831e-08, "loss": 0.2499, "step": 6949 }, { "epoch": 4.873772791023843, "grad_norm": 0.8617687225341797, "learning_rate": 7.992532257620478e-08, "loss": 0.4267, "step": 6950 }, { "epoch": 4.874474053295932, "grad_norm": 0.2818521559238434, "learning_rate": 7.904019721367839e-08, "loss": 0.0691, "step": 6951 }, { "epoch": 4.875175315568022, "grad_norm": 0.25470808148384094, "learning_rate": 7.815999253741046e-08, "loss": 0.0892, "step": 6952 }, { "epoch": 4.8758765778401125, "grad_norm": 0.8034953474998474, "learning_rate": 7.72847087211953e-08, "loss": 0.4132, "step": 6953 }, { "epoch": 4.876577840112202, "grad_norm": 0.32252922654151917, "learning_rate": 7.641434593786412e-08, "loss": 0.2551, "step": 6954 }, { "epoch": 4.877279102384292, "grad_norm": 0.2700931131839752, "learning_rate": 7.554890435926832e-08, "loss": 0.0777, "step": 6955 }, { "epoch": 4.8779803646563815, "grad_norm": 3.290972948074341, "learning_rate": 7.468838415629065e-08, "loss": 0.4435, "step": 6956 }, { "epoch": 4.878681626928471, "grad_norm": 0.8029117584228516, "learning_rate": 7.38327854988452e-08, "loss": 0.4177, "step": 6957 }, { "epoch": 4.879382889200561, "grad_norm": 0.25015294551849365, "learning_rate": 7.298210855586907e-08, "loss": 0.0829, "step": 6958 }, { "epoch": 4.8800841514726505, "grad_norm": 0.25249984860420227, "learning_rate": 7.213635349532788e-08, "loss": 0.0901, "step": 6959 }, { "epoch": 4.88078541374474, "grad_norm": 0.3260496258735657, "learning_rate": 7.129552048422417e-08, "loss": 0.2426, "step": 6960 }, { "epoch": 4.881486676016831, "grad_norm": 0.29204732179641724, "learning_rate": 7.045960968857235e-08, "loss": 0.2523, "step": 6961 }, { "epoch": 4.88218793828892, "grad_norm": 0.25153619050979614, "learning_rate": 6.962862127343206e-08, "loss": 0.0903, "step": 6962 }, { "epoch": 4.88288920056101, "grad_norm": 0.31098511815071106, "learning_rate": 6.880255540288039e-08, "loss": 0.2464, "step": 6963 }, { "epoch": 4.8835904628331, "grad_norm": 0.25644651055336, "learning_rate": 6.798141224002574e-08, "loss": 0.0839, "step": 6964 }, { "epoch": 4.884291725105189, "grad_norm": 0.25149959325790405, "learning_rate": 6.716519194700511e-08, "loss": 0.0894, "step": 6965 }, { "epoch": 4.884992987377279, "grad_norm": 0.25649896264076233, "learning_rate": 6.635389468498399e-08, "loss": 0.0841, "step": 6966 }, { "epoch": 4.885694249649369, "grad_norm": 0.2865700125694275, "learning_rate": 6.554752061414815e-08, "loss": 0.2517, "step": 6967 }, { "epoch": 4.886395511921458, "grad_norm": 2.634291410446167, "learning_rate": 6.474606989372579e-08, "loss": 0.3739, "step": 6968 }, { "epoch": 4.887096774193548, "grad_norm": 0.2558688223361969, "learning_rate": 6.394954268195697e-08, "loss": 0.083, "step": 6969 }, { "epoch": 4.887798036465638, "grad_norm": 0.2547205686569214, "learning_rate": 6.315793913612422e-08, "loss": 0.0819, "step": 6970 }, { "epoch": 4.888499298737728, "grad_norm": 0.38316625356674194, "learning_rate": 6.237125941252475e-08, "loss": 0.2468, "step": 6971 }, { "epoch": 4.889200561009818, "grad_norm": 0.25941002368927, "learning_rate": 6.158950366649541e-08, "loss": 0.0828, "step": 6972 }, { "epoch": 4.889901823281908, "grad_norm": 0.25261375308036804, "learning_rate": 6.08126720523905e-08, "loss": 0.0824, "step": 6973 }, { "epoch": 4.890603085553997, "grad_norm": 0.2472202032804489, "learning_rate": 6.004076472360121e-08, "loss": 0.0818, "step": 6974 }, { "epoch": 4.891304347826087, "grad_norm": 0.29662472009658813, "learning_rate": 5.9273781832538954e-08, "loss": 0.2452, "step": 6975 }, { "epoch": 4.892005610098177, "grad_norm": 0.2545974552631378, "learning_rate": 5.851172353064649e-08, "loss": 0.0887, "step": 6976 }, { "epoch": 4.892706872370266, "grad_norm": 0.251224160194397, "learning_rate": 5.7754589968392335e-08, "loss": 0.0896, "step": 6977 }, { "epoch": 4.893408134642356, "grad_norm": 0.2494242638349533, "learning_rate": 5.700238129527635e-08, "loss": 0.0889, "step": 6978 }, { "epoch": 4.8941093969144465, "grad_norm": 0.38764718174934387, "learning_rate": 5.625509765982417e-08, "loss": 0.26, "step": 6979 }, { "epoch": 4.894810659186536, "grad_norm": 0.2558780610561371, "learning_rate": 5.551273920958444e-08, "loss": 0.0838, "step": 6980 }, { "epoch": 4.895511921458626, "grad_norm": 0.2964930832386017, "learning_rate": 5.477530609113712e-08, "loss": 0.2443, "step": 6981 }, { "epoch": 4.8962131837307155, "grad_norm": 0.7991088628768921, "learning_rate": 5.404279845009353e-08, "loss": 0.4189, "step": 6982 }, { "epoch": 4.896914446002805, "grad_norm": 0.2562117278575897, "learning_rate": 5.331521643108517e-08, "loss": 0.0886, "step": 6983 }, { "epoch": 4.897615708274895, "grad_norm": 0.26099762320518494, "learning_rate": 5.2592560177777696e-08, "loss": 0.0755, "step": 6984 }, { "epoch": 4.8983169705469845, "grad_norm": 0.30698713660240173, "learning_rate": 5.1874829832856966e-08, "loss": 0.0706, "step": 6985 }, { "epoch": 4.899018232819074, "grad_norm": 0.3235281705856323, "learning_rate": 5.116202553804295e-08, "loss": 0.2413, "step": 6986 }, { "epoch": 4.899719495091164, "grad_norm": 5.611321926116943, "learning_rate": 5.045414743407861e-08, "loss": 0.7611, "step": 6987 }, { "epoch": 4.900420757363253, "grad_norm": 0.2543073892593384, "learning_rate": 4.975119566073827e-08, "loss": 0.0895, "step": 6988 }, { "epoch": 4.901122019635344, "grad_norm": 0.2505526840686798, "learning_rate": 4.9053170356816444e-08, "loss": 0.0825, "step": 6989 }, { "epoch": 4.901823281907434, "grad_norm": 0.27213478088378906, "learning_rate": 4.836007166014178e-08, "loss": 0.0771, "step": 6990 }, { "epoch": 4.902524544179523, "grad_norm": 3.388899564743042, "learning_rate": 4.767189970756869e-08, "loss": 0.4376, "step": 6991 }, { "epoch": 4.903225806451613, "grad_norm": 0.2535400390625, "learning_rate": 4.6988654634974596e-08, "loss": 0.0832, "step": 6992 }, { "epoch": 4.903927068723703, "grad_norm": 0.2919507324695587, "learning_rate": 4.6310336577273796e-08, "loss": 0.2537, "step": 6993 }, { "epoch": 4.904628330995792, "grad_norm": 0.25416290760040283, "learning_rate": 4.563694566839527e-08, "loss": 0.0896, "step": 6994 }, { "epoch": 4.905329593267882, "grad_norm": 0.2852970063686371, "learning_rate": 4.4968482041302105e-08, "loss": 0.2517, "step": 6995 }, { "epoch": 4.906030855539972, "grad_norm": 0.3019602596759796, "learning_rate": 4.4304945827985947e-08, "loss": 0.2476, "step": 6996 }, { "epoch": 4.906732117812062, "grad_norm": 0.2555604577064514, "learning_rate": 4.36463371594642e-08, "loss": 0.0829, "step": 6997 }, { "epoch": 4.907433380084152, "grad_norm": 0.24621355533599854, "learning_rate": 4.299265616577453e-08, "loss": 0.0817, "step": 6998 }, { "epoch": 4.9081346423562415, "grad_norm": 0.2909148633480072, "learning_rate": 4.2343902975994223e-08, "loss": 0.2549, "step": 6999 }, { "epoch": 4.908835904628331, "grad_norm": 0.8064649105072021, "learning_rate": 4.170007771821527e-08, "loss": 0.4049, "step": 7000 }, { "epoch": 4.909537166900421, "grad_norm": 0.2917349338531494, "learning_rate": 4.106118051956376e-08, "loss": 0.2441, "step": 7001 }, { "epoch": 4.9102384291725105, "grad_norm": 0.2497994601726532, "learning_rate": 4.042721150619433e-08, "loss": 0.0895, "step": 7002 }, { "epoch": 4.9109396914446, "grad_norm": 0.2551095485687256, "learning_rate": 3.9798170803279075e-08, "loss": 0.089, "step": 7003 }, { "epoch": 4.91164095371669, "grad_norm": 0.25235557556152344, "learning_rate": 3.9174058535029753e-08, "loss": 0.0903, "step": 7004 }, { "epoch": 4.9123422159887795, "grad_norm": 0.3002035319805145, "learning_rate": 3.8554874824672796e-08, "loss": 0.2451, "step": 7005 }, { "epoch": 4.913043478260869, "grad_norm": 0.3025304079055786, "learning_rate": 3.7940619794468726e-08, "loss": 0.2473, "step": 7006 }, { "epoch": 4.913744740532959, "grad_norm": 0.25113150477409363, "learning_rate": 3.733129356570664e-08, "loss": 0.0901, "step": 7007 }, { "epoch": 4.914446002805049, "grad_norm": 0.2514511048793793, "learning_rate": 3.6726896258695855e-08, "loss": 0.0901, "step": 7008 }, { "epoch": 4.915147265077139, "grad_norm": 0.2520851790904999, "learning_rate": 3.612742799277702e-08, "loss": 0.0897, "step": 7009 }, { "epoch": 4.915848527349229, "grad_norm": 0.2742401957511902, "learning_rate": 3.5532888886313785e-08, "loss": 0.0785, "step": 7010 }, { "epoch": 4.916549789621318, "grad_norm": 0.3333907127380371, "learning_rate": 3.4943279056698364e-08, "loss": 0.2439, "step": 7011 }, { "epoch": 4.917251051893408, "grad_norm": 0.28953471779823303, "learning_rate": 3.4358598620354286e-08, "loss": 0.2539, "step": 7012 }, { "epoch": 4.917952314165498, "grad_norm": 0.26360034942626953, "learning_rate": 3.37788476927281e-08, "loss": 0.0834, "step": 7013 }, { "epoch": 4.918653576437587, "grad_norm": 0.2522723376750946, "learning_rate": 3.3204026388286566e-08, "loss": 0.0751, "step": 7014 }, { "epoch": 4.919354838709677, "grad_norm": 0.7853675484657288, "learning_rate": 3.2634134820536125e-08, "loss": 0.4128, "step": 7015 }, { "epoch": 4.920056100981768, "grad_norm": 0.2782438099384308, "learning_rate": 3.206917310199786e-08, "loss": 0.067, "step": 7016 }, { "epoch": 4.920757363253857, "grad_norm": 0.26230376958847046, "learning_rate": 3.1509141344227e-08, "loss": 0.0824, "step": 7017 }, { "epoch": 4.921458625525947, "grad_norm": 0.7906025648117065, "learning_rate": 3.0954039657801747e-08, "loss": 0.4152, "step": 7018 }, { "epoch": 4.922159887798037, "grad_norm": 0.29405176639556885, "learning_rate": 3.04038681523261e-08, "loss": 0.2533, "step": 7019 }, { "epoch": 4.922861150070126, "grad_norm": 0.3104131519794464, "learning_rate": 2.985862693643537e-08, "loss": 0.2418, "step": 7020 }, { "epoch": 4.923562412342216, "grad_norm": 0.2553911805152893, "learning_rate": 2.9318316117787902e-08, "loss": 0.0829, "step": 7021 }, { "epoch": 4.924263674614306, "grad_norm": 0.2809080183506012, "learning_rate": 2.8782935803070567e-08, "loss": 0.2493, "step": 7022 }, { "epoch": 4.924964936886395, "grad_norm": 0.8007656335830688, "learning_rate": 2.8252486097990472e-08, "loss": 0.4157, "step": 7023 }, { "epoch": 4.925666199158485, "grad_norm": 0.2540837228298187, "learning_rate": 2.7726967107288836e-08, "loss": 0.0894, "step": 7024 }, { "epoch": 4.926367461430575, "grad_norm": 0.2890869081020355, "learning_rate": 2.7206378934729882e-08, "loss": 0.2546, "step": 7025 }, { "epoch": 4.927068723702665, "grad_norm": 0.25583773851394653, "learning_rate": 2.6690721683103605e-08, "loss": 0.0772, "step": 7026 }, { "epoch": 4.927769985974755, "grad_norm": 0.25522568821907043, "learning_rate": 2.6179995454228557e-08, "loss": 0.0894, "step": 7027 }, { "epoch": 4.9284712482468445, "grad_norm": 0.2678085267543793, "learning_rate": 2.5674200348949073e-08, "loss": 0.0759, "step": 7028 }, { "epoch": 4.929172510518934, "grad_norm": 0.2942962944507599, "learning_rate": 2.5173336467135267e-08, "loss": 0.2537, "step": 7029 }, { "epoch": 4.929873772791024, "grad_norm": 4.602131366729736, "learning_rate": 2.467740390768025e-08, "loss": 0.51, "step": 7030 }, { "epoch": 4.9305750350631135, "grad_norm": 0.25054582953453064, "learning_rate": 2.418640276851125e-08, "loss": 0.0895, "step": 7031 }, { "epoch": 4.931276297335203, "grad_norm": 0.28149667382240295, "learning_rate": 2.3700333146575714e-08, "loss": 0.2509, "step": 7032 }, { "epoch": 4.931977559607293, "grad_norm": 0.2582158148288727, "learning_rate": 2.3219195137846873e-08, "loss": 0.0841, "step": 7033 }, { "epoch": 4.932678821879383, "grad_norm": 0.24944430589675903, "learning_rate": 2.274298883732928e-08, "loss": 0.0895, "step": 7034 }, { "epoch": 4.933380084151473, "grad_norm": 0.25413039326667786, "learning_rate": 2.2271714339047735e-08, "loss": 0.0899, "step": 7035 }, { "epoch": 4.934081346423563, "grad_norm": 0.25957268476486206, "learning_rate": 2.1805371736058345e-08, "loss": 0.0848, "step": 7036 }, { "epoch": 4.934782608695652, "grad_norm": 0.2555570900440216, "learning_rate": 2.1343961120443013e-08, "loss": 0.0826, "step": 7037 }, { "epoch": 4.935483870967742, "grad_norm": 0.29794564843177795, "learning_rate": 2.0887482583303864e-08, "loss": 0.256, "step": 7038 }, { "epoch": 4.936185133239832, "grad_norm": 0.28357747197151184, "learning_rate": 2.0435936214774354e-08, "loss": 0.2529, "step": 7039 }, { "epoch": 4.936886395511921, "grad_norm": 0.26263827085494995, "learning_rate": 1.9989322104013718e-08, "loss": 0.084, "step": 7040 }, { "epoch": 4.937587657784011, "grad_norm": 0.8544180393218994, "learning_rate": 1.954764033920975e-08, "loss": 0.4222, "step": 7041 }, { "epoch": 4.938288920056101, "grad_norm": 0.25862327218055725, "learning_rate": 1.911089100756769e-08, "loss": 0.0846, "step": 7042 }, { "epoch": 4.93899018232819, "grad_norm": 3.307304620742798, "learning_rate": 1.867907419532966e-08, "loss": 0.4452, "step": 7043 }, { "epoch": 4.939691444600281, "grad_norm": 0.2932613790035248, "learning_rate": 1.825218998775524e-08, "loss": 0.254, "step": 7044 }, { "epoch": 4.9403927068723705, "grad_norm": 0.2893993854522705, "learning_rate": 1.7830238469132564e-08, "loss": 0.2538, "step": 7045 }, { "epoch": 4.94109396914446, "grad_norm": 0.39113283157348633, "learning_rate": 1.7413219722781094e-08, "loss": 0.253, "step": 7046 }, { "epoch": 4.94179523141655, "grad_norm": 0.31064847111701965, "learning_rate": 1.700113383103774e-08, "loss": 0.2398, "step": 7047 }, { "epoch": 4.9424964936886395, "grad_norm": 0.29300183057785034, "learning_rate": 1.6593980875270752e-08, "loss": 0.2559, "step": 7048 }, { "epoch": 4.943197755960729, "grad_norm": 0.25434887409210205, "learning_rate": 1.6191760935874157e-08, "loss": 0.0834, "step": 7049 }, { "epoch": 4.943899018232819, "grad_norm": 0.3041515648365021, "learning_rate": 1.5794474092267753e-08, "loss": 0.2514, "step": 7050 }, { "epoch": 4.9446002805049085, "grad_norm": 0.8197051286697388, "learning_rate": 1.540212042289435e-08, "loss": 0.4056, "step": 7051 }, { "epoch": 4.945301542776999, "grad_norm": 0.25311464071273804, "learning_rate": 1.5014700005225313e-08, "loss": 0.0829, "step": 7052 }, { "epoch": 4.946002805049089, "grad_norm": 0.29024064540863037, "learning_rate": 1.463221291575778e-08, "loss": 0.2538, "step": 7053 }, { "epoch": 4.946704067321178, "grad_norm": 0.30408236384391785, "learning_rate": 1.4254659230014677e-08, "loss": 0.2514, "step": 7054 }, { "epoch": 4.947405329593268, "grad_norm": 0.25182491540908813, "learning_rate": 1.3882039022544702e-08, "loss": 0.0906, "step": 7055 }, { "epoch": 4.948106591865358, "grad_norm": 0.3839547336101532, "learning_rate": 1.3514352366922334e-08, "loss": 0.2496, "step": 7056 }, { "epoch": 4.948807854137447, "grad_norm": 0.29645097255706787, "learning_rate": 1.3151599335745057e-08, "loss": 0.2526, "step": 7057 }, { "epoch": 4.949509116409537, "grad_norm": 0.27290159463882446, "learning_rate": 1.2793780000644463e-08, "loss": 0.0764, "step": 7058 }, { "epoch": 4.950210378681627, "grad_norm": 0.2536103427410126, "learning_rate": 1.244089443226959e-08, "loss": 0.0897, "step": 7059 }, { "epoch": 4.950911640953716, "grad_norm": 0.2547653913497925, "learning_rate": 1.2092942700298038e-08, "loss": 0.089, "step": 7060 }, { "epoch": 4.951612903225806, "grad_norm": 0.26797834038734436, "learning_rate": 1.1749924873433182e-08, "loss": 0.0763, "step": 7061 }, { "epoch": 4.952314165497896, "grad_norm": 0.25117257237434387, "learning_rate": 1.1411841019406955e-08, "loss": 0.0899, "step": 7062 }, { "epoch": 4.953015427769986, "grad_norm": 0.25458741188049316, "learning_rate": 1.1078691204971515e-08, "loss": 0.0822, "step": 7063 }, { "epoch": 4.953716690042076, "grad_norm": 0.2524260878562927, "learning_rate": 1.0750475495910351e-08, "loss": 0.0836, "step": 7064 }, { "epoch": 4.954417952314166, "grad_norm": 0.2531983554363251, "learning_rate": 1.0427193957029958e-08, "loss": 0.09, "step": 7065 }, { "epoch": 4.955119214586255, "grad_norm": 0.2558956444263458, "learning_rate": 1.0108846652162607e-08, "loss": 0.0894, "step": 7066 }, { "epoch": 4.955820476858345, "grad_norm": 0.3079957365989685, "learning_rate": 9.795433644163577e-09, "loss": 0.2495, "step": 7067 }, { "epoch": 4.956521739130435, "grad_norm": 0.2590453624725342, "learning_rate": 9.486954994919473e-09, "loss": 0.0816, "step": 7068 }, { "epoch": 4.957223001402524, "grad_norm": 0.252346009016037, "learning_rate": 9.183410765339905e-09, "loss": 0.0901, "step": 7069 }, { "epoch": 4.957924263674614, "grad_norm": 0.2527426481246948, "learning_rate": 8.884801015360267e-09, "loss": 0.0885, "step": 7070 }, { "epoch": 4.9586255259467045, "grad_norm": 0.2717980146408081, "learning_rate": 8.591125803941723e-09, "loss": 0.0767, "step": 7071 }, { "epoch": 4.959326788218794, "grad_norm": 0.2551284730434418, "learning_rate": 8.302385189068452e-09, "loss": 0.0758, "step": 7072 }, { "epoch": 4.960028050490884, "grad_norm": 0.26179853081703186, "learning_rate": 8.018579227755952e-09, "loss": 0.0856, "step": 7073 }, { "epoch": 4.9607293127629735, "grad_norm": 0.3013096749782562, "learning_rate": 7.739707976042732e-09, "loss": 0.248, "step": 7074 }, { "epoch": 4.961430575035063, "grad_norm": 0.6999067664146423, "learning_rate": 7.465771488987528e-09, "loss": 0.2642, "step": 7075 }, { "epoch": 4.962131837307153, "grad_norm": 0.257195383310318, "learning_rate": 7.19676982068318e-09, "loss": 0.0829, "step": 7076 }, { "epoch": 4.9628330995792425, "grad_norm": 0.8086177110671997, "learning_rate": 6.932703024245535e-09, "loss": 0.4213, "step": 7077 }, { "epoch": 4.963534361851332, "grad_norm": 0.2519814968109131, "learning_rate": 6.67357115181344e-09, "loss": 0.0901, "step": 7078 }, { "epoch": 4.964235624123422, "grad_norm": 0.2958740293979645, "learning_rate": 6.4193742545543e-09, "loss": 0.2571, "step": 7079 }, { "epoch": 4.9649368863955115, "grad_norm": 0.2652023434638977, "learning_rate": 6.170112382655746e-09, "loss": 0.0759, "step": 7080 }, { "epoch": 4.965638148667602, "grad_norm": 0.2519853413105011, "learning_rate": 5.9257855853395164e-09, "loss": 0.0834, "step": 7081 }, { "epoch": 4.966339410939692, "grad_norm": 0.2547764778137207, "learning_rate": 5.6863939108475765e-09, "loss": 0.089, "step": 7082 }, { "epoch": 4.967040673211781, "grad_norm": 0.8138929605484009, "learning_rate": 5.4519374064448955e-09, "loss": 0.4239, "step": 7083 }, { "epoch": 4.967741935483871, "grad_norm": 0.2589680552482605, "learning_rate": 5.222416118427776e-09, "loss": 0.0833, "step": 7084 }, { "epoch": 4.968443197755961, "grad_norm": 0.3907742202281952, "learning_rate": 4.997830092115519e-09, "loss": 0.2625, "step": 7085 }, { "epoch": 4.96914446002805, "grad_norm": 0.3804592192173004, "learning_rate": 4.7781793718532084e-09, "loss": 0.2275, "step": 7086 }, { "epoch": 4.96984572230014, "grad_norm": 0.29527541995048523, "learning_rate": 4.563464001011708e-09, "loss": 0.2559, "step": 7087 }, { "epoch": 4.97054698457223, "grad_norm": 0.2631121277809143, "learning_rate": 4.353684021987658e-09, "loss": 0.0765, "step": 7088 }, { "epoch": 4.97124824684432, "grad_norm": 0.7919774651527405, "learning_rate": 4.14883947619793e-09, "loss": 0.4204, "step": 7089 }, { "epoch": 4.97194950911641, "grad_norm": 0.3908329904079437, "learning_rate": 3.948930404093498e-09, "loss": 0.2512, "step": 7090 }, { "epoch": 4.9726507713884995, "grad_norm": 0.25130170583724976, "learning_rate": 3.753956845145568e-09, "loss": 0.0891, "step": 7091 }, { "epoch": 4.973352033660589, "grad_norm": 0.28819745779037476, "learning_rate": 3.563918837853897e-09, "loss": 0.2539, "step": 7092 }, { "epoch": 4.974053295932679, "grad_norm": 0.2500593960285187, "learning_rate": 3.378816419738473e-09, "loss": 0.0893, "step": 7093 }, { "epoch": 4.9747545582047685, "grad_norm": 0.29863041639328003, "learning_rate": 3.198649627350614e-09, "loss": 0.2532, "step": 7094 }, { "epoch": 4.975455820476858, "grad_norm": 0.3037756681442261, "learning_rate": 3.023418496261865e-09, "loss": 0.2474, "step": 7095 }, { "epoch": 4.976157082748948, "grad_norm": 0.2578292191028595, "learning_rate": 2.8531230610751025e-09, "loss": 0.083, "step": 7096 }, { "epoch": 4.9768583450210375, "grad_norm": 0.28930240869522095, "learning_rate": 2.6877633554162063e-09, "loss": 0.2534, "step": 7097 }, { "epoch": 4.977559607293127, "grad_norm": 3.8113861083984375, "learning_rate": 2.5273394119312845e-09, "loss": 0.4789, "step": 7098 }, { "epoch": 4.978260869565218, "grad_norm": 0.26573804020881653, "learning_rate": 2.371851262297775e-09, "loss": 0.0757, "step": 7099 }, { "epoch": 4.978962131837307, "grad_norm": 0.25524917244911194, "learning_rate": 2.2212989372188964e-09, "loss": 0.0883, "step": 7100 }, { "epoch": 4.979663394109397, "grad_norm": 0.26033881306648254, "learning_rate": 2.0756824664208698e-09, "loss": 0.0842, "step": 7101 }, { "epoch": 4.980364656381487, "grad_norm": 0.25497737526893616, "learning_rate": 1.9350018786556956e-09, "loss": 0.0889, "step": 7102 }, { "epoch": 4.981065918653576, "grad_norm": 0.2548332214355469, "learning_rate": 1.7992572017011544e-09, "loss": 0.0887, "step": 7103 }, { "epoch": 4.981767180925666, "grad_norm": 0.25066065788269043, "learning_rate": 1.6684484623580298e-09, "loss": 0.0896, "step": 7104 }, { "epoch": 4.982468443197756, "grad_norm": 0.7977807521820068, "learning_rate": 1.5425756864584362e-09, "loss": 0.4177, "step": 7105 }, { "epoch": 4.983169705469845, "grad_norm": 0.2549397349357605, "learning_rate": 1.4216388988547158e-09, "loss": 0.0826, "step": 7106 }, { "epoch": 4.983870967741936, "grad_norm": 0.2537093460559845, "learning_rate": 1.3056381234249903e-09, "loss": 0.0906, "step": 7107 }, { "epoch": 4.984572230014026, "grad_norm": 0.3195262551307678, "learning_rate": 1.1945733830759366e-09, "loss": 0.2414, "step": 7108 }, { "epoch": 4.985273492286115, "grad_norm": 0.7977455258369446, "learning_rate": 1.0884446997344588e-09, "loss": 0.4159, "step": 7109 }, { "epoch": 4.985974754558205, "grad_norm": 0.2934187054634094, "learning_rate": 9.872520943560171e-10, "loss": 0.2537, "step": 7110 }, { "epoch": 4.986676016830295, "grad_norm": 0.2774091362953186, "learning_rate": 8.909955869274011e-10, "loss": 0.2487, "step": 7111 }, { "epoch": 4.987377279102384, "grad_norm": 0.25352147221565247, "learning_rate": 7.996751964445271e-10, "loss": 0.0844, "step": 7112 }, { "epoch": 4.988078541374474, "grad_norm": 0.2565845251083374, "learning_rate": 7.132909409485189e-10, "loss": 0.0816, "step": 7113 }, { "epoch": 4.988779803646564, "grad_norm": 0.2542582154273987, "learning_rate": 6.318428374896268e-10, "loss": 0.0908, "step": 7114 }, { "epoch": 4.989481065918653, "grad_norm": 0.29213669896125793, "learning_rate": 5.553309021522069e-10, "loss": 0.2444, "step": 7115 }, { "epoch": 4.990182328190743, "grad_norm": 0.29375413060188293, "learning_rate": 4.837551500436188e-10, "loss": 0.2551, "step": 7116 }, { "epoch": 4.9908835904628335, "grad_norm": 0.2508750557899475, "learning_rate": 4.1711559529700186e-10, "loss": 0.0899, "step": 7117 }, { "epoch": 4.991584852734923, "grad_norm": 0.28697922825813293, "learning_rate": 3.554122510712743e-10, "loss": 0.2529, "step": 7118 }, { "epoch": 4.992286115007013, "grad_norm": 0.2527596950531006, "learning_rate": 2.986451295483583e-10, "loss": 0.0895, "step": 7119 }, { "epoch": 4.9929873772791025, "grad_norm": 0.8026397228240967, "learning_rate": 2.468142419359554e-10, "loss": 0.411, "step": 7120 }, { "epoch": 4.993688639551192, "grad_norm": 0.2797410786151886, "learning_rate": 1.9991959846754617e-10, "loss": 0.0691, "step": 7121 }, { "epoch": 4.994389901823282, "grad_norm": 0.2574980854988098, "learning_rate": 1.5796120840794182e-10, "loss": 0.0827, "step": 7122 }, { "epoch": 4.9950911640953715, "grad_norm": 0.25539496541023254, "learning_rate": 1.2093908003663057e-10, "loss": 0.0894, "step": 7123 }, { "epoch": 4.995792426367461, "grad_norm": 0.2523026168346405, "learning_rate": 8.885322066443102e-11, "loss": 0.0892, "step": 7124 }, { "epoch": 4.996493688639552, "grad_norm": 0.24898947775363922, "learning_rate": 6.170363663071666e-11, "loss": 0.074, "step": 7125 }, { "epoch": 4.997194950911641, "grad_norm": 0.30010277032852173, "learning_rate": 3.949033329231355e-11, "loss": 0.2477, "step": 7126 }, { "epoch": 4.997896213183731, "grad_norm": 0.25323453545570374, "learning_rate": 2.2213315034602665e-11, "loss": 0.0903, "step": 7127 }, { "epoch": 4.998597475455821, "grad_norm": 0.26080629229545593, "learning_rate": 9.872585271519797e-12, "loss": 0.0847, "step": 7128 }, { "epoch": 4.99929873772791, "grad_norm": 0.25225865840911865, "learning_rate": 2.4681464400044817e-12, "loss": 0.0896, "step": 7129 }, { "epoch": 5.0, "grad_norm": 0.26379790902137756, "learning_rate": 0.0, "loss": 0.089, "step": 7130 } ], "logging_steps": 1, "max_steps": 7130, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 700, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.02220143302656e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }