{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1056, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002840909090909091, "grad_norm": 9.802746734876441, "learning_rate": 0.0, "loss": 0.6182, "step": 1 }, { "epoch": 0.005681818181818182, "grad_norm": 12.388093403552265, "learning_rate": 1.1627906976744187e-07, "loss": 0.6649, "step": 2 }, { "epoch": 0.008522727272727272, "grad_norm": 12.643095212909474, "learning_rate": 2.3255813953488374e-07, "loss": 0.6794, "step": 3 }, { "epoch": 0.011363636363636364, "grad_norm": 9.628453582962425, "learning_rate": 3.488372093023256e-07, "loss": 0.5426, "step": 4 }, { "epoch": 0.014204545454545454, "grad_norm": 12.114285873199693, "learning_rate": 4.651162790697675e-07, "loss": 0.6628, "step": 5 }, { "epoch": 0.017045454545454544, "grad_norm": 12.590069642332757, "learning_rate": 5.813953488372094e-07, "loss": 0.6635, "step": 6 }, { "epoch": 0.019886363636363636, "grad_norm": 11.894881604292143, "learning_rate": 6.976744186046513e-07, "loss": 0.6478, "step": 7 }, { "epoch": 0.022727272727272728, "grad_norm": 10.523659604864859, "learning_rate": 8.139534883720931e-07, "loss": 0.6382, "step": 8 }, { "epoch": 0.02556818181818182, "grad_norm": 9.260520595400251, "learning_rate": 9.30232558139535e-07, "loss": 0.5683, "step": 9 }, { "epoch": 0.028409090909090908, "grad_norm": 8.701673712634479, "learning_rate": 1.0465116279069768e-06, "loss": 0.5677, "step": 10 }, { "epoch": 0.03125, "grad_norm": 7.754246744436588, "learning_rate": 1.1627906976744188e-06, "loss": 0.5026, "step": 11 }, { "epoch": 0.03409090909090909, "grad_norm": 8.663705476348797, "learning_rate": 1.2790697674418605e-06, "loss": 0.6104, "step": 12 }, { "epoch": 0.036931818181818184, "grad_norm": 5.045315784322545, "learning_rate": 1.3953488372093025e-06, "loss": 0.4227, "step": 13 }, { "epoch": 0.03977272727272727, "grad_norm": 4.926402953478099, "learning_rate": 1.5116279069767443e-06, "loss": 0.4896, "step": 14 }, { "epoch": 0.04261363636363636, "grad_norm": 4.591926718398226, "learning_rate": 1.6279069767441862e-06, "loss": 0.4869, "step": 15 }, { "epoch": 0.045454545454545456, "grad_norm": 4.197025239911461, "learning_rate": 1.7441860465116282e-06, "loss": 0.4637, "step": 16 }, { "epoch": 0.048295454545454544, "grad_norm": 3.8588657903560684, "learning_rate": 1.86046511627907e-06, "loss": 0.4426, "step": 17 }, { "epoch": 0.05113636363636364, "grad_norm": 1.8811670709600292, "learning_rate": 1.976744186046512e-06, "loss": 0.4305, "step": 18 }, { "epoch": 0.05397727272727273, "grad_norm": 1.6752451580220031, "learning_rate": 2.0930232558139536e-06, "loss": 0.4529, "step": 19 }, { "epoch": 0.056818181818181816, "grad_norm": 1.2090823975791671, "learning_rate": 2.2093023255813954e-06, "loss": 0.3613, "step": 20 }, { "epoch": 0.05965909090909091, "grad_norm": 1.1814336772386804, "learning_rate": 2.3255813953488376e-06, "loss": 0.4037, "step": 21 }, { "epoch": 0.0625, "grad_norm": 0.8954725283144086, "learning_rate": 2.4418604651162793e-06, "loss": 0.3702, "step": 22 }, { "epoch": 0.06534090909090909, "grad_norm": 0.8798870296631145, "learning_rate": 2.558139534883721e-06, "loss": 0.3973, "step": 23 }, { "epoch": 0.06818181818181818, "grad_norm": 0.5832983194953867, "learning_rate": 2.674418604651163e-06, "loss": 0.3262, "step": 24 }, { "epoch": 0.07102272727272728, "grad_norm": 0.8732475291899245, "learning_rate": 2.790697674418605e-06, "loss": 0.3909, "step": 25 }, { "epoch": 0.07386363636363637, "grad_norm": 1.100897285846476, "learning_rate": 2.9069767441860468e-06, "loss": 0.3817, "step": 26 }, { "epoch": 0.07670454545454546, "grad_norm": 1.0608377951702355, "learning_rate": 3.0232558139534885e-06, "loss": 0.3583, "step": 27 }, { "epoch": 0.07954545454545454, "grad_norm": 1.0224952192594947, "learning_rate": 3.1395348837209307e-06, "loss": 0.4162, "step": 28 }, { "epoch": 0.08238636363636363, "grad_norm": 0.8097165887156961, "learning_rate": 3.2558139534883724e-06, "loss": 0.3477, "step": 29 }, { "epoch": 0.08522727272727272, "grad_norm": 0.7315228867679278, "learning_rate": 3.372093023255814e-06, "loss": 0.3951, "step": 30 }, { "epoch": 0.08806818181818182, "grad_norm": 0.6032121177421607, "learning_rate": 3.4883720930232564e-06, "loss": 0.3414, "step": 31 }, { "epoch": 0.09090909090909091, "grad_norm": 0.5651833216962348, "learning_rate": 3.6046511627906977e-06, "loss": 0.3635, "step": 32 }, { "epoch": 0.09375, "grad_norm": 0.5192255380315864, "learning_rate": 3.72093023255814e-06, "loss": 0.3888, "step": 33 }, { "epoch": 0.09659090909090909, "grad_norm": 0.49173473741498314, "learning_rate": 3.837209302325582e-06, "loss": 0.3749, "step": 34 }, { "epoch": 0.09943181818181818, "grad_norm": 0.48300590116190206, "learning_rate": 3.953488372093024e-06, "loss": 0.3719, "step": 35 }, { "epoch": 0.10227272727272728, "grad_norm": 0.47568795818970555, "learning_rate": 4.0697674418604655e-06, "loss": 0.3502, "step": 36 }, { "epoch": 0.10511363636363637, "grad_norm": 0.5738976486828545, "learning_rate": 4.186046511627907e-06, "loss": 0.3553, "step": 37 }, { "epoch": 0.10795454545454546, "grad_norm": 0.48281438241706864, "learning_rate": 4.302325581395349e-06, "loss": 0.3194, "step": 38 }, { "epoch": 0.11079545454545454, "grad_norm": 0.6040813728082152, "learning_rate": 4.418604651162791e-06, "loss": 0.3753, "step": 39 }, { "epoch": 0.11363636363636363, "grad_norm": 0.5510018703021852, "learning_rate": 4.5348837209302326e-06, "loss": 0.3497, "step": 40 }, { "epoch": 0.11647727272727272, "grad_norm": 0.4265614122633672, "learning_rate": 4.651162790697675e-06, "loss": 0.3067, "step": 41 }, { "epoch": 0.11931818181818182, "grad_norm": 0.3982552723726358, "learning_rate": 4.767441860465117e-06, "loss": 0.3166, "step": 42 }, { "epoch": 0.12215909090909091, "grad_norm": 0.42319934937905634, "learning_rate": 4.883720930232559e-06, "loss": 0.3406, "step": 43 }, { "epoch": 0.125, "grad_norm": 0.48844669962812265, "learning_rate": 5e-06, "loss": 0.3809, "step": 44 }, { "epoch": 0.1278409090909091, "grad_norm": 0.4283299903892573, "learning_rate": 4.999987977618099e-06, "loss": 0.3487, "step": 45 }, { "epoch": 0.13068181818181818, "grad_norm": 0.45165901843941525, "learning_rate": 4.999951910588025e-06, "loss": 0.3261, "step": 46 }, { "epoch": 0.13352272727272727, "grad_norm": 0.3309060296669714, "learning_rate": 4.999891799256668e-06, "loss": 0.3122, "step": 47 }, { "epoch": 0.13636363636363635, "grad_norm": 0.3836084760514636, "learning_rate": 4.9998076442021725e-06, "loss": 0.3001, "step": 48 }, { "epoch": 0.13920454545454544, "grad_norm": 0.425230874245839, "learning_rate": 4.999699446233934e-06, "loss": 0.3341, "step": 49 }, { "epoch": 0.14204545454545456, "grad_norm": 0.4444798732501407, "learning_rate": 4.999567206392591e-06, "loss": 0.3373, "step": 50 }, { "epoch": 0.14488636363636365, "grad_norm": 0.381536539310927, "learning_rate": 4.999410925950012e-06, "loss": 0.3267, "step": 51 }, { "epoch": 0.14772727272727273, "grad_norm": 0.3767650025962174, "learning_rate": 4.99923060640929e-06, "loss": 0.328, "step": 52 }, { "epoch": 0.15056818181818182, "grad_norm": 0.3903203005773619, "learning_rate": 4.99902624950472e-06, "loss": 0.3367, "step": 53 }, { "epoch": 0.1534090909090909, "grad_norm": 0.47731540090520985, "learning_rate": 4.9987978572017875e-06, "loss": 0.3749, "step": 54 }, { "epoch": 0.15625, "grad_norm": 0.36341294567474813, "learning_rate": 4.998545431697149e-06, "loss": 0.2952, "step": 55 }, { "epoch": 0.1590909090909091, "grad_norm": 0.4160548663852485, "learning_rate": 4.998268975418606e-06, "loss": 0.3779, "step": 56 }, { "epoch": 0.16193181818181818, "grad_norm": 0.3664734921308225, "learning_rate": 4.997968491025093e-06, "loss": 0.3105, "step": 57 }, { "epoch": 0.16477272727272727, "grad_norm": 0.35755496009312704, "learning_rate": 4.997643981406638e-06, "loss": 0.3508, "step": 58 }, { "epoch": 0.16761363636363635, "grad_norm": 0.3738253178296096, "learning_rate": 4.997295449684345e-06, "loss": 0.349, "step": 59 }, { "epoch": 0.17045454545454544, "grad_norm": 0.3175005755892801, "learning_rate": 4.996922899210358e-06, "loss": 0.2984, "step": 60 }, { "epoch": 0.17329545454545456, "grad_norm": 0.39931619691125575, "learning_rate": 4.996526333567833e-06, "loss": 0.3627, "step": 61 }, { "epoch": 0.17613636363636365, "grad_norm": 0.3726199489633269, "learning_rate": 4.9961057565709015e-06, "loss": 0.3274, "step": 62 }, { "epoch": 0.17897727272727273, "grad_norm": 0.3954308613768431, "learning_rate": 4.995661172264632e-06, "loss": 0.34, "step": 63 }, { "epoch": 0.18181818181818182, "grad_norm": 0.3814105011898473, "learning_rate": 4.995192584924995e-06, "loss": 0.3122, "step": 64 }, { "epoch": 0.1846590909090909, "grad_norm": 0.3655156038716592, "learning_rate": 4.99469999905882e-06, "loss": 0.35, "step": 65 }, { "epoch": 0.1875, "grad_norm": 0.39224970009402493, "learning_rate": 4.99418341940375e-06, "loss": 0.3057, "step": 66 }, { "epoch": 0.1903409090909091, "grad_norm": 0.33083770067354695, "learning_rate": 4.9936428509282e-06, "loss": 0.3144, "step": 67 }, { "epoch": 0.19318181818181818, "grad_norm": 0.31786460924484966, "learning_rate": 4.9930782988313065e-06, "loss": 0.3156, "step": 68 }, { "epoch": 0.19602272727272727, "grad_norm": 0.3515587615165226, "learning_rate": 4.992489768542877e-06, "loss": 0.318, "step": 69 }, { "epoch": 0.19886363636363635, "grad_norm": 0.39680326873271354, "learning_rate": 4.991877265723343e-06, "loss": 0.3319, "step": 70 }, { "epoch": 0.20170454545454544, "grad_norm": 0.3532333123348208, "learning_rate": 4.9912407962636965e-06, "loss": 0.3343, "step": 71 }, { "epoch": 0.20454545454545456, "grad_norm": 0.3684851475562903, "learning_rate": 4.990580366285441e-06, "loss": 0.3214, "step": 72 }, { "epoch": 0.20738636363636365, "grad_norm": 0.34703126004025847, "learning_rate": 4.98989598214053e-06, "loss": 0.3497, "step": 73 }, { "epoch": 0.21022727272727273, "grad_norm": 0.331786659705209, "learning_rate": 4.989187650411306e-06, "loss": 0.3119, "step": 74 }, { "epoch": 0.21306818181818182, "grad_norm": 0.3514432926351399, "learning_rate": 4.988455377910436e-06, "loss": 0.3276, "step": 75 }, { "epoch": 0.2159090909090909, "grad_norm": 0.45669134699095365, "learning_rate": 4.987699171680846e-06, "loss": 0.3502, "step": 76 }, { "epoch": 0.21875, "grad_norm": 0.3799997391446089, "learning_rate": 4.98691903899566e-06, "loss": 0.3389, "step": 77 }, { "epoch": 0.2215909090909091, "grad_norm": 0.32987905423731806, "learning_rate": 4.986114987358118e-06, "loss": 0.3154, "step": 78 }, { "epoch": 0.22443181818181818, "grad_norm": 0.37320907794023317, "learning_rate": 4.985287024501512e-06, "loss": 0.2865, "step": 79 }, { "epoch": 0.22727272727272727, "grad_norm": 0.3606727238448836, "learning_rate": 4.9844351583891125e-06, "loss": 0.3352, "step": 80 }, { "epoch": 0.23011363636363635, "grad_norm": 0.28704484493903537, "learning_rate": 4.983559397214086e-06, "loss": 0.2761, "step": 81 }, { "epoch": 0.23295454545454544, "grad_norm": 0.3395805127723043, "learning_rate": 4.982659749399421e-06, "loss": 0.3013, "step": 82 }, { "epoch": 0.23579545454545456, "grad_norm": 0.32754503212231606, "learning_rate": 4.981736223597845e-06, "loss": 0.3291, "step": 83 }, { "epoch": 0.23863636363636365, "grad_norm": 0.3278411182469415, "learning_rate": 4.9807888286917425e-06, "loss": 0.281, "step": 84 }, { "epoch": 0.24147727272727273, "grad_norm": 0.3312034883074764, "learning_rate": 4.979817573793068e-06, "loss": 0.3484, "step": 85 }, { "epoch": 0.24431818181818182, "grad_norm": 0.3001329867151946, "learning_rate": 4.978822468243259e-06, "loss": 0.2842, "step": 86 }, { "epoch": 0.2471590909090909, "grad_norm": 0.3516159032278349, "learning_rate": 4.977803521613147e-06, "loss": 0.3084, "step": 87 }, { "epoch": 0.25, "grad_norm": 0.3782753735314241, "learning_rate": 4.9767607437028645e-06, "loss": 0.3381, "step": 88 }, { "epoch": 0.2528409090909091, "grad_norm": 0.3170089268559784, "learning_rate": 4.97569414454175e-06, "loss": 0.3215, "step": 89 }, { "epoch": 0.2556818181818182, "grad_norm": 0.29420316873312097, "learning_rate": 4.9746037343882545e-06, "loss": 0.2998, "step": 90 }, { "epoch": 0.2585227272727273, "grad_norm": 0.45657642279690197, "learning_rate": 4.97348952372984e-06, "loss": 0.3354, "step": 91 }, { "epoch": 0.26136363636363635, "grad_norm": 0.32675165284478025, "learning_rate": 4.972351523282878e-06, "loss": 0.2715, "step": 92 }, { "epoch": 0.26420454545454547, "grad_norm": 0.37411987401338476, "learning_rate": 4.97118974399255e-06, "loss": 0.331, "step": 93 }, { "epoch": 0.26704545454545453, "grad_norm": 0.2906231907319114, "learning_rate": 4.970004197032741e-06, "loss": 0.2635, "step": 94 }, { "epoch": 0.26988636363636365, "grad_norm": 0.42609899782651967, "learning_rate": 4.968794893805927e-06, "loss": 0.3662, "step": 95 }, { "epoch": 0.2727272727272727, "grad_norm": 0.35277264498485456, "learning_rate": 4.967561845943074e-06, "loss": 0.3656, "step": 96 }, { "epoch": 0.2755681818181818, "grad_norm": 0.33825537104063047, "learning_rate": 4.966305065303519e-06, "loss": 0.2949, "step": 97 }, { "epoch": 0.2784090909090909, "grad_norm": 0.36200881129772927, "learning_rate": 4.96502456397486e-06, "loss": 0.3457, "step": 98 }, { "epoch": 0.28125, "grad_norm": 0.31133758943801504, "learning_rate": 4.963720354272837e-06, "loss": 0.2831, "step": 99 }, { "epoch": 0.2840909090909091, "grad_norm": 0.3398462998770164, "learning_rate": 4.962392448741216e-06, "loss": 0.308, "step": 100 }, { "epoch": 0.2869318181818182, "grad_norm": 0.2825796948908475, "learning_rate": 4.961040860151669e-06, "loss": 0.2634, "step": 101 }, { "epoch": 0.2897727272727273, "grad_norm": 0.38927704510942096, "learning_rate": 4.9596656015036434e-06, "loss": 0.2942, "step": 102 }, { "epoch": 0.29261363636363635, "grad_norm": 0.35680520232446933, "learning_rate": 4.95826668602425e-06, "loss": 0.3148, "step": 103 }, { "epoch": 0.29545454545454547, "grad_norm": 0.40848691247631896, "learning_rate": 4.956844127168124e-06, "loss": 0.3475, "step": 104 }, { "epoch": 0.29829545454545453, "grad_norm": 0.3675982469780909, "learning_rate": 4.955397938617304e-06, "loss": 0.3223, "step": 105 }, { "epoch": 0.30113636363636365, "grad_norm": 0.32048567892217283, "learning_rate": 4.953928134281093e-06, "loss": 0.316, "step": 106 }, { "epoch": 0.3039772727272727, "grad_norm": 0.3107707861319827, "learning_rate": 4.952434728295931e-06, "loss": 0.3031, "step": 107 }, { "epoch": 0.3068181818181818, "grad_norm": 0.38878643961644715, "learning_rate": 4.950917735025256e-06, "loss": 0.3355, "step": 108 }, { "epoch": 0.3096590909090909, "grad_norm": 0.3735768679081344, "learning_rate": 4.949377169059365e-06, "loss": 0.3008, "step": 109 }, { "epoch": 0.3125, "grad_norm": 0.3808439931809935, "learning_rate": 4.947813045215277e-06, "loss": 0.3002, "step": 110 }, { "epoch": 0.3153409090909091, "grad_norm": 0.3256292929675435, "learning_rate": 4.946225378536587e-06, "loss": 0.2988, "step": 111 }, { "epoch": 0.3181818181818182, "grad_norm": 0.35150877205189135, "learning_rate": 4.944614184293321e-06, "loss": 0.2993, "step": 112 }, { "epoch": 0.3210227272727273, "grad_norm": 0.37494589367664166, "learning_rate": 4.942979477981797e-06, "loss": 0.3129, "step": 113 }, { "epoch": 0.32386363636363635, "grad_norm": 0.3506621432286222, "learning_rate": 4.941321275324463e-06, "loss": 0.3015, "step": 114 }, { "epoch": 0.32670454545454547, "grad_norm": 0.30804865814837706, "learning_rate": 4.939639592269757e-06, "loss": 0.2709, "step": 115 }, { "epoch": 0.32954545454545453, "grad_norm": 0.4334401140811609, "learning_rate": 4.9379344449919465e-06, "loss": 0.3211, "step": 116 }, { "epoch": 0.33238636363636365, "grad_norm": 0.4113976286859321, "learning_rate": 4.936205849890977e-06, "loss": 0.3486, "step": 117 }, { "epoch": 0.3352272727272727, "grad_norm": 0.38143204868428404, "learning_rate": 4.934453823592313e-06, "loss": 0.3248, "step": 118 }, { "epoch": 0.3380681818181818, "grad_norm": 0.3935231496732602, "learning_rate": 4.9326783829467795e-06, "loss": 0.3369, "step": 119 }, { "epoch": 0.3409090909090909, "grad_norm": 0.3715854335519974, "learning_rate": 4.930879545030395e-06, "loss": 0.3162, "step": 120 }, { "epoch": 0.34375, "grad_norm": 0.2987173708346766, "learning_rate": 4.929057327144213e-06, "loss": 0.2704, "step": 121 }, { "epoch": 0.3465909090909091, "grad_norm": 0.3505876441509565, "learning_rate": 4.927211746814155e-06, "loss": 0.2897, "step": 122 }, { "epoch": 0.3494318181818182, "grad_norm": 0.3808807666150658, "learning_rate": 4.925342821790834e-06, "loss": 0.298, "step": 123 }, { "epoch": 0.3522727272727273, "grad_norm": 0.40265933198110954, "learning_rate": 4.923450570049398e-06, "loss": 0.3063, "step": 124 }, { "epoch": 0.35511363636363635, "grad_norm": 0.329984359578131, "learning_rate": 4.921535009789344e-06, "loss": 0.281, "step": 125 }, { "epoch": 0.35795454545454547, "grad_norm": 0.3327810259029677, "learning_rate": 4.91959615943435e-06, "loss": 0.3035, "step": 126 }, { "epoch": 0.36079545454545453, "grad_norm": 0.33832701513333335, "learning_rate": 4.917634037632095e-06, "loss": 0.2817, "step": 127 }, { "epoch": 0.36363636363636365, "grad_norm": 0.3446767418817894, "learning_rate": 4.915648663254081e-06, "loss": 0.3275, "step": 128 }, { "epoch": 0.3664772727272727, "grad_norm": 0.4067285176470478, "learning_rate": 4.9136400553954526e-06, "loss": 0.2644, "step": 129 }, { "epoch": 0.3693181818181818, "grad_norm": 0.32647438056937467, "learning_rate": 4.91160823337481e-06, "loss": 0.3012, "step": 130 }, { "epoch": 0.3721590909090909, "grad_norm": 0.2641653305047082, "learning_rate": 4.909553216734024e-06, "loss": 0.2551, "step": 131 }, { "epoch": 0.375, "grad_norm": 0.3587439503975781, "learning_rate": 4.907475025238051e-06, "loss": 0.3429, "step": 132 }, { "epoch": 0.3778409090909091, "grad_norm": 0.39094595293189244, "learning_rate": 4.905373678874741e-06, "loss": 0.3428, "step": 133 }, { "epoch": 0.3806818181818182, "grad_norm": 0.33295666810345625, "learning_rate": 4.903249197854645e-06, "loss": 0.3024, "step": 134 }, { "epoch": 0.3835227272727273, "grad_norm": 0.4067834961803898, "learning_rate": 4.90110160261082e-06, "loss": 0.388, "step": 135 }, { "epoch": 0.38636363636363635, "grad_norm": 0.3041105753158812, "learning_rate": 4.898930913798635e-06, "loss": 0.2791, "step": 136 }, { "epoch": 0.38920454545454547, "grad_norm": 0.3854716077313248, "learning_rate": 4.89673715229557e-06, "loss": 0.3516, "step": 137 }, { "epoch": 0.39204545454545453, "grad_norm": 0.41029172649451373, "learning_rate": 4.894520339201014e-06, "loss": 0.3221, "step": 138 }, { "epoch": 0.39488636363636365, "grad_norm": 0.31953693308642406, "learning_rate": 4.892280495836068e-06, "loss": 0.3268, "step": 139 }, { "epoch": 0.3977272727272727, "grad_norm": 0.4798811586379984, "learning_rate": 4.890017643743334e-06, "loss": 0.3115, "step": 140 }, { "epoch": 0.4005681818181818, "grad_norm": 0.3603031050892597, "learning_rate": 4.887731804686707e-06, "loss": 0.2844, "step": 141 }, { "epoch": 0.4034090909090909, "grad_norm": 0.40465606169589835, "learning_rate": 4.885423000651174e-06, "loss": 0.3573, "step": 142 }, { "epoch": 0.40625, "grad_norm": 0.3643063680731307, "learning_rate": 4.883091253842592e-06, "loss": 0.2861, "step": 143 }, { "epoch": 0.4090909090909091, "grad_norm": 0.2855806950882976, "learning_rate": 4.8807365866874825e-06, "loss": 0.2856, "step": 144 }, { "epoch": 0.4119318181818182, "grad_norm": 0.43700846878534866, "learning_rate": 4.878359021832812e-06, "loss": 0.3025, "step": 145 }, { "epoch": 0.4147727272727273, "grad_norm": 0.3691328488500052, "learning_rate": 4.875958582145775e-06, "loss": 0.3516, "step": 146 }, { "epoch": 0.41761363636363635, "grad_norm": 0.3602263970719629, "learning_rate": 4.873535290713571e-06, "loss": 0.3276, "step": 147 }, { "epoch": 0.42045454545454547, "grad_norm": 0.2873285630204768, "learning_rate": 4.871089170843192e-06, "loss": 0.272, "step": 148 }, { "epoch": 0.42329545454545453, "grad_norm": 0.3275589221978115, "learning_rate": 4.868620246061185e-06, "loss": 0.3127, "step": 149 }, { "epoch": 0.42613636363636365, "grad_norm": 0.3595600686315243, "learning_rate": 4.866128540113436e-06, "loss": 0.293, "step": 150 }, { "epoch": 0.4289772727272727, "grad_norm": 0.39412366891247624, "learning_rate": 4.863614076964937e-06, "loss": 0.3105, "step": 151 }, { "epoch": 0.4318181818181818, "grad_norm": 0.2967856642106585, "learning_rate": 4.8610768807995575e-06, "loss": 0.2488, "step": 152 }, { "epoch": 0.4346590909090909, "grad_norm": 0.3353960107255814, "learning_rate": 4.85851697601981e-06, "loss": 0.31, "step": 153 }, { "epoch": 0.4375, "grad_norm": 0.3293934153604414, "learning_rate": 4.855934387246619e-06, "loss": 0.31, "step": 154 }, { "epoch": 0.4403409090909091, "grad_norm": 0.4020477745824599, "learning_rate": 4.853329139319076e-06, "loss": 0.3607, "step": 155 }, { "epoch": 0.4431818181818182, "grad_norm": 0.40194438779646285, "learning_rate": 4.850701257294212e-06, "loss": 0.3194, "step": 156 }, { "epoch": 0.4460227272727273, "grad_norm": 0.35880107189234606, "learning_rate": 4.848050766446746e-06, "loss": 0.3257, "step": 157 }, { "epoch": 0.44886363636363635, "grad_norm": 0.3225921590602741, "learning_rate": 4.84537769226885e-06, "loss": 0.2865, "step": 158 }, { "epoch": 0.45170454545454547, "grad_norm": 0.43105913904133064, "learning_rate": 4.842682060469899e-06, "loss": 0.2917, "step": 159 }, { "epoch": 0.45454545454545453, "grad_norm": 0.3984098156673031, "learning_rate": 4.839963896976223e-06, "loss": 0.3137, "step": 160 }, { "epoch": 0.45738636363636365, "grad_norm": 0.34203541957482897, "learning_rate": 4.837223227930864e-06, "loss": 0.3021, "step": 161 }, { "epoch": 0.4602272727272727, "grad_norm": 0.3410914811625815, "learning_rate": 4.834460079693317e-06, "loss": 0.3197, "step": 162 }, { "epoch": 0.4630681818181818, "grad_norm": 0.3668120756523038, "learning_rate": 4.831674478839281e-06, "loss": 0.3242, "step": 163 }, { "epoch": 0.4659090909090909, "grad_norm": 0.34128762447014865, "learning_rate": 4.828866452160402e-06, "loss": 0.2626, "step": 164 }, { "epoch": 0.46875, "grad_norm": 0.34134817423813496, "learning_rate": 4.826036026664014e-06, "loss": 0.2771, "step": 165 }, { "epoch": 0.4715909090909091, "grad_norm": 0.3270025125687817, "learning_rate": 4.823183229572883e-06, "loss": 0.2921, "step": 166 }, { "epoch": 0.4744318181818182, "grad_norm": 0.3701876487404051, "learning_rate": 4.820308088324942e-06, "loss": 0.3315, "step": 167 }, { "epoch": 0.4772727272727273, "grad_norm": 0.4223541290676315, "learning_rate": 4.8174106305730284e-06, "loss": 0.3458, "step": 168 }, { "epoch": 0.48011363636363635, "grad_norm": 0.36826807946452467, "learning_rate": 4.814490884184615e-06, "loss": 0.3098, "step": 169 }, { "epoch": 0.48295454545454547, "grad_norm": 0.34247450811498126, "learning_rate": 4.811548877241549e-06, "loss": 0.2794, "step": 170 }, { "epoch": 0.48579545454545453, "grad_norm": 0.36931394013248037, "learning_rate": 4.808584638039774e-06, "loss": 0.3075, "step": 171 }, { "epoch": 0.48863636363636365, "grad_norm": 0.38654212773141833, "learning_rate": 4.805598195089063e-06, "loss": 0.2957, "step": 172 }, { "epoch": 0.4914772727272727, "grad_norm": 0.327791247654709, "learning_rate": 4.802589577112742e-06, "loss": 0.317, "step": 173 }, { "epoch": 0.4943181818181818, "grad_norm": 0.4180368575468772, "learning_rate": 4.7995588130474145e-06, "loss": 0.2873, "step": 174 }, { "epoch": 0.4971590909090909, "grad_norm": 0.41772200012858535, "learning_rate": 4.7965059320426825e-06, "loss": 0.3365, "step": 175 }, { "epoch": 0.5, "grad_norm": 0.3622810863279747, "learning_rate": 4.7934309634608676e-06, "loss": 0.3406, "step": 176 }, { "epoch": 0.5028409090909091, "grad_norm": 0.33039829085718986, "learning_rate": 4.790333936876727e-06, "loss": 0.2582, "step": 177 }, { "epoch": 0.5056818181818182, "grad_norm": 0.2963847161562058, "learning_rate": 4.78721488207717e-06, "loss": 0.2621, "step": 178 }, { "epoch": 0.5085227272727273, "grad_norm": 0.3688579036529526, "learning_rate": 4.7840738290609714e-06, "loss": 0.3106, "step": 179 }, { "epoch": 0.5113636363636364, "grad_norm": 0.3882009236138182, "learning_rate": 4.78091080803848e-06, "loss": 0.2615, "step": 180 }, { "epoch": 0.5142045454545454, "grad_norm": 0.35367280178437593, "learning_rate": 4.777725849431336e-06, "loss": 0.3045, "step": 181 }, { "epoch": 0.5170454545454546, "grad_norm": 0.3874603305325755, "learning_rate": 4.774518983872169e-06, "loss": 0.3151, "step": 182 }, { "epoch": 0.5198863636363636, "grad_norm": 0.3089601400335368, "learning_rate": 4.77129024220431e-06, "loss": 0.2565, "step": 183 }, { "epoch": 0.5227272727272727, "grad_norm": 0.3741939570187776, "learning_rate": 4.7680396554814886e-06, "loss": 0.2824, "step": 184 }, { "epoch": 0.5255681818181818, "grad_norm": 0.3684238808190501, "learning_rate": 4.764767254967544e-06, "loss": 0.2717, "step": 185 }, { "epoch": 0.5284090909090909, "grad_norm": 0.34181925499552346, "learning_rate": 4.761473072136114e-06, "loss": 0.2984, "step": 186 }, { "epoch": 0.53125, "grad_norm": 0.44267647661167453, "learning_rate": 4.758157138670337e-06, "loss": 0.3472, "step": 187 }, { "epoch": 0.5340909090909091, "grad_norm": 0.3887831736377981, "learning_rate": 4.75481948646255e-06, "loss": 0.3111, "step": 188 }, { "epoch": 0.5369318181818182, "grad_norm": 0.3683856304101638, "learning_rate": 4.751460147613973e-06, "loss": 0.3146, "step": 189 }, { "epoch": 0.5397727272727273, "grad_norm": 0.38527593119976, "learning_rate": 4.748079154434413e-06, "loss": 0.3314, "step": 190 }, { "epoch": 0.5426136363636364, "grad_norm": 0.4031772051747187, "learning_rate": 4.744676539441941e-06, "loss": 0.315, "step": 191 }, { "epoch": 0.5454545454545454, "grad_norm": 0.3353722780310112, "learning_rate": 4.741252335362588e-06, "loss": 0.269, "step": 192 }, { "epoch": 0.5482954545454546, "grad_norm": 0.3394618273632171, "learning_rate": 4.737806575130024e-06, "loss": 0.2745, "step": 193 }, { "epoch": 0.5511363636363636, "grad_norm": 0.4450532210463518, "learning_rate": 4.734339291885246e-06, "loss": 0.3188, "step": 194 }, { "epoch": 0.5539772727272727, "grad_norm": 0.397975066441739, "learning_rate": 4.7308505189762565e-06, "loss": 0.2985, "step": 195 }, { "epoch": 0.5568181818181818, "grad_norm": 0.3440535351319966, "learning_rate": 4.727340289957744e-06, "loss": 0.2809, "step": 196 }, { "epoch": 0.5596590909090909, "grad_norm": 0.6446149440778554, "learning_rate": 4.723808638590759e-06, "loss": 0.3218, "step": 197 }, { "epoch": 0.5625, "grad_norm": 0.37606508969708213, "learning_rate": 4.720255598842392e-06, "loss": 0.3176, "step": 198 }, { "epoch": 0.5653409090909091, "grad_norm": 0.43147254520622674, "learning_rate": 4.716681204885442e-06, "loss": 0.3268, "step": 199 }, { "epoch": 0.5681818181818182, "grad_norm": 0.41993041372097106, "learning_rate": 4.713085491098093e-06, "loss": 0.2804, "step": 200 }, { "epoch": 0.5710227272727273, "grad_norm": 0.48960282010679945, "learning_rate": 4.70946849206358e-06, "loss": 0.3996, "step": 201 }, { "epoch": 0.5738636363636364, "grad_norm": 0.3375570582028718, "learning_rate": 4.705830242569859e-06, "loss": 0.2914, "step": 202 }, { "epoch": 0.5767045454545454, "grad_norm": 0.33067898836626264, "learning_rate": 4.70217077760927e-06, "loss": 0.2717, "step": 203 }, { "epoch": 0.5795454545454546, "grad_norm": 0.3919628586280393, "learning_rate": 4.6984901323781996e-06, "loss": 0.2758, "step": 204 }, { "epoch": 0.5823863636363636, "grad_norm": 0.37621132131624546, "learning_rate": 4.6947883422767475e-06, "loss": 0.2927, "step": 205 }, { "epoch": 0.5852272727272727, "grad_norm": 0.3588621280506994, "learning_rate": 4.69106544290838e-06, "loss": 0.3202, "step": 206 }, { "epoch": 0.5880681818181818, "grad_norm": 0.36135048731331515, "learning_rate": 4.687321470079593e-06, "loss": 0.3075, "step": 207 }, { "epoch": 0.5909090909090909, "grad_norm": 0.3804960320633388, "learning_rate": 4.683556459799562e-06, "loss": 0.304, "step": 208 }, { "epoch": 0.59375, "grad_norm": 0.32482777456644224, "learning_rate": 4.679770448279801e-06, "loss": 0.2333, "step": 209 }, { "epoch": 0.5965909090909091, "grad_norm": 0.38423666885394503, "learning_rate": 4.6759634719338106e-06, "loss": 0.3079, "step": 210 }, { "epoch": 0.5994318181818182, "grad_norm": 0.3584077009643052, "learning_rate": 4.672135567376729e-06, "loss": 0.3078, "step": 211 }, { "epoch": 0.6022727272727273, "grad_norm": 0.43190228684358967, "learning_rate": 4.668286771424982e-06, "loss": 0.3693, "step": 212 }, { "epoch": 0.6051136363636364, "grad_norm": 0.3335333217535499, "learning_rate": 4.664417121095925e-06, "loss": 0.2978, "step": 213 }, { "epoch": 0.6079545454545454, "grad_norm": 0.3343126694937098, "learning_rate": 4.660526653607489e-06, "loss": 0.2654, "step": 214 }, { "epoch": 0.6107954545454546, "grad_norm": 0.400588578067547, "learning_rate": 4.656615406377824e-06, "loss": 0.3541, "step": 215 }, { "epoch": 0.6136363636363636, "grad_norm": 0.28366454469863744, "learning_rate": 4.652683417024933e-06, "loss": 0.2595, "step": 216 }, { "epoch": 0.6164772727272727, "grad_norm": 0.3333388085745537, "learning_rate": 4.648730723366321e-06, "loss": 0.3034, "step": 217 }, { "epoch": 0.6193181818181818, "grad_norm": 0.3802324883963107, "learning_rate": 4.644757363418622e-06, "loss": 0.3149, "step": 218 }, { "epoch": 0.6221590909090909, "grad_norm": 0.3323209944938239, "learning_rate": 4.640763375397235e-06, "loss": 0.2831, "step": 219 }, { "epoch": 0.625, "grad_norm": 0.3816473948946037, "learning_rate": 4.636748797715961e-06, "loss": 0.2901, "step": 220 }, { "epoch": 0.6278409090909091, "grad_norm": 0.45087508944423654, "learning_rate": 4.632713668986628e-06, "loss": 0.2668, "step": 221 }, { "epoch": 0.6306818181818182, "grad_norm": 0.3277834281020941, "learning_rate": 4.628658028018723e-06, "loss": 0.3115, "step": 222 }, { "epoch": 0.6335227272727273, "grad_norm": 0.4149700033604779, "learning_rate": 4.624581913819019e-06, "loss": 0.3049, "step": 223 }, { "epoch": 0.6363636363636364, "grad_norm": 0.2986911926260575, "learning_rate": 4.6204853655911945e-06, "loss": 0.2828, "step": 224 }, { "epoch": 0.6392045454545454, "grad_norm": 0.38662077935688544, "learning_rate": 4.6163684227354656e-06, "loss": 0.3019, "step": 225 }, { "epoch": 0.6420454545454546, "grad_norm": 0.3670137115048512, "learning_rate": 4.612231124848199e-06, "loss": 0.2998, "step": 226 }, { "epoch": 0.6448863636363636, "grad_norm": 0.3820920011764151, "learning_rate": 4.608073511721534e-06, "loss": 0.3627, "step": 227 }, { "epoch": 0.6477272727272727, "grad_norm": 0.26469955866368194, "learning_rate": 4.6038956233430034e-06, "loss": 0.2419, "step": 228 }, { "epoch": 0.6505681818181818, "grad_norm": 0.32240469660709375, "learning_rate": 4.59969749989514e-06, "loss": 0.2692, "step": 229 }, { "epoch": 0.6534090909090909, "grad_norm": 0.3896277142098736, "learning_rate": 4.5954791817551e-06, "loss": 0.2789, "step": 230 }, { "epoch": 0.65625, "grad_norm": 0.3510490299412409, "learning_rate": 4.591240709494269e-06, "loss": 0.281, "step": 231 }, { "epoch": 0.6590909090909091, "grad_norm": 0.3636438474583087, "learning_rate": 4.586982123877871e-06, "loss": 0.2998, "step": 232 }, { "epoch": 0.6619318181818182, "grad_norm": 0.3274578399993675, "learning_rate": 4.582703465864582e-06, "loss": 0.2758, "step": 233 }, { "epoch": 0.6647727272727273, "grad_norm": 0.3205713499503409, "learning_rate": 4.5784047766061305e-06, "loss": 0.2716, "step": 234 }, { "epoch": 0.6676136363636364, "grad_norm": 0.47159005981022434, "learning_rate": 4.574086097446903e-06, "loss": 0.3236, "step": 235 }, { "epoch": 0.6704545454545454, "grad_norm": 0.3617567220761258, "learning_rate": 4.569747469923547e-06, "loss": 0.2863, "step": 236 }, { "epoch": 0.6732954545454546, "grad_norm": 0.32166940611651096, "learning_rate": 4.565388935764572e-06, "loss": 0.31, "step": 237 }, { "epoch": 0.6761363636363636, "grad_norm": 0.3982166865116622, "learning_rate": 4.56101053688995e-06, "loss": 0.2874, "step": 238 }, { "epoch": 0.6789772727272727, "grad_norm": 0.4339388465917976, "learning_rate": 4.5566123154107055e-06, "loss": 0.3374, "step": 239 }, { "epoch": 0.6818181818181818, "grad_norm": 0.36030799942916975, "learning_rate": 4.552194313628518e-06, "loss": 0.2668, "step": 240 }, { "epoch": 0.6846590909090909, "grad_norm": 0.3940718141510353, "learning_rate": 4.547756574035311e-06, "loss": 0.3277, "step": 241 }, { "epoch": 0.6875, "grad_norm": 0.4326472723953054, "learning_rate": 4.5432991393128446e-06, "loss": 0.3227, "step": 242 }, { "epoch": 0.6903409090909091, "grad_norm": 0.41998189617141085, "learning_rate": 4.538822052332306e-06, "loss": 0.339, "step": 243 }, { "epoch": 0.6931818181818182, "grad_norm": 0.36510653915186314, "learning_rate": 4.534325356153892e-06, "loss": 0.2637, "step": 244 }, { "epoch": 0.6960227272727273, "grad_norm": 0.4748073641254545, "learning_rate": 4.529809094026404e-06, "loss": 0.3226, "step": 245 }, { "epoch": 0.6988636363636364, "grad_norm": 0.3848777680236735, "learning_rate": 4.525273309386825e-06, "loss": 0.3401, "step": 246 }, { "epoch": 0.7017045454545454, "grad_norm": 0.286675785535149, "learning_rate": 4.5207180458599e-06, "loss": 0.2495, "step": 247 }, { "epoch": 0.7045454545454546, "grad_norm": 0.3770143744991594, "learning_rate": 4.516143347257726e-06, "loss": 0.2923, "step": 248 }, { "epoch": 0.7073863636363636, "grad_norm": 0.37240976329747977, "learning_rate": 4.511549257579322e-06, "loss": 0.2968, "step": 249 }, { "epoch": 0.7102272727272727, "grad_norm": 0.53790018713925, "learning_rate": 4.506935821010206e-06, "loss": 0.298, "step": 250 }, { "epoch": 0.7130681818181818, "grad_norm": 0.3896643010491094, "learning_rate": 4.502303081921978e-06, "loss": 0.3125, "step": 251 }, { "epoch": 0.7159090909090909, "grad_norm": 0.32770126981260167, "learning_rate": 4.497651084871883e-06, "loss": 0.2781, "step": 252 }, { "epoch": 0.71875, "grad_norm": 0.3541924637393212, "learning_rate": 4.492979874602389e-06, "loss": 0.3023, "step": 253 }, { "epoch": 0.7215909090909091, "grad_norm": 0.3735099253437524, "learning_rate": 4.4882894960407566e-06, "loss": 0.3225, "step": 254 }, { "epoch": 0.7244318181818182, "grad_norm": 0.3853359485269271, "learning_rate": 4.483579994298602e-06, "loss": 0.3119, "step": 255 }, { "epoch": 0.7272727272727273, "grad_norm": 0.4232262055395998, "learning_rate": 4.478851414671469e-06, "loss": 0.2996, "step": 256 }, { "epoch": 0.7301136363636364, "grad_norm": 0.3403475343187684, "learning_rate": 4.474103802638389e-06, "loss": 0.2948, "step": 257 }, { "epoch": 0.7329545454545454, "grad_norm": 0.4197482437210073, "learning_rate": 4.469337203861447e-06, "loss": 0.2999, "step": 258 }, { "epoch": 0.7357954545454546, "grad_norm": 0.33941700168906186, "learning_rate": 4.464551664185339e-06, "loss": 0.2636, "step": 259 }, { "epoch": 0.7386363636363636, "grad_norm": 0.35067662494508334, "learning_rate": 4.459747229636933e-06, "loss": 0.3153, "step": 260 }, { "epoch": 0.7414772727272727, "grad_norm": 0.33432839847763335, "learning_rate": 4.454923946424827e-06, "loss": 0.2646, "step": 261 }, { "epoch": 0.7443181818181818, "grad_norm": 0.3486384565640427, "learning_rate": 4.450081860938904e-06, "loss": 0.3026, "step": 262 }, { "epoch": 0.7471590909090909, "grad_norm": 0.3647193452879592, "learning_rate": 4.4452210197498845e-06, "loss": 0.3208, "step": 263 }, { "epoch": 0.75, "grad_norm": 0.3621939393169193, "learning_rate": 4.440341469608879e-06, "loss": 0.3042, "step": 264 }, { "epoch": 0.7528409090909091, "grad_norm": 0.2856803312521231, "learning_rate": 4.43544325744694e-06, "loss": 0.2548, "step": 265 }, { "epoch": 0.7556818181818182, "grad_norm": 0.41636147550676134, "learning_rate": 4.4305264303746085e-06, "loss": 0.2743, "step": 266 }, { "epoch": 0.7585227272727273, "grad_norm": 0.3149004485762251, "learning_rate": 4.425591035681465e-06, "loss": 0.2768, "step": 267 }, { "epoch": 0.7613636363636364, "grad_norm": 0.39793987625802313, "learning_rate": 4.420637120835668e-06, "loss": 0.3055, "step": 268 }, { "epoch": 0.7642045454545454, "grad_norm": 0.4058178861459375, "learning_rate": 4.415664733483502e-06, "loss": 0.3168, "step": 269 }, { "epoch": 0.7670454545454546, "grad_norm": 0.3732878019312248, "learning_rate": 4.4106739214489195e-06, "loss": 0.2935, "step": 270 }, { "epoch": 0.7698863636363636, "grad_norm": 0.31801887671340195, "learning_rate": 4.405664732733079e-06, "loss": 0.2768, "step": 271 }, { "epoch": 0.7727272727272727, "grad_norm": 0.43538965465048635, "learning_rate": 4.400637215513883e-06, "loss": 0.2644, "step": 272 }, { "epoch": 0.7755681818181818, "grad_norm": 0.3619890541849985, "learning_rate": 4.395591418145519e-06, "loss": 0.2671, "step": 273 }, { "epoch": 0.7784090909090909, "grad_norm": 0.43611885998338823, "learning_rate": 4.390527389157989e-06, "loss": 0.3481, "step": 274 }, { "epoch": 0.78125, "grad_norm": 0.411038314679305, "learning_rate": 4.385445177256646e-06, "loss": 0.3283, "step": 275 }, { "epoch": 0.7840909090909091, "grad_norm": 0.4004177118376606, "learning_rate": 4.380344831321722e-06, "loss": 0.3421, "step": 276 }, { "epoch": 0.7869318181818182, "grad_norm": 0.31547958031028983, "learning_rate": 4.375226400407863e-06, "loss": 0.2541, "step": 277 }, { "epoch": 0.7897727272727273, "grad_norm": 0.36900762280860266, "learning_rate": 4.370089933743654e-06, "loss": 0.3097, "step": 278 }, { "epoch": 0.7926136363636364, "grad_norm": 0.4686945698836896, "learning_rate": 4.364935480731147e-06, "loss": 0.2918, "step": 279 }, { "epoch": 0.7954545454545454, "grad_norm": 0.3509902009735286, "learning_rate": 4.3597630909453835e-06, "loss": 0.2646, "step": 280 }, { "epoch": 0.7982954545454546, "grad_norm": 0.30875325359327965, "learning_rate": 4.35457281413392e-06, "loss": 0.2349, "step": 281 }, { "epoch": 0.8011363636363636, "grad_norm": 0.3943745151294021, "learning_rate": 4.349364700216346e-06, "loss": 0.2764, "step": 282 }, { "epoch": 0.8039772727272727, "grad_norm": 0.35558604531483284, "learning_rate": 4.344138799283814e-06, "loss": 0.2442, "step": 283 }, { "epoch": 0.8068181818181818, "grad_norm": 0.38278211936173095, "learning_rate": 4.338895161598541e-06, "loss": 0.3294, "step": 284 }, { "epoch": 0.8096590909090909, "grad_norm": 0.3932974746294698, "learning_rate": 4.333633837593341e-06, "loss": 0.2951, "step": 285 }, { "epoch": 0.8125, "grad_norm": 0.31762648150994005, "learning_rate": 4.328354877871131e-06, "loss": 0.2612, "step": 286 }, { "epoch": 0.8153409090909091, "grad_norm": 0.3405862130473983, "learning_rate": 4.323058333204446e-06, "loss": 0.2833, "step": 287 }, { "epoch": 0.8181818181818182, "grad_norm": 0.31883855959276614, "learning_rate": 4.317744254534954e-06, "loss": 0.2609, "step": 288 }, { "epoch": 0.8210227272727273, "grad_norm": 0.39913277335187336, "learning_rate": 4.312412692972959e-06, "loss": 0.2758, "step": 289 }, { "epoch": 0.8238636363636364, "grad_norm": 0.39064418227258985, "learning_rate": 4.307063699796918e-06, "loss": 0.2664, "step": 290 }, { "epoch": 0.8267045454545454, "grad_norm": 0.3126978473531618, "learning_rate": 4.301697326452942e-06, "loss": 0.2572, "step": 291 }, { "epoch": 0.8295454545454546, "grad_norm": 0.3641340405050646, "learning_rate": 4.296313624554303e-06, "loss": 0.286, "step": 292 }, { "epoch": 0.8323863636363636, "grad_norm": 0.4168496899263259, "learning_rate": 4.290912645880936e-06, "loss": 0.3035, "step": 293 }, { "epoch": 0.8352272727272727, "grad_norm": 0.3466683321895305, "learning_rate": 4.285494442378945e-06, "loss": 0.2853, "step": 294 }, { "epoch": 0.8380681818181818, "grad_norm": 0.3572355149237221, "learning_rate": 4.280059066160098e-06, "loss": 0.3021, "step": 295 }, { "epoch": 0.8409090909090909, "grad_norm": 0.36054386776426756, "learning_rate": 4.274606569501332e-06, "loss": 0.3041, "step": 296 }, { "epoch": 0.84375, "grad_norm": 0.3220431488871405, "learning_rate": 4.269137004844242e-06, "loss": 0.2542, "step": 297 }, { "epoch": 0.8465909090909091, "grad_norm": 0.4103185848899213, "learning_rate": 4.2636504247945865e-06, "loss": 0.2859, "step": 298 }, { "epoch": 0.8494318181818182, "grad_norm": 0.3444474167498623, "learning_rate": 4.258146882121772e-06, "loss": 0.3082, "step": 299 }, { "epoch": 0.8522727272727273, "grad_norm": 0.35145064032825696, "learning_rate": 4.252626429758354e-06, "loss": 0.2679, "step": 300 }, { "epoch": 0.8551136363636364, "grad_norm": 0.39931471518127176, "learning_rate": 4.247089120799521e-06, "loss": 0.3486, "step": 301 }, { "epoch": 0.8579545454545454, "grad_norm": 0.2860970262972797, "learning_rate": 4.241535008502587e-06, "loss": 0.23, "step": 302 }, { "epoch": 0.8607954545454546, "grad_norm": 0.4649020596412495, "learning_rate": 4.235964146286479e-06, "loss": 0.3252, "step": 303 }, { "epoch": 0.8636363636363636, "grad_norm": 0.3482820070437071, "learning_rate": 4.230376587731225e-06, "loss": 0.2854, "step": 304 }, { "epoch": 0.8664772727272727, "grad_norm": 0.3269410990279316, "learning_rate": 4.2247723865774336e-06, "loss": 0.2563, "step": 305 }, { "epoch": 0.8693181818181818, "grad_norm": 0.31949294830520775, "learning_rate": 4.219151596725782e-06, "loss": 0.2688, "step": 306 }, { "epoch": 0.8721590909090909, "grad_norm": 0.43502447469171057, "learning_rate": 4.213514272236499e-06, "loss": 0.3386, "step": 307 }, { "epoch": 0.875, "grad_norm": 0.3797117211719601, "learning_rate": 4.207860467328835e-06, "loss": 0.2855, "step": 308 }, { "epoch": 0.8778409090909091, "grad_norm": 0.3799062699361923, "learning_rate": 4.202190236380552e-06, "loss": 0.2545, "step": 309 }, { "epoch": 0.8806818181818182, "grad_norm": 0.3360385792661154, "learning_rate": 4.196503633927398e-06, "loss": 0.2909, "step": 310 }, { "epoch": 0.8835227272727273, "grad_norm": 0.4188943106281552, "learning_rate": 4.190800714662576e-06, "loss": 0.3291, "step": 311 }, { "epoch": 0.8863636363636364, "grad_norm": 0.43183074366157487, "learning_rate": 4.185081533436226e-06, "loss": 0.3303, "step": 312 }, { "epoch": 0.8892045454545454, "grad_norm": 0.35087669084397133, "learning_rate": 4.179346145254892e-06, "loss": 0.3152, "step": 313 }, { "epoch": 0.8920454545454546, "grad_norm": 0.34360678080641915, "learning_rate": 4.173594605280995e-06, "loss": 0.2726, "step": 314 }, { "epoch": 0.8948863636363636, "grad_norm": 0.39638626020449463, "learning_rate": 4.1678269688323045e-06, "loss": 0.3369, "step": 315 }, { "epoch": 0.8977272727272727, "grad_norm": 0.3566510725505037, "learning_rate": 4.1620432913814026e-06, "loss": 0.2469, "step": 316 }, { "epoch": 0.9005681818181818, "grad_norm": 0.32842562735745623, "learning_rate": 4.156243628555151e-06, "loss": 0.3018, "step": 317 }, { "epoch": 0.9034090909090909, "grad_norm": 0.30679142774263857, "learning_rate": 4.150428036134161e-06, "loss": 0.2476, "step": 318 }, { "epoch": 0.90625, "grad_norm": 0.38736943533330265, "learning_rate": 4.144596570052249e-06, "loss": 0.279, "step": 319 }, { "epoch": 0.9090909090909091, "grad_norm": 0.3461038914128392, "learning_rate": 4.1387492863959076e-06, "loss": 0.262, "step": 320 }, { "epoch": 0.9119318181818182, "grad_norm": 0.3328949084965424, "learning_rate": 4.132886241403756e-06, "loss": 0.2841, "step": 321 }, { "epoch": 0.9147727272727273, "grad_norm": 0.3684252037786764, "learning_rate": 4.127007491466008e-06, "loss": 0.3032, "step": 322 }, { "epoch": 0.9176136363636364, "grad_norm": 0.44163540100916987, "learning_rate": 4.121113093123925e-06, "loss": 0.3164, "step": 323 }, { "epoch": 0.9204545454545454, "grad_norm": 0.49048074141989995, "learning_rate": 4.115203103069273e-06, "loss": 0.2623, "step": 324 }, { "epoch": 0.9232954545454546, "grad_norm": 0.34827477492871306, "learning_rate": 4.109277578143779e-06, "loss": 0.2717, "step": 325 }, { "epoch": 0.9261363636363636, "grad_norm": 0.3603610666299997, "learning_rate": 4.10333657533858e-06, "loss": 0.2783, "step": 326 }, { "epoch": 0.9289772727272727, "grad_norm": 0.3901080384564019, "learning_rate": 4.097380151793681e-06, "loss": 0.286, "step": 327 }, { "epoch": 0.9318181818181818, "grad_norm": 0.3598672604385207, "learning_rate": 4.0914083647974025e-06, "loss": 0.3375, "step": 328 }, { "epoch": 0.9346590909090909, "grad_norm": 0.32775404856254314, "learning_rate": 4.085421271785824e-06, "loss": 0.2904, "step": 329 }, { "epoch": 0.9375, "grad_norm": 0.29442351680114387, "learning_rate": 4.079418930342243e-06, "loss": 0.2629, "step": 330 }, { "epoch": 0.9403409090909091, "grad_norm": 0.4405796100351076, "learning_rate": 4.0734013981966125e-06, "loss": 0.3665, "step": 331 }, { "epoch": 0.9431818181818182, "grad_norm": 0.3334068109525356, "learning_rate": 4.0673687332249866e-06, "loss": 0.3079, "step": 332 }, { "epoch": 0.9460227272727273, "grad_norm": 0.32669985590044703, "learning_rate": 4.061320993448968e-06, "loss": 0.2904, "step": 333 }, { "epoch": 0.9488636363636364, "grad_norm": 0.3442146928076968, "learning_rate": 4.055258237035146e-06, "loss": 0.3146, "step": 334 }, { "epoch": 0.9517045454545454, "grad_norm": 0.4309052746676042, "learning_rate": 4.04918052229454e-06, "loss": 0.3446, "step": 335 }, { "epoch": 0.9545454545454546, "grad_norm": 0.35908542610160016, "learning_rate": 4.043087907682035e-06, "loss": 0.2534, "step": 336 }, { "epoch": 0.9573863636363636, "grad_norm": 0.3894188962377372, "learning_rate": 4.036980451795822e-06, "loss": 0.3262, "step": 337 }, { "epoch": 0.9602272727272727, "grad_norm": 0.37392061032103363, "learning_rate": 4.030858213376838e-06, "loss": 0.3158, "step": 338 }, { "epoch": 0.9630681818181818, "grad_norm": 0.3880624083667109, "learning_rate": 4.02472125130819e-06, "loss": 0.2908, "step": 339 }, { "epoch": 0.9659090909090909, "grad_norm": 0.4031632690009814, "learning_rate": 4.018569624614602e-06, "loss": 0.3279, "step": 340 }, { "epoch": 0.96875, "grad_norm": 0.38583919245780574, "learning_rate": 4.012403392461837e-06, "loss": 0.2657, "step": 341 }, { "epoch": 0.9715909090909091, "grad_norm": 0.4657940346556613, "learning_rate": 4.006222614156132e-06, "loss": 0.3176, "step": 342 }, { "epoch": 0.9744318181818182, "grad_norm": 0.28406132307929355, "learning_rate": 4.000027349143633e-06, "loss": 0.2261, "step": 343 }, { "epoch": 0.9772727272727273, "grad_norm": 0.3809447081607224, "learning_rate": 3.993817657009808e-06, "loss": 0.291, "step": 344 }, { "epoch": 0.9801136363636364, "grad_norm": 0.37276416289236974, "learning_rate": 3.987593597478894e-06, "loss": 0.3229, "step": 345 }, { "epoch": 0.9829545454545454, "grad_norm": 0.36213806018136363, "learning_rate": 3.981355230413305e-06, "loss": 0.2785, "step": 346 }, { "epoch": 0.9857954545454546, "grad_norm": 0.3774008729788378, "learning_rate": 3.975102615813068e-06, "loss": 0.272, "step": 347 }, { "epoch": 0.9886363636363636, "grad_norm": 0.3268419464248498, "learning_rate": 3.968835813815236e-06, "loss": 0.2468, "step": 348 }, { "epoch": 0.9914772727272727, "grad_norm": 0.401670934547313, "learning_rate": 3.962554884693323e-06, "loss": 0.2953, "step": 349 }, { "epoch": 0.9943181818181818, "grad_norm": 0.40169610324443583, "learning_rate": 3.956259888856708e-06, "loss": 0.2939, "step": 350 }, { "epoch": 0.9971590909090909, "grad_norm": 0.2891600640815435, "learning_rate": 3.949950886850069e-06, "loss": 0.2805, "step": 351 }, { "epoch": 1.0, "grad_norm": 0.3279215818041681, "learning_rate": 3.943627939352789e-06, "loss": 0.2598, "step": 352 }, { "epoch": 1.0028409090909092, "grad_norm": 0.3533913319935541, "learning_rate": 3.9372911071783805e-06, "loss": 0.2673, "step": 353 }, { "epoch": 1.0056818181818181, "grad_norm": 0.38416565428145066, "learning_rate": 3.930940451273898e-06, "loss": 0.2933, "step": 354 }, { "epoch": 1.0085227272727273, "grad_norm": 0.41220420942768127, "learning_rate": 3.924576032719349e-06, "loss": 0.2952, "step": 355 }, { "epoch": 1.0113636363636365, "grad_norm": 0.4096268298831798, "learning_rate": 3.9181979127271076e-06, "loss": 0.2575, "step": 356 }, { "epoch": 1.0142045454545454, "grad_norm": 0.45379315898269595, "learning_rate": 3.911806152641333e-06, "loss": 0.2717, "step": 357 }, { "epoch": 1.0170454545454546, "grad_norm": 0.32770827000624236, "learning_rate": 3.9054008139373675e-06, "loss": 0.266, "step": 358 }, { "epoch": 1.0198863636363635, "grad_norm": 0.2965104343367262, "learning_rate": 3.8989819582211555e-06, "loss": 0.2548, "step": 359 }, { "epoch": 1.0227272727272727, "grad_norm": 0.4054461782711258, "learning_rate": 3.892549647228642e-06, "loss": 0.3398, "step": 360 }, { "epoch": 1.0255681818181819, "grad_norm": 0.39022556113460055, "learning_rate": 3.886103942825189e-06, "loss": 0.2826, "step": 361 }, { "epoch": 1.0284090909090908, "grad_norm": 0.3374532413491821, "learning_rate": 3.879644907004972e-06, "loss": 0.2644, "step": 362 }, { "epoch": 1.03125, "grad_norm": 0.337718457045594, "learning_rate": 3.873172601890386e-06, "loss": 0.2545, "step": 363 }, { "epoch": 1.0340909090909092, "grad_norm": 0.3729922751436951, "learning_rate": 3.86668708973145e-06, "loss": 0.2951, "step": 364 }, { "epoch": 1.0369318181818181, "grad_norm": 0.31238473142978845, "learning_rate": 3.860188432905209e-06, "loss": 0.2537, "step": 365 }, { "epoch": 1.0397727272727273, "grad_norm": 0.37350151083829397, "learning_rate": 3.853676693915129e-06, "loss": 0.2614, "step": 366 }, { "epoch": 1.0426136363636365, "grad_norm": 0.3575634359205247, "learning_rate": 3.8471519353905025e-06, "loss": 0.2437, "step": 367 }, { "epoch": 1.0454545454545454, "grad_norm": 0.3537757819725644, "learning_rate": 3.840614220085837e-06, "loss": 0.2747, "step": 368 }, { "epoch": 1.0482954545454546, "grad_norm": 0.34943668518465093, "learning_rate": 3.834063610880263e-06, "loss": 0.2844, "step": 369 }, { "epoch": 1.0511363636363635, "grad_norm": 0.32611370130766987, "learning_rate": 3.827500170776921e-06, "loss": 0.2578, "step": 370 }, { "epoch": 1.0539772727272727, "grad_norm": 0.29743321074762596, "learning_rate": 3.8209239629023565e-06, "loss": 0.2361, "step": 371 }, { "epoch": 1.0568181818181819, "grad_norm": 0.3317934285561481, "learning_rate": 3.814335050505916e-06, "loss": 0.2645, "step": 372 }, { "epoch": 1.0596590909090908, "grad_norm": 0.40729226447208133, "learning_rate": 3.8077334969591377e-06, "loss": 0.2929, "step": 373 }, { "epoch": 1.0625, "grad_norm": 0.35583822537265253, "learning_rate": 3.801119365755138e-06, "loss": 0.3036, "step": 374 }, { "epoch": 1.0653409090909092, "grad_norm": 0.47116931222172215, "learning_rate": 3.7944927205080073e-06, "loss": 0.2962, "step": 375 }, { "epoch": 1.0681818181818181, "grad_norm": 0.4620500786524589, "learning_rate": 3.7878536249521935e-06, "loss": 0.3186, "step": 376 }, { "epoch": 1.0710227272727273, "grad_norm": 0.4310223125222202, "learning_rate": 3.7812021429418886e-06, "loss": 0.305, "step": 377 }, { "epoch": 1.0738636363636365, "grad_norm": 0.35860375920691345, "learning_rate": 3.77453833845042e-06, "loss": 0.3124, "step": 378 }, { "epoch": 1.0767045454545454, "grad_norm": 0.40493909967111513, "learning_rate": 3.7678622755696292e-06, "loss": 0.2649, "step": 379 }, { "epoch": 1.0795454545454546, "grad_norm": 0.3699164344949677, "learning_rate": 3.7611740185092587e-06, "loss": 0.3346, "step": 380 }, { "epoch": 1.0823863636363635, "grad_norm": 0.5931781411606138, "learning_rate": 3.754473631596332e-06, "loss": 0.2729, "step": 381 }, { "epoch": 1.0852272727272727, "grad_norm": 0.3122039055630976, "learning_rate": 3.7477611792745384e-06, "loss": 0.2816, "step": 382 }, { "epoch": 1.0880681818181819, "grad_norm": 0.35273556528651445, "learning_rate": 3.7410367261036094e-06, "loss": 0.2765, "step": 383 }, { "epoch": 1.0909090909090908, "grad_norm": 0.361323677115818, "learning_rate": 3.7343003367587e-06, "loss": 0.2831, "step": 384 }, { "epoch": 1.09375, "grad_norm": 0.3776789429609578, "learning_rate": 3.727552076029767e-06, "loss": 0.3006, "step": 385 }, { "epoch": 1.0965909090909092, "grad_norm": 0.4049848534001206, "learning_rate": 3.7207920088209454e-06, "loss": 0.3213, "step": 386 }, { "epoch": 1.0994318181818181, "grad_norm": 0.3541711790485223, "learning_rate": 3.7140202001499214e-06, "loss": 0.2902, "step": 387 }, { "epoch": 1.1022727272727273, "grad_norm": 0.3501668624619801, "learning_rate": 3.707236715147312e-06, "loss": 0.2809, "step": 388 }, { "epoch": 1.1051136363636365, "grad_norm": 0.38321621491594765, "learning_rate": 3.700441619056035e-06, "loss": 0.3163, "step": 389 }, { "epoch": 1.1079545454545454, "grad_norm": 0.4044457614031915, "learning_rate": 3.693634977230681e-06, "loss": 0.2862, "step": 390 }, { "epoch": 1.1107954545454546, "grad_norm": 0.40951036198359486, "learning_rate": 3.686816855136891e-06, "loss": 0.28, "step": 391 }, { "epoch": 1.1136363636363635, "grad_norm": 0.29410719311103134, "learning_rate": 3.679987318350717e-06, "loss": 0.2299, "step": 392 }, { "epoch": 1.1164772727272727, "grad_norm": 0.3819079818809885, "learning_rate": 3.673146432557998e-06, "loss": 0.32, "step": 393 }, { "epoch": 1.1193181818181819, "grad_norm": 0.3628245440460693, "learning_rate": 3.666294263553729e-06, "loss": 0.2724, "step": 394 }, { "epoch": 1.1221590909090908, "grad_norm": 0.34928329721642853, "learning_rate": 3.659430877241423e-06, "loss": 0.248, "step": 395 }, { "epoch": 1.125, "grad_norm": 0.442925717620733, "learning_rate": 3.6525563396324826e-06, "loss": 0.2942, "step": 396 }, { "epoch": 1.1278409090909092, "grad_norm": 0.4525323331322651, "learning_rate": 3.6456707168455584e-06, "loss": 0.3258, "step": 397 }, { "epoch": 1.1306818181818181, "grad_norm": 0.39153087965891287, "learning_rate": 3.6387740751059218e-06, "loss": 0.3072, "step": 398 }, { "epoch": 1.1335227272727273, "grad_norm": 0.3886102447660378, "learning_rate": 3.6318664807448218e-06, "loss": 0.3415, "step": 399 }, { "epoch": 1.1363636363636362, "grad_norm": 0.3642339507412296, "learning_rate": 3.6249480001988463e-06, "loss": 0.2691, "step": 400 }, { "epoch": 1.1392045454545454, "grad_norm": 0.3380651370346197, "learning_rate": 3.6180187000092894e-06, "loss": 0.2791, "step": 401 }, { "epoch": 1.1420454545454546, "grad_norm": 0.3193544491508243, "learning_rate": 3.611078646821505e-06, "loss": 0.2326, "step": 402 }, { "epoch": 1.1448863636363638, "grad_norm": 0.30524333443799656, "learning_rate": 3.6041279073842684e-06, "loss": 0.2489, "step": 403 }, { "epoch": 1.1477272727272727, "grad_norm": 0.39683144371135337, "learning_rate": 3.597166548549136e-06, "loss": 0.2656, "step": 404 }, { "epoch": 1.1505681818181819, "grad_norm": 0.39975422805218463, "learning_rate": 3.590194637269798e-06, "loss": 0.2823, "step": 405 }, { "epoch": 1.1534090909090908, "grad_norm": 0.3781718281788356, "learning_rate": 3.5832122406014398e-06, "loss": 0.2545, "step": 406 }, { "epoch": 1.15625, "grad_norm": 0.39633632407524205, "learning_rate": 3.576219425700092e-06, "loss": 0.2656, "step": 407 }, { "epoch": 1.1590909090909092, "grad_norm": 0.503126670284463, "learning_rate": 3.5692162598219877e-06, "loss": 0.3106, "step": 408 }, { "epoch": 1.1619318181818181, "grad_norm": 0.3803993289484403, "learning_rate": 3.5622028103229154e-06, "loss": 0.2777, "step": 409 }, { "epoch": 1.1647727272727273, "grad_norm": 0.32896270814306483, "learning_rate": 3.555179144657568e-06, "loss": 0.2681, "step": 410 }, { "epoch": 1.1676136363636362, "grad_norm": 0.45079184347220275, "learning_rate": 3.548145330378901e-06, "loss": 0.298, "step": 411 }, { "epoch": 1.1704545454545454, "grad_norm": 0.3409745563125651, "learning_rate": 3.5411014351374735e-06, "loss": 0.2829, "step": 412 }, { "epoch": 1.1732954545454546, "grad_norm": 0.3524051821269997, "learning_rate": 3.5340475266808046e-06, "loss": 0.2897, "step": 413 }, { "epoch": 1.1761363636363638, "grad_norm": 0.31354296956532873, "learning_rate": 3.5269836728527194e-06, "loss": 0.2512, "step": 414 }, { "epoch": 1.1789772727272727, "grad_norm": 0.2819333444591201, "learning_rate": 3.5199099415926985e-06, "loss": 0.2336, "step": 415 }, { "epoch": 1.1818181818181819, "grad_norm": 0.3667062945127836, "learning_rate": 3.5128264009352177e-06, "loss": 0.2797, "step": 416 }, { "epoch": 1.1846590909090908, "grad_norm": 0.3717065816803459, "learning_rate": 3.5057331190091036e-06, "loss": 0.2625, "step": 417 }, { "epoch": 1.1875, "grad_norm": 0.34247191523071263, "learning_rate": 3.4986301640368726e-06, "loss": 0.2915, "step": 418 }, { "epoch": 1.1903409090909092, "grad_norm": 0.28055115946196074, "learning_rate": 3.4915176043340726e-06, "loss": 0.2323, "step": 419 }, { "epoch": 1.1931818181818181, "grad_norm": 0.3512617852047132, "learning_rate": 3.4843955083086315e-06, "loss": 0.276, "step": 420 }, { "epoch": 1.1960227272727273, "grad_norm": 0.3402592655838616, "learning_rate": 3.477263944460196e-06, "loss": 0.258, "step": 421 }, { "epoch": 1.1988636363636362, "grad_norm": 0.3440775197912379, "learning_rate": 3.4701229813794744e-06, "loss": 0.2686, "step": 422 }, { "epoch": 1.2017045454545454, "grad_norm": 0.32159613738142184, "learning_rate": 3.4629726877475733e-06, "loss": 0.2775, "step": 423 }, { "epoch": 1.2045454545454546, "grad_norm": 0.3405153808986929, "learning_rate": 3.4558131323353423e-06, "loss": 0.2947, "step": 424 }, { "epoch": 1.2073863636363638, "grad_norm": 0.4111884872726661, "learning_rate": 3.4486443840027084e-06, "loss": 0.2427, "step": 425 }, { "epoch": 1.2102272727272727, "grad_norm": 0.38692560086654654, "learning_rate": 3.4414665116980167e-06, "loss": 0.3084, "step": 426 }, { "epoch": 1.2130681818181819, "grad_norm": 0.4000466884476275, "learning_rate": 3.4342795844573634e-06, "loss": 0.2933, "step": 427 }, { "epoch": 1.2159090909090908, "grad_norm": 0.3605831840618787, "learning_rate": 3.427083671403937e-06, "loss": 0.2892, "step": 428 }, { "epoch": 1.21875, "grad_norm": 0.3225439729294941, "learning_rate": 3.4198788417473485e-06, "loss": 0.2579, "step": 429 }, { "epoch": 1.2215909090909092, "grad_norm": 0.3869565428112392, "learning_rate": 3.41266516478297e-06, "loss": 0.3349, "step": 430 }, { "epoch": 1.2244318181818181, "grad_norm": 0.3790938940448294, "learning_rate": 3.4054427098912636e-06, "loss": 0.2836, "step": 431 }, { "epoch": 1.2272727272727273, "grad_norm": 0.33485764653621325, "learning_rate": 3.3982115465371185e-06, "loss": 0.2465, "step": 432 }, { "epoch": 1.2301136363636362, "grad_norm": 0.3421027182025914, "learning_rate": 3.390971744269181e-06, "loss": 0.2436, "step": 433 }, { "epoch": 1.2329545454545454, "grad_norm": 0.3343569283936874, "learning_rate": 3.3837233727191856e-06, "loss": 0.2533, "step": 434 }, { "epoch": 1.2357954545454546, "grad_norm": 0.3490337805677148, "learning_rate": 3.3764665016012842e-06, "loss": 0.2401, "step": 435 }, { "epoch": 1.2386363636363638, "grad_norm": 0.3116736362955648, "learning_rate": 3.3692012007113776e-06, "loss": 0.2482, "step": 436 }, { "epoch": 1.2414772727272727, "grad_norm": 0.3963218536576595, "learning_rate": 3.3619275399264444e-06, "loss": 0.2944, "step": 437 }, { "epoch": 1.2443181818181819, "grad_norm": 0.39432480274886955, "learning_rate": 3.3546455892038666e-06, "loss": 0.2918, "step": 438 }, { "epoch": 1.2471590909090908, "grad_norm": 0.3775480283393243, "learning_rate": 3.3473554185807573e-06, "loss": 0.2771, "step": 439 }, { "epoch": 1.25, "grad_norm": 0.34490450741107803, "learning_rate": 3.340057098173288e-06, "loss": 0.2756, "step": 440 }, { "epoch": 1.2528409090909092, "grad_norm": 0.3324905873722346, "learning_rate": 3.3327506981760183e-06, "loss": 0.2608, "step": 441 }, { "epoch": 1.2556818181818181, "grad_norm": 0.47138267546166734, "learning_rate": 3.32543628886121e-06, "loss": 0.3077, "step": 442 }, { "epoch": 1.2585227272727273, "grad_norm": 0.2953842775844083, "learning_rate": 3.3181139405781616e-06, "loss": 0.2377, "step": 443 }, { "epoch": 1.2613636363636362, "grad_norm": 0.3612627525520785, "learning_rate": 3.3107837237525274e-06, "loss": 0.2427, "step": 444 }, { "epoch": 1.2642045454545454, "grad_norm": 0.3653963278501932, "learning_rate": 3.3034457088856396e-06, "loss": 0.2559, "step": 445 }, { "epoch": 1.2670454545454546, "grad_norm": 0.3129568330696853, "learning_rate": 3.2960999665538335e-06, "loss": 0.2534, "step": 446 }, { "epoch": 1.2698863636363638, "grad_norm": 0.3510947430261117, "learning_rate": 3.288746567407763e-06, "loss": 0.2502, "step": 447 }, { "epoch": 1.2727272727272727, "grad_norm": 0.3437157582636368, "learning_rate": 3.281385582171727e-06, "loss": 0.2525, "step": 448 }, { "epoch": 1.2755681818181819, "grad_norm": 0.3888446263801318, "learning_rate": 3.274017081642986e-06, "loss": 0.2885, "step": 449 }, { "epoch": 1.2784090909090908, "grad_norm": 0.35942811400817226, "learning_rate": 3.2666411366910827e-06, "loss": 0.2571, "step": 450 }, { "epoch": 1.28125, "grad_norm": 0.41674090701769867, "learning_rate": 3.2592578182571583e-06, "loss": 0.2973, "step": 451 }, { "epoch": 1.2840909090909092, "grad_norm": 0.3702323179560626, "learning_rate": 3.2518671973532704e-06, "loss": 0.2415, "step": 452 }, { "epoch": 1.2869318181818181, "grad_norm": 0.36007563550430505, "learning_rate": 3.244469345061715e-06, "loss": 0.2277, "step": 453 }, { "epoch": 1.2897727272727273, "grad_norm": 0.3914691699646844, "learning_rate": 3.237064332534336e-06, "loss": 0.2828, "step": 454 }, { "epoch": 1.2926136363636362, "grad_norm": 0.3522104855581335, "learning_rate": 3.229652230991843e-06, "loss": 0.2671, "step": 455 }, { "epoch": 1.2954545454545454, "grad_norm": 0.3553148108185653, "learning_rate": 3.2222331117231283e-06, "loss": 0.2817, "step": 456 }, { "epoch": 1.2982954545454546, "grad_norm": 0.3771227330111479, "learning_rate": 3.2148070460845814e-06, "loss": 0.274, "step": 457 }, { "epoch": 1.3011363636363638, "grad_norm": 0.41388528735027136, "learning_rate": 3.2073741054994e-06, "loss": 0.3181, "step": 458 }, { "epoch": 1.3039772727272727, "grad_norm": 0.33865063205260826, "learning_rate": 3.199934361456903e-06, "loss": 0.2634, "step": 459 }, { "epoch": 1.3068181818181819, "grad_norm": 0.3520115660135833, "learning_rate": 3.1924878855118475e-06, "loss": 0.2618, "step": 460 }, { "epoch": 1.3096590909090908, "grad_norm": 0.40034402955639337, "learning_rate": 3.185034749283734e-06, "loss": 0.2837, "step": 461 }, { "epoch": 1.3125, "grad_norm": 0.34422942980117177, "learning_rate": 3.1775750244561233e-06, "loss": 0.2638, "step": 462 }, { "epoch": 1.3153409090909092, "grad_norm": 0.38963794033279253, "learning_rate": 3.1701087827759434e-06, "loss": 0.294, "step": 463 }, { "epoch": 1.3181818181818181, "grad_norm": 0.4262376192411251, "learning_rate": 3.162636096052803e-06, "loss": 0.3342, "step": 464 }, { "epoch": 1.3210227272727273, "grad_norm": 0.38196782588004025, "learning_rate": 3.155157036158295e-06, "loss": 0.281, "step": 465 }, { "epoch": 1.3238636363636362, "grad_norm": 0.39128577037723217, "learning_rate": 3.147671675025313e-06, "loss": 0.2864, "step": 466 }, { "epoch": 1.3267045454545454, "grad_norm": 0.3622238856754979, "learning_rate": 3.1401800846473506e-06, "loss": 0.2742, "step": 467 }, { "epoch": 1.3295454545454546, "grad_norm": 0.3187408313823274, "learning_rate": 3.132682337077818e-06, "loss": 0.2549, "step": 468 }, { "epoch": 1.3323863636363638, "grad_norm": 0.33256196577073566, "learning_rate": 3.1251785044293425e-06, "loss": 0.2921, "step": 469 }, { "epoch": 1.3352272727272727, "grad_norm": 0.377119549478706, "learning_rate": 3.117668658873078e-06, "loss": 0.2722, "step": 470 }, { "epoch": 1.3380681818181819, "grad_norm": 0.31419013026351733, "learning_rate": 3.1101528726380085e-06, "loss": 0.2519, "step": 471 }, { "epoch": 1.3409090909090908, "grad_norm": 0.3471415869479363, "learning_rate": 3.102631218010257e-06, "loss": 0.2817, "step": 472 }, { "epoch": 1.34375, "grad_norm": 0.37953158089107286, "learning_rate": 3.0951037673323863e-06, "loss": 0.2642, "step": 473 }, { "epoch": 1.3465909090909092, "grad_norm": 0.34488245509452714, "learning_rate": 3.0875705930027065e-06, "loss": 0.2499, "step": 474 }, { "epoch": 1.3494318181818181, "grad_norm": 0.29818790329911665, "learning_rate": 3.0800317674745755e-06, "loss": 0.2572, "step": 475 }, { "epoch": 1.3522727272727273, "grad_norm": 0.35582979006101406, "learning_rate": 3.0724873632557068e-06, "loss": 0.2806, "step": 476 }, { "epoch": 1.3551136363636362, "grad_norm": 0.3886707765043663, "learning_rate": 3.064937452907465e-06, "loss": 0.2395, "step": 477 }, { "epoch": 1.3579545454545454, "grad_norm": 0.39452409132776717, "learning_rate": 3.057382109044177e-06, "loss": 0.2748, "step": 478 }, { "epoch": 1.3607954545454546, "grad_norm": 0.34362558608870675, "learning_rate": 3.049821404332424e-06, "loss": 0.2664, "step": 479 }, { "epoch": 1.3636363636363638, "grad_norm": 0.3923547533127044, "learning_rate": 3.0422554114903514e-06, "loss": 0.3134, "step": 480 }, { "epoch": 1.3664772727272727, "grad_norm": 0.42311598203108824, "learning_rate": 3.0346842032869624e-06, "loss": 0.3227, "step": 481 }, { "epoch": 1.3693181818181819, "grad_norm": 0.49341501720924236, "learning_rate": 3.0271078525414234e-06, "loss": 0.2789, "step": 482 }, { "epoch": 1.3721590909090908, "grad_norm": 0.3923870792288359, "learning_rate": 3.0195264321223584e-06, "loss": 0.3003, "step": 483 }, { "epoch": 1.375, "grad_norm": 0.5047411107384405, "learning_rate": 3.0119400149471535e-06, "loss": 0.2835, "step": 484 }, { "epoch": 1.3778409090909092, "grad_norm": 0.3431083613633404, "learning_rate": 3.004348673981252e-06, "loss": 0.2744, "step": 485 }, { "epoch": 1.3806818181818181, "grad_norm": 0.3370392701002557, "learning_rate": 2.996752482237456e-06, "loss": 0.2503, "step": 486 }, { "epoch": 1.3835227272727273, "grad_norm": 0.35789574905836263, "learning_rate": 2.9891515127752172e-06, "loss": 0.2558, "step": 487 }, { "epoch": 1.3863636363636362, "grad_norm": 0.39542709664531145, "learning_rate": 2.981545838699943e-06, "loss": 0.2499, "step": 488 }, { "epoch": 1.3892045454545454, "grad_norm": 0.4799271866705037, "learning_rate": 2.9739355331622886e-06, "loss": 0.2845, "step": 489 }, { "epoch": 1.3920454545454546, "grad_norm": 0.30250300604212543, "learning_rate": 2.966320669357453e-06, "loss": 0.2428, "step": 490 }, { "epoch": 1.3948863636363638, "grad_norm": 0.27928557627455064, "learning_rate": 2.9587013205244767e-06, "loss": 0.2354, "step": 491 }, { "epoch": 1.3977272727272727, "grad_norm": 0.3254689902299252, "learning_rate": 2.951077559945538e-06, "loss": 0.2719, "step": 492 }, { "epoch": 1.4005681818181819, "grad_norm": 0.38918459975286523, "learning_rate": 2.943449460945244e-06, "loss": 0.2726, "step": 493 }, { "epoch": 1.4034090909090908, "grad_norm": 0.29871192903714955, "learning_rate": 2.9358170968899323e-06, "loss": 0.263, "step": 494 }, { "epoch": 1.40625, "grad_norm": 0.3943630183447143, "learning_rate": 2.9281805411869573e-06, "loss": 0.2931, "step": 495 }, { "epoch": 1.4090909090909092, "grad_norm": 0.34932644595142737, "learning_rate": 2.920539867283992e-06, "loss": 0.2577, "step": 496 }, { "epoch": 1.4119318181818181, "grad_norm": 0.36296363929883135, "learning_rate": 2.9128951486683144e-06, "loss": 0.2884, "step": 497 }, { "epoch": 1.4147727272727273, "grad_norm": 0.3536090241186941, "learning_rate": 2.9052464588661076e-06, "loss": 0.2518, "step": 498 }, { "epoch": 1.4176136363636362, "grad_norm": 0.4071123114766137, "learning_rate": 2.8975938714417466e-06, "loss": 0.2955, "step": 499 }, { "epoch": 1.4204545454545454, "grad_norm": 0.36319240545094117, "learning_rate": 2.8899374599970943e-06, "loss": 0.2933, "step": 500 }, { "epoch": 1.4232954545454546, "grad_norm": 0.33541538203913807, "learning_rate": 2.882277298170792e-06, "loss": 0.2693, "step": 501 }, { "epoch": 1.4261363636363638, "grad_norm": 0.42293889077814073, "learning_rate": 2.8746134596375534e-06, "loss": 0.2907, "step": 502 }, { "epoch": 1.4289772727272727, "grad_norm": 0.3702782961580686, "learning_rate": 2.866946018107453e-06, "loss": 0.2701, "step": 503 }, { "epoch": 1.4318181818181819, "grad_norm": 0.3454390175085058, "learning_rate": 2.8592750473252197e-06, "loss": 0.2612, "step": 504 }, { "epoch": 1.4346590909090908, "grad_norm": 0.33107307095308464, "learning_rate": 2.8516006210695244e-06, "loss": 0.239, "step": 505 }, { "epoch": 1.4375, "grad_norm": 0.3569062909249772, "learning_rate": 2.843922813152275e-06, "loss": 0.2755, "step": 506 }, { "epoch": 1.4403409090909092, "grad_norm": 0.37131837135922086, "learning_rate": 2.836241697417902e-06, "loss": 0.2623, "step": 507 }, { "epoch": 1.4431818181818181, "grad_norm": 0.3699557028893426, "learning_rate": 2.8285573477426504e-06, "loss": 0.2811, "step": 508 }, { "epoch": 1.4460227272727273, "grad_norm": 0.33561480648358855, "learning_rate": 2.820869838033871e-06, "loss": 0.2686, "step": 509 }, { "epoch": 1.4488636363636362, "grad_norm": 0.4711840304366533, "learning_rate": 2.813179242229304e-06, "loss": 0.2946, "step": 510 }, { "epoch": 1.4517045454545454, "grad_norm": 0.382672820843295, "learning_rate": 2.805485634296374e-06, "loss": 0.2945, "step": 511 }, { "epoch": 1.4545454545454546, "grad_norm": 0.3264806302650397, "learning_rate": 2.7977890882314763e-06, "loss": 0.2658, "step": 512 }, { "epoch": 1.4573863636363638, "grad_norm": 0.3590459125833833, "learning_rate": 2.7900896780592616e-06, "loss": 0.2675, "step": 513 }, { "epoch": 1.4602272727272727, "grad_norm": 0.41777977412669154, "learning_rate": 2.7823874778319316e-06, "loss": 0.3133, "step": 514 }, { "epoch": 1.4630681818181819, "grad_norm": 0.3700743186678299, "learning_rate": 2.774682561628519e-06, "loss": 0.2781, "step": 515 }, { "epoch": 1.4659090909090908, "grad_norm": 0.3586139592020702, "learning_rate": 2.7669750035541798e-06, "loss": 0.2709, "step": 516 }, { "epoch": 1.46875, "grad_norm": 0.32668952210259083, "learning_rate": 2.759264877739481e-06, "loss": 0.2628, "step": 517 }, { "epoch": 1.4715909090909092, "grad_norm": 0.3304970455370839, "learning_rate": 2.7515522583396825e-06, "loss": 0.2859, "step": 518 }, { "epoch": 1.4744318181818181, "grad_norm": 0.3188860297893081, "learning_rate": 2.74383721953403e-06, "loss": 0.2435, "step": 519 }, { "epoch": 1.4772727272727273, "grad_norm": 0.3701340525867732, "learning_rate": 2.736119835525037e-06, "loss": 0.2571, "step": 520 }, { "epoch": 1.4801136363636362, "grad_norm": 0.4888553988204271, "learning_rate": 2.728400180537775e-06, "loss": 0.2956, "step": 521 }, { "epoch": 1.4829545454545454, "grad_norm": 0.4110586830757001, "learning_rate": 2.720678328819155e-06, "loss": 0.2396, "step": 522 }, { "epoch": 1.4857954545454546, "grad_norm": 0.3828799651532281, "learning_rate": 2.712954354637218e-06, "loss": 0.2701, "step": 523 }, { "epoch": 1.4886363636363638, "grad_norm": 0.359763211121689, "learning_rate": 2.705228332280418e-06, "loss": 0.2387, "step": 524 }, { "epoch": 1.4914772727272727, "grad_norm": 0.3785795319364518, "learning_rate": 2.6975003360569087e-06, "loss": 0.2761, "step": 525 }, { "epoch": 1.4943181818181819, "grad_norm": 0.34255573500581615, "learning_rate": 2.689770440293825e-06, "loss": 0.267, "step": 526 }, { "epoch": 1.4971590909090908, "grad_norm": 0.37025650452574843, "learning_rate": 2.6820387193365764e-06, "loss": 0.2781, "step": 527 }, { "epoch": 1.5, "grad_norm": 0.35002281689988746, "learning_rate": 2.674305247548125e-06, "loss": 0.2947, "step": 528 }, { "epoch": 1.5028409090909092, "grad_norm": 0.34143779580523753, "learning_rate": 2.6665700993082705e-06, "loss": 0.2658, "step": 529 }, { "epoch": 1.5056818181818183, "grad_norm": 0.3560924867441854, "learning_rate": 2.6588333490129376e-06, "loss": 0.2742, "step": 530 }, { "epoch": 1.5085227272727273, "grad_norm": 0.32295396334903814, "learning_rate": 2.65109507107346e-06, "loss": 0.2382, "step": 531 }, { "epoch": 1.5113636363636362, "grad_norm": 0.33859114158227865, "learning_rate": 2.6433553399158652e-06, "loss": 0.2937, "step": 532 }, { "epoch": 1.5142045454545454, "grad_norm": 0.35244369608972004, "learning_rate": 2.6356142299801544e-06, "loss": 0.3037, "step": 533 }, { "epoch": 1.5170454545454546, "grad_norm": 0.3336662584141403, "learning_rate": 2.6278718157195924e-06, "loss": 0.2844, "step": 534 }, { "epoch": 1.5198863636363638, "grad_norm": 0.35862845558521106, "learning_rate": 2.620128171599989e-06, "loss": 0.246, "step": 535 }, { "epoch": 1.5227272727272727, "grad_norm": 0.31358277794725126, "learning_rate": 2.6123833720989796e-06, "loss": 0.2653, "step": 536 }, { "epoch": 1.5255681818181817, "grad_norm": 0.36029376106362876, "learning_rate": 2.6046374917053156e-06, "loss": 0.2785, "step": 537 }, { "epoch": 1.5284090909090908, "grad_norm": 0.3512123146788697, "learning_rate": 2.5968906049181425e-06, "loss": 0.2723, "step": 538 }, { "epoch": 1.53125, "grad_norm": 0.35559911829983626, "learning_rate": 2.5891427862462853e-06, "loss": 0.2939, "step": 539 }, { "epoch": 1.5340909090909092, "grad_norm": 0.3774459233336894, "learning_rate": 2.581394110207532e-06, "loss": 0.2593, "step": 540 }, { "epoch": 1.5369318181818183, "grad_norm": 0.3213295704503383, "learning_rate": 2.5736446513279166e-06, "loss": 0.2615, "step": 541 }, { "epoch": 1.5397727272727273, "grad_norm": 0.33894998490392014, "learning_rate": 2.5658944841410032e-06, "loss": 0.2856, "step": 542 }, { "epoch": 1.5426136363636362, "grad_norm": 0.4085808452620872, "learning_rate": 2.5581436831871666e-06, "loss": 0.2611, "step": 543 }, { "epoch": 1.5454545454545454, "grad_norm": 0.3377548562078041, "learning_rate": 2.5503923230128787e-06, "loss": 0.2445, "step": 544 }, { "epoch": 1.5482954545454546, "grad_norm": 0.2986016210832829, "learning_rate": 2.5426404781699886e-06, "loss": 0.2345, "step": 545 }, { "epoch": 1.5511363636363638, "grad_norm": 0.3130189679053128, "learning_rate": 2.534888223215008e-06, "loss": 0.2648, "step": 546 }, { "epoch": 1.5539772727272727, "grad_norm": 0.29362772394820585, "learning_rate": 2.5271356327083927e-06, "loss": 0.2231, "step": 547 }, { "epoch": 1.5568181818181817, "grad_norm": 0.3371287342113354, "learning_rate": 2.5193827812138268e-06, "loss": 0.2801, "step": 548 }, { "epoch": 1.5596590909090908, "grad_norm": 0.438680590348071, "learning_rate": 2.511629743297502e-06, "loss": 0.3117, "step": 549 }, { "epoch": 1.5625, "grad_norm": 0.3623332826643985, "learning_rate": 2.5038765935274038e-06, "loss": 0.2582, "step": 550 }, { "epoch": 1.5653409090909092, "grad_norm": 0.3611764461964591, "learning_rate": 2.4961234064725966e-06, "loss": 0.2606, "step": 551 }, { "epoch": 1.5681818181818183, "grad_norm": 0.6683755911265977, "learning_rate": 2.488370256702499e-06, "loss": 0.2686, "step": 552 }, { "epoch": 1.5710227272727273, "grad_norm": 0.3699878510363697, "learning_rate": 2.4806172187861736e-06, "loss": 0.2823, "step": 553 }, { "epoch": 1.5738636363636362, "grad_norm": 0.3603575134404355, "learning_rate": 2.4728643672916073e-06, "loss": 0.2696, "step": 554 }, { "epoch": 1.5767045454545454, "grad_norm": 0.5708462895257692, "learning_rate": 2.465111776784993e-06, "loss": 0.3003, "step": 555 }, { "epoch": 1.5795454545454546, "grad_norm": 0.414861092800249, "learning_rate": 2.4573595218300127e-06, "loss": 0.2878, "step": 556 }, { "epoch": 1.5823863636363638, "grad_norm": 0.36176025431242964, "learning_rate": 2.4496076769871226e-06, "loss": 0.2614, "step": 557 }, { "epoch": 1.5852272727272727, "grad_norm": 0.4170474058146532, "learning_rate": 2.4418563168128346e-06, "loss": 0.2868, "step": 558 }, { "epoch": 1.5880681818181817, "grad_norm": 0.3270649689091589, "learning_rate": 2.4341055158589976e-06, "loss": 0.2699, "step": 559 }, { "epoch": 1.5909090909090908, "grad_norm": 0.3807070125410976, "learning_rate": 2.4263553486720838e-06, "loss": 0.303, "step": 560 }, { "epoch": 1.59375, "grad_norm": 0.3848553762149162, "learning_rate": 2.4186058897924685e-06, "loss": 0.2748, "step": 561 }, { "epoch": 1.5965909090909092, "grad_norm": 0.3232840810454203, "learning_rate": 2.410857213753715e-06, "loss": 0.2445, "step": 562 }, { "epoch": 1.5994318181818183, "grad_norm": 0.3092676360533537, "learning_rate": 2.4031093950818583e-06, "loss": 0.2356, "step": 563 }, { "epoch": 1.6022727272727273, "grad_norm": 0.45118596036379494, "learning_rate": 2.3953625082946856e-06, "loss": 0.2837, "step": 564 }, { "epoch": 1.6051136363636362, "grad_norm": 0.34970482571526373, "learning_rate": 2.3876166279010212e-06, "loss": 0.2973, "step": 565 }, { "epoch": 1.6079545454545454, "grad_norm": 0.3364465296058301, "learning_rate": 2.379871828400012e-06, "loss": 0.2423, "step": 566 }, { "epoch": 1.6107954545454546, "grad_norm": 0.363328151622841, "learning_rate": 2.372128184280408e-06, "loss": 0.269, "step": 567 }, { "epoch": 1.6136363636363638, "grad_norm": 0.26766248199292697, "learning_rate": 2.364385770019846e-06, "loss": 0.2346, "step": 568 }, { "epoch": 1.6164772727272727, "grad_norm": 0.3913465078730921, "learning_rate": 2.356644660084135e-06, "loss": 0.2866, "step": 569 }, { "epoch": 1.6193181818181817, "grad_norm": 0.31905393138162685, "learning_rate": 2.34890492892654e-06, "loss": 0.2666, "step": 570 }, { "epoch": 1.6221590909090908, "grad_norm": 0.3432468450311117, "learning_rate": 2.341166650987064e-06, "loss": 0.2443, "step": 571 }, { "epoch": 1.625, "grad_norm": 0.34070598347786063, "learning_rate": 2.333429900691731e-06, "loss": 0.2968, "step": 572 }, { "epoch": 1.6278409090909092, "grad_norm": 0.4257323783577944, "learning_rate": 2.3256947524518756e-06, "loss": 0.275, "step": 573 }, { "epoch": 1.6306818181818183, "grad_norm": 0.35120372623976087, "learning_rate": 2.317961280663424e-06, "loss": 0.2779, "step": 574 }, { "epoch": 1.6335227272727273, "grad_norm": 0.3288834361465399, "learning_rate": 2.3102295597061757e-06, "loss": 0.262, "step": 575 }, { "epoch": 1.6363636363636362, "grad_norm": 0.3781083785525166, "learning_rate": 2.3024996639430925e-06, "loss": 0.2705, "step": 576 }, { "epoch": 1.6392045454545454, "grad_norm": 0.3309675255585671, "learning_rate": 2.2947716677195823e-06, "loss": 0.2607, "step": 577 }, { "epoch": 1.6420454545454546, "grad_norm": 0.4097606078658523, "learning_rate": 2.2870456453627823e-06, "loss": 0.3267, "step": 578 }, { "epoch": 1.6448863636363638, "grad_norm": 0.25572751310886616, "learning_rate": 2.2793216711808456e-06, "loss": 0.2278, "step": 579 }, { "epoch": 1.6477272727272727, "grad_norm": 0.3060607584281395, "learning_rate": 2.2715998194622257e-06, "loss": 0.2517, "step": 580 }, { "epoch": 1.6505681818181817, "grad_norm": 0.37963871119456877, "learning_rate": 2.2638801644749636e-06, "loss": 0.2634, "step": 581 }, { "epoch": 1.6534090909090908, "grad_norm": 0.3762574705485531, "learning_rate": 2.2561627804659704e-06, "loss": 0.2534, "step": 582 }, { "epoch": 1.65625, "grad_norm": 0.34282885282451137, "learning_rate": 2.2484477416603183e-06, "loss": 0.2666, "step": 583 }, { "epoch": 1.6590909090909092, "grad_norm": 0.3508691585265268, "learning_rate": 2.24073512226052e-06, "loss": 0.2589, "step": 584 }, { "epoch": 1.6619318181818183, "grad_norm": 0.38903092342578377, "learning_rate": 2.2330249964458202e-06, "loss": 0.2853, "step": 585 }, { "epoch": 1.6647727272727273, "grad_norm": 0.3466002683474289, "learning_rate": 2.2253174383714816e-06, "loss": 0.2812, "step": 586 }, { "epoch": 1.6676136363636362, "grad_norm": 0.46395674632161, "learning_rate": 2.21761252216807e-06, "loss": 0.2692, "step": 587 }, { "epoch": 1.6704545454545454, "grad_norm": 0.3699824822038089, "learning_rate": 2.2099103219407392e-06, "loss": 0.2699, "step": 588 }, { "epoch": 1.6732954545454546, "grad_norm": 0.3805031596017454, "learning_rate": 2.2022109117685246e-06, "loss": 0.2953, "step": 589 }, { "epoch": 1.6761363636363638, "grad_norm": 0.37764726137134685, "learning_rate": 2.1945143657036267e-06, "loss": 0.2753, "step": 590 }, { "epoch": 1.6789772727272727, "grad_norm": 0.3304479070305256, "learning_rate": 2.1868207577706964e-06, "loss": 0.2524, "step": 591 }, { "epoch": 1.6818181818181817, "grad_norm": 0.3587520279737923, "learning_rate": 2.1791301619661297e-06, "loss": 0.2602, "step": 592 }, { "epoch": 1.6846590909090908, "grad_norm": 0.3323465218687911, "learning_rate": 2.17144265225735e-06, "loss": 0.2692, "step": 593 }, { "epoch": 1.6875, "grad_norm": 0.3572276587914552, "learning_rate": 2.1637583025820985e-06, "loss": 0.2858, "step": 594 }, { "epoch": 1.6903409090909092, "grad_norm": 0.37800630772529514, "learning_rate": 2.156077186847726e-06, "loss": 0.294, "step": 595 }, { "epoch": 1.6931818181818183, "grad_norm": 0.3421660175170903, "learning_rate": 2.148399378930476e-06, "loss": 0.2573, "step": 596 }, { "epoch": 1.6960227272727273, "grad_norm": 0.34254475964042214, "learning_rate": 2.1407249526747816e-06, "loss": 0.275, "step": 597 }, { "epoch": 1.6988636363636362, "grad_norm": 0.3715201904697272, "learning_rate": 2.133053981892547e-06, "loss": 0.2833, "step": 598 }, { "epoch": 1.7017045454545454, "grad_norm": 0.36015289752626467, "learning_rate": 2.125386540362447e-06, "loss": 0.2828, "step": 599 }, { "epoch": 1.7045454545454546, "grad_norm": 0.40367397113055686, "learning_rate": 2.1177227018292086e-06, "loss": 0.2621, "step": 600 }, { "epoch": 1.7073863636363638, "grad_norm": 0.32129619035430856, "learning_rate": 2.110062540002906e-06, "loss": 0.2757, "step": 601 }, { "epoch": 1.7102272727272727, "grad_norm": 0.3137451287766472, "learning_rate": 2.1024061285582546e-06, "loss": 0.2535, "step": 602 }, { "epoch": 1.7130681818181817, "grad_norm": 0.4280343421587481, "learning_rate": 2.0947535411338936e-06, "loss": 0.2559, "step": 603 }, { "epoch": 1.7159090909090908, "grad_norm": 0.38561258389624026, "learning_rate": 2.087104851331686e-06, "loss": 0.339, "step": 604 }, { "epoch": 1.71875, "grad_norm": 0.3187139343663328, "learning_rate": 2.0794601327160083e-06, "loss": 0.224, "step": 605 }, { "epoch": 1.7215909090909092, "grad_norm": 0.4058807325173988, "learning_rate": 2.0718194588130435e-06, "loss": 0.2743, "step": 606 }, { "epoch": 1.7244318181818183, "grad_norm": 0.3501025253129524, "learning_rate": 2.0641829031100685e-06, "loss": 0.2534, "step": 607 }, { "epoch": 1.7272727272727273, "grad_norm": 0.34621897515864436, "learning_rate": 2.0565505390547558e-06, "loss": 0.2565, "step": 608 }, { "epoch": 1.7301136363636362, "grad_norm": 0.2972165110796837, "learning_rate": 2.0489224400544626e-06, "loss": 0.2472, "step": 609 }, { "epoch": 1.7329545454545454, "grad_norm": 0.28430139406095895, "learning_rate": 2.041298679475524e-06, "loss": 0.2278, "step": 610 }, { "epoch": 1.7357954545454546, "grad_norm": 0.3424108937746101, "learning_rate": 2.033679330642548e-06, "loss": 0.2708, "step": 611 }, { "epoch": 1.7386363636363638, "grad_norm": 0.34689691643105225, "learning_rate": 2.026064466837712e-06, "loss": 0.2489, "step": 612 }, { "epoch": 1.7414772727272727, "grad_norm": 0.36538604704717154, "learning_rate": 2.018454161300058e-06, "loss": 0.2959, "step": 613 }, { "epoch": 1.7443181818181817, "grad_norm": 0.3914980478603566, "learning_rate": 2.0108484872247836e-06, "loss": 0.2877, "step": 614 }, { "epoch": 1.7471590909090908, "grad_norm": 0.3460591534025964, "learning_rate": 2.003247517762545e-06, "loss": 0.2392, "step": 615 }, { "epoch": 1.75, "grad_norm": 0.35201168894909723, "learning_rate": 1.995651326018748e-06, "loss": 0.2775, "step": 616 }, { "epoch": 1.7528409090909092, "grad_norm": 0.3907457148602396, "learning_rate": 1.988059985052847e-06, "loss": 0.2649, "step": 617 }, { "epoch": 1.7556818181818183, "grad_norm": 0.31089272434312254, "learning_rate": 1.980473567877643e-06, "loss": 0.2717, "step": 618 }, { "epoch": 1.7585227272727273, "grad_norm": 0.39029862965581613, "learning_rate": 1.9728921474585783e-06, "loss": 0.2996, "step": 619 }, { "epoch": 1.7613636363636362, "grad_norm": 0.37522254054472837, "learning_rate": 1.965315796713038e-06, "loss": 0.3206, "step": 620 }, { "epoch": 1.7642045454545454, "grad_norm": 0.37421333571503007, "learning_rate": 1.957744588509649e-06, "loss": 0.2953, "step": 621 }, { "epoch": 1.7670454545454546, "grad_norm": 0.4113713231201874, "learning_rate": 1.9501785956675767e-06, "loss": 0.2587, "step": 622 }, { "epoch": 1.7698863636363638, "grad_norm": 0.3775256295092349, "learning_rate": 1.942617890955824e-06, "loss": 0.2706, "step": 623 }, { "epoch": 1.7727272727272727, "grad_norm": 0.361676860315546, "learning_rate": 1.935062547092535e-06, "loss": 0.2573, "step": 624 }, { "epoch": 1.7755681818181817, "grad_norm": 0.3828484280989141, "learning_rate": 1.927512636744294e-06, "loss": 0.2635, "step": 625 }, { "epoch": 1.7784090909090908, "grad_norm": 0.3194894627210845, "learning_rate": 1.9199682325254258e-06, "loss": 0.2412, "step": 626 }, { "epoch": 1.78125, "grad_norm": 0.3467465431720772, "learning_rate": 1.9124294069972947e-06, "loss": 0.2558, "step": 627 }, { "epoch": 1.7840909090909092, "grad_norm": 0.40591415428499084, "learning_rate": 1.9048962326676145e-06, "loss": 0.2591, "step": 628 }, { "epoch": 1.7869318181818183, "grad_norm": 0.324247081690912, "learning_rate": 1.897368781989744e-06, "loss": 0.2525, "step": 629 }, { "epoch": 1.7897727272727273, "grad_norm": 0.30168524950243947, "learning_rate": 1.889847127361992e-06, "loss": 0.2414, "step": 630 }, { "epoch": 1.7926136363636362, "grad_norm": 0.3391445741041072, "learning_rate": 1.8823313411269226e-06, "loss": 0.2666, "step": 631 }, { "epoch": 1.7954545454545454, "grad_norm": 0.3695919372425977, "learning_rate": 1.874821495570658e-06, "loss": 0.2738, "step": 632 }, { "epoch": 1.7982954545454546, "grad_norm": 0.41985233793486193, "learning_rate": 1.8673176629221824e-06, "loss": 0.2843, "step": 633 }, { "epoch": 1.8011363636363638, "grad_norm": 0.34508550168400526, "learning_rate": 1.8598199153526502e-06, "loss": 0.2762, "step": 634 }, { "epoch": 1.8039772727272727, "grad_norm": 0.34432258391495646, "learning_rate": 1.852328324974688e-06, "loss": 0.2746, "step": 635 }, { "epoch": 1.8068181818181817, "grad_norm": 0.432219335772206, "learning_rate": 1.8448429638417053e-06, "loss": 0.293, "step": 636 }, { "epoch": 1.8096590909090908, "grad_norm": 0.30494323840811877, "learning_rate": 1.8373639039471974e-06, "loss": 0.2483, "step": 637 }, { "epoch": 1.8125, "grad_norm": 0.38979888807881874, "learning_rate": 1.8298912172240568e-06, "loss": 0.2665, "step": 638 }, { "epoch": 1.8153409090909092, "grad_norm": 0.4409357967627925, "learning_rate": 1.8224249755438773e-06, "loss": 0.2979, "step": 639 }, { "epoch": 1.8181818181818183, "grad_norm": 0.40058073253392457, "learning_rate": 1.8149652507162662e-06, "loss": 0.2402, "step": 640 }, { "epoch": 1.8210227272727273, "grad_norm": 0.3781947301668901, "learning_rate": 1.807512114488153e-06, "loss": 0.2815, "step": 641 }, { "epoch": 1.8238636363636362, "grad_norm": 0.32982880475917153, "learning_rate": 1.8000656385430978e-06, "loss": 0.274, "step": 642 }, { "epoch": 1.8267045454545454, "grad_norm": 0.34588388650165885, "learning_rate": 1.7926258945006008e-06, "loss": 0.2415, "step": 643 }, { "epoch": 1.8295454545454546, "grad_norm": 0.46509270816531234, "learning_rate": 1.7851929539154188e-06, "loss": 0.2352, "step": 644 }, { "epoch": 1.8323863636363638, "grad_norm": 0.3949892127680776, "learning_rate": 1.7777668882768723e-06, "loss": 0.2731, "step": 645 }, { "epoch": 1.8352272727272727, "grad_norm": 0.33118981202064834, "learning_rate": 1.7703477690081584e-06, "loss": 0.2062, "step": 646 }, { "epoch": 1.8380681818181817, "grad_norm": 0.41123429927968475, "learning_rate": 1.762935667465665e-06, "loss": 0.2603, "step": 647 }, { "epoch": 1.8409090909090908, "grad_norm": 0.4086985175493265, "learning_rate": 1.7555306549382853e-06, "loss": 0.2633, "step": 648 }, { "epoch": 1.84375, "grad_norm": 0.3829776136552432, "learning_rate": 1.7481328026467292e-06, "loss": 0.2645, "step": 649 }, { "epoch": 1.8465909090909092, "grad_norm": 0.36580249698143114, "learning_rate": 1.7407421817428432e-06, "loss": 0.2907, "step": 650 }, { "epoch": 1.8494318181818183, "grad_norm": 0.5114322764325684, "learning_rate": 1.733358863308918e-06, "loss": 0.2491, "step": 651 }, { "epoch": 1.8522727272727273, "grad_norm": 0.3758211802363351, "learning_rate": 1.7259829183570146e-06, "loss": 0.275, "step": 652 }, { "epoch": 1.8551136363636362, "grad_norm": 0.44005362349975546, "learning_rate": 1.7186144178282735e-06, "loss": 0.2759, "step": 653 }, { "epoch": 1.8579545454545454, "grad_norm": 0.41121803130231066, "learning_rate": 1.7112534325922381e-06, "loss": 0.2835, "step": 654 }, { "epoch": 1.8607954545454546, "grad_norm": 0.37656111256141905, "learning_rate": 1.7039000334461673e-06, "loss": 0.2808, "step": 655 }, { "epoch": 1.8636363636363638, "grad_norm": 0.3651987202447528, "learning_rate": 1.6965542911143601e-06, "loss": 0.3218, "step": 656 }, { "epoch": 1.8664772727272727, "grad_norm": 0.40004844795530625, "learning_rate": 1.6892162762474732e-06, "loss": 0.2945, "step": 657 }, { "epoch": 1.8693181818181817, "grad_norm": 0.33043091198634184, "learning_rate": 1.6818860594218396e-06, "loss": 0.2277, "step": 658 }, { "epoch": 1.8721590909090908, "grad_norm": 0.3346497899463932, "learning_rate": 1.674563711138791e-06, "loss": 0.2324, "step": 659 }, { "epoch": 1.875, "grad_norm": 0.32658486289094646, "learning_rate": 1.6672493018239828e-06, "loss": 0.242, "step": 660 }, { "epoch": 1.8778409090909092, "grad_norm": 0.3483520142042606, "learning_rate": 1.659942901826712e-06, "loss": 0.2724, "step": 661 }, { "epoch": 1.8806818181818183, "grad_norm": 0.3447989906256544, "learning_rate": 1.6526445814192437e-06, "loss": 0.2522, "step": 662 }, { "epoch": 1.8835227272727273, "grad_norm": 0.3745982582543309, "learning_rate": 1.6453544107961338e-06, "loss": 0.268, "step": 663 }, { "epoch": 1.8863636363636362, "grad_norm": 0.47460009049304464, "learning_rate": 1.638072460073556e-06, "loss": 0.3004, "step": 664 }, { "epoch": 1.8892045454545454, "grad_norm": 0.38922747831910864, "learning_rate": 1.6307987992886221e-06, "loss": 0.2923, "step": 665 }, { "epoch": 1.8920454545454546, "grad_norm": 0.3619334724335469, "learning_rate": 1.6235334983987166e-06, "loss": 0.2929, "step": 666 }, { "epoch": 1.8948863636363638, "grad_norm": 0.4134447223169521, "learning_rate": 1.6162766272808153e-06, "loss": 0.2443, "step": 667 }, { "epoch": 1.8977272727272727, "grad_norm": 0.37827695457409233, "learning_rate": 1.6090282557308199e-06, "loss": 0.2634, "step": 668 }, { "epoch": 1.9005681818181817, "grad_norm": 0.37553439336248, "learning_rate": 1.6017884534628821e-06, "loss": 0.2624, "step": 669 }, { "epoch": 1.9034090909090908, "grad_norm": 0.30503546597237136, "learning_rate": 1.594557290108737e-06, "loss": 0.2448, "step": 670 }, { "epoch": 1.90625, "grad_norm": 0.33139361815750534, "learning_rate": 1.5873348352170309e-06, "loss": 0.2344, "step": 671 }, { "epoch": 1.9090909090909092, "grad_norm": 0.4071705047497215, "learning_rate": 1.5801211582526515e-06, "loss": 0.2972, "step": 672 }, { "epoch": 1.9119318181818183, "grad_norm": 0.3520108684037794, "learning_rate": 1.5729163285960636e-06, "loss": 0.3064, "step": 673 }, { "epoch": 1.9147727272727273, "grad_norm": 1.044294639450523, "learning_rate": 1.5657204155426372e-06, "loss": 0.2764, "step": 674 }, { "epoch": 1.9176136363636362, "grad_norm": 0.2733575442921981, "learning_rate": 1.5585334883019845e-06, "loss": 0.2115, "step": 675 }, { "epoch": 1.9204545454545454, "grad_norm": 0.3436043484209694, "learning_rate": 1.551355615997292e-06, "loss": 0.2613, "step": 676 }, { "epoch": 1.9232954545454546, "grad_norm": 0.347973792440035, "learning_rate": 1.5441868676646588e-06, "loss": 0.2984, "step": 677 }, { "epoch": 1.9261363636363638, "grad_norm": 0.3714627621893232, "learning_rate": 1.537027312252427e-06, "loss": 0.2939, "step": 678 }, { "epoch": 1.9289772727272727, "grad_norm": 0.36946185178466473, "learning_rate": 1.5298770186205262e-06, "loss": 0.3133, "step": 679 }, { "epoch": 1.9318181818181817, "grad_norm": 0.4445653274012168, "learning_rate": 1.522736055539804e-06, "loss": 0.2638, "step": 680 }, { "epoch": 1.9346590909090908, "grad_norm": 0.36757539498984404, "learning_rate": 1.5156044916913687e-06, "loss": 0.2594, "step": 681 }, { "epoch": 1.9375, "grad_norm": 0.3740375047815139, "learning_rate": 1.5084823956659284e-06, "loss": 0.2816, "step": 682 }, { "epoch": 1.9403409090909092, "grad_norm": 0.3691390978665013, "learning_rate": 1.5013698359631284e-06, "loss": 0.3269, "step": 683 }, { "epoch": 1.9431818181818183, "grad_norm": 0.3882033860276734, "learning_rate": 1.4942668809908966e-06, "loss": 0.2926, "step": 684 }, { "epoch": 1.9460227272727273, "grad_norm": 0.3303389257139215, "learning_rate": 1.487173599064783e-06, "loss": 0.2813, "step": 685 }, { "epoch": 1.9488636363636362, "grad_norm": 0.385716686112769, "learning_rate": 1.4800900584073025e-06, "loss": 0.3027, "step": 686 }, { "epoch": 1.9517045454545454, "grad_norm": 0.30969234063219786, "learning_rate": 1.4730163271472808e-06, "loss": 0.2848, "step": 687 }, { "epoch": 1.9545454545454546, "grad_norm": 0.39460846418007084, "learning_rate": 1.465952473319196e-06, "loss": 0.2638, "step": 688 }, { "epoch": 1.9573863636363638, "grad_norm": 0.38043423948555954, "learning_rate": 1.458898564862528e-06, "loss": 0.3017, "step": 689 }, { "epoch": 1.9602272727272727, "grad_norm": 0.344190102552331, "learning_rate": 1.4518546696211003e-06, "loss": 0.2475, "step": 690 }, { "epoch": 1.9630681818181817, "grad_norm": 0.3584182768945062, "learning_rate": 1.4448208553424318e-06, "loss": 0.2599, "step": 691 }, { "epoch": 1.9659090909090908, "grad_norm": 0.4193998956615056, "learning_rate": 1.4377971896770854e-06, "loss": 0.2932, "step": 692 }, { "epoch": 1.96875, "grad_norm": 0.3183638489077071, "learning_rate": 1.4307837401780129e-06, "loss": 0.2353, "step": 693 }, { "epoch": 1.9715909090909092, "grad_norm": 0.551291367904842, "learning_rate": 1.4237805742999078e-06, "loss": 0.2888, "step": 694 }, { "epoch": 1.9744318181818183, "grad_norm": 0.3836625936106596, "learning_rate": 1.4167877593985604e-06, "loss": 0.2606, "step": 695 }, { "epoch": 1.9772727272727273, "grad_norm": 0.33238753899253215, "learning_rate": 1.4098053627302021e-06, "loss": 0.2511, "step": 696 }, { "epoch": 1.9801136363636362, "grad_norm": 0.3697431181915492, "learning_rate": 1.402833451450865e-06, "loss": 0.2592, "step": 697 }, { "epoch": 1.9829545454545454, "grad_norm": 0.39546306881879256, "learning_rate": 1.3958720926157326e-06, "loss": 0.2867, "step": 698 }, { "epoch": 1.9857954545454546, "grad_norm": 0.37081182355768993, "learning_rate": 1.3889213531784967e-06, "loss": 0.2774, "step": 699 }, { "epoch": 1.9886363636363638, "grad_norm": 0.31730996135018236, "learning_rate": 1.3819812999907112e-06, "loss": 0.2558, "step": 700 }, { "epoch": 1.9914772727272727, "grad_norm": 0.48697102294004946, "learning_rate": 1.3750519998011545e-06, "loss": 0.2807, "step": 701 }, { "epoch": 1.9943181818181817, "grad_norm": 0.32660834038500147, "learning_rate": 1.3681335192551795e-06, "loss": 0.266, "step": 702 }, { "epoch": 1.9971590909090908, "grad_norm": 0.37088856838391165, "learning_rate": 1.3612259248940778e-06, "loss": 0.3023, "step": 703 }, { "epoch": 2.0, "grad_norm": 0.46910986149494815, "learning_rate": 1.354329283154442e-06, "loss": 0.354, "step": 704 }, { "epoch": 2.002840909090909, "grad_norm": 0.3578196229806462, "learning_rate": 1.3474436603675195e-06, "loss": 0.2863, "step": 705 }, { "epoch": 2.0056818181818183, "grad_norm": 0.3320147175830239, "learning_rate": 1.3405691227585774e-06, "loss": 0.2791, "step": 706 }, { "epoch": 2.008522727272727, "grad_norm": 0.4104267883722151, "learning_rate": 1.333705736446272e-06, "loss": 0.2599, "step": 707 }, { "epoch": 2.0113636363636362, "grad_norm": 0.44025732665188794, "learning_rate": 1.326853567442003e-06, "loss": 0.2648, "step": 708 }, { "epoch": 2.0142045454545454, "grad_norm": 0.4463091829454087, "learning_rate": 1.320012681649284e-06, "loss": 0.3235, "step": 709 }, { "epoch": 2.0170454545454546, "grad_norm": 0.3977418006694515, "learning_rate": 1.3131831448631099e-06, "loss": 0.2494, "step": 710 }, { "epoch": 2.0198863636363638, "grad_norm": 0.30294420075479717, "learning_rate": 1.3063650227693192e-06, "loss": 0.2274, "step": 711 }, { "epoch": 2.022727272727273, "grad_norm": 0.3580935126068431, "learning_rate": 1.2995583809439655e-06, "loss": 0.2641, "step": 712 }, { "epoch": 2.0255681818181817, "grad_norm": 0.3633999760316955, "learning_rate": 1.2927632848526892e-06, "loss": 0.2664, "step": 713 }, { "epoch": 2.028409090909091, "grad_norm": 0.39362626572566367, "learning_rate": 1.285979799850079e-06, "loss": 0.3028, "step": 714 }, { "epoch": 2.03125, "grad_norm": 0.3732307387516034, "learning_rate": 1.2792079911790554e-06, "loss": 0.2903, "step": 715 }, { "epoch": 2.034090909090909, "grad_norm": 0.348231549102206, "learning_rate": 1.2724479239702334e-06, "loss": 0.2776, "step": 716 }, { "epoch": 2.0369318181818183, "grad_norm": 0.32154175294270404, "learning_rate": 1.2656996632413e-06, "loss": 0.2363, "step": 717 }, { "epoch": 2.039772727272727, "grad_norm": 0.3738689076803405, "learning_rate": 1.2589632738963915e-06, "loss": 0.2747, "step": 718 }, { "epoch": 2.0426136363636362, "grad_norm": 0.370533612023648, "learning_rate": 1.2522388207254624e-06, "loss": 0.2568, "step": 719 }, { "epoch": 2.0454545454545454, "grad_norm": 0.3839434235801676, "learning_rate": 1.2455263684036687e-06, "loss": 0.2792, "step": 720 }, { "epoch": 2.0482954545454546, "grad_norm": 0.5003341324574189, "learning_rate": 1.2388259814907421e-06, "loss": 0.2769, "step": 721 }, { "epoch": 2.0511363636363638, "grad_norm": 0.3351671952514299, "learning_rate": 1.2321377244303718e-06, "loss": 0.2296, "step": 722 }, { "epoch": 2.053977272727273, "grad_norm": 0.2999985412422647, "learning_rate": 1.22546166154958e-06, "loss": 0.2284, "step": 723 }, { "epoch": 2.0568181818181817, "grad_norm": 0.3135859144132813, "learning_rate": 1.2187978570581118e-06, "loss": 0.251, "step": 724 }, { "epoch": 2.059659090909091, "grad_norm": 0.4125239171099722, "learning_rate": 1.212146375047808e-06, "loss": 0.2569, "step": 725 }, { "epoch": 2.0625, "grad_norm": 0.5126461046016878, "learning_rate": 1.2055072794919927e-06, "loss": 0.2867, "step": 726 }, { "epoch": 2.065340909090909, "grad_norm": 0.32428865281600694, "learning_rate": 1.198880634244862e-06, "loss": 0.2526, "step": 727 }, { "epoch": 2.0681818181818183, "grad_norm": 0.5892083787676873, "learning_rate": 1.192266503040863e-06, "loss": 0.2827, "step": 728 }, { "epoch": 2.071022727272727, "grad_norm": 0.2947475596312562, "learning_rate": 1.1856649494940842e-06, "loss": 0.2288, "step": 729 }, { "epoch": 2.0738636363636362, "grad_norm": 0.35972607487628616, "learning_rate": 1.1790760370976445e-06, "loss": 0.268, "step": 730 }, { "epoch": 2.0767045454545454, "grad_norm": 0.36619988601771414, "learning_rate": 1.1724998292230804e-06, "loss": 0.2832, "step": 731 }, { "epoch": 2.0795454545454546, "grad_norm": 0.3733558388597783, "learning_rate": 1.1659363891197373e-06, "loss": 0.2723, "step": 732 }, { "epoch": 2.0823863636363638, "grad_norm": 0.39404340487463446, "learning_rate": 1.1593857799141635e-06, "loss": 0.2823, "step": 733 }, { "epoch": 2.085227272727273, "grad_norm": 0.39535002691603904, "learning_rate": 1.152848064609499e-06, "loss": 0.2765, "step": 734 }, { "epoch": 2.0880681818181817, "grad_norm": 0.4562125910263655, "learning_rate": 1.1463233060848701e-06, "loss": 0.2229, "step": 735 }, { "epoch": 2.090909090909091, "grad_norm": 0.34157106543064586, "learning_rate": 1.139811567094791e-06, "loss": 0.251, "step": 736 }, { "epoch": 2.09375, "grad_norm": 0.3975912471137775, "learning_rate": 1.1333129102685504e-06, "loss": 0.2953, "step": 737 }, { "epoch": 2.096590909090909, "grad_norm": 0.4344936348962993, "learning_rate": 1.1268273981096154e-06, "loss": 0.2481, "step": 738 }, { "epoch": 2.0994318181818183, "grad_norm": 0.40663820339750667, "learning_rate": 1.1203550929950296e-06, "loss": 0.2704, "step": 739 }, { "epoch": 2.102272727272727, "grad_norm": 0.4525407147079834, "learning_rate": 1.1138960571748122e-06, "loss": 0.2308, "step": 740 }, { "epoch": 2.1051136363636362, "grad_norm": 0.36101599924638966, "learning_rate": 1.107450352771358e-06, "loss": 0.3198, "step": 741 }, { "epoch": 2.1079545454545454, "grad_norm": 0.4132570992405224, "learning_rate": 1.1010180417788458e-06, "loss": 0.3157, "step": 742 }, { "epoch": 2.1107954545454546, "grad_norm": 0.4296796806025471, "learning_rate": 1.094599186062633e-06, "loss": 0.2719, "step": 743 }, { "epoch": 2.1136363636363638, "grad_norm": 0.4115860705303619, "learning_rate": 1.0881938473586672e-06, "loss": 0.2588, "step": 744 }, { "epoch": 2.116477272727273, "grad_norm": 0.3341390354972397, "learning_rate": 1.0818020872728935e-06, "loss": 0.2803, "step": 745 }, { "epoch": 2.1193181818181817, "grad_norm": 0.386666143661149, "learning_rate": 1.0754239672806526e-06, "loss": 0.2954, "step": 746 }, { "epoch": 2.122159090909091, "grad_norm": 0.39729795109834065, "learning_rate": 1.0690595487261032e-06, "loss": 0.292, "step": 747 }, { "epoch": 2.125, "grad_norm": 0.4632063849794996, "learning_rate": 1.0627088928216203e-06, "loss": 0.3011, "step": 748 }, { "epoch": 2.127840909090909, "grad_norm": 0.364788422480122, "learning_rate": 1.0563720606472116e-06, "loss": 0.2887, "step": 749 }, { "epoch": 2.1306818181818183, "grad_norm": 0.3613800764493521, "learning_rate": 1.050049113149932e-06, "loss": 0.2698, "step": 750 }, { "epoch": 2.133522727272727, "grad_norm": 0.4840054604670755, "learning_rate": 1.0437401111432928e-06, "loss": 0.2671, "step": 751 }, { "epoch": 2.1363636363636362, "grad_norm": 0.35647589283664843, "learning_rate": 1.0374451153066773e-06, "loss": 0.277, "step": 752 }, { "epoch": 2.1392045454545454, "grad_norm": 0.3070617647042118, "learning_rate": 1.0311641861847644e-06, "loss": 0.2262, "step": 753 }, { "epoch": 2.1420454545454546, "grad_norm": 0.36421008528422827, "learning_rate": 1.0248973841869336e-06, "loss": 0.2541, "step": 754 }, { "epoch": 2.1448863636363638, "grad_norm": 0.36442145568995793, "learning_rate": 1.018644769586695e-06, "loss": 0.2968, "step": 755 }, { "epoch": 2.147727272727273, "grad_norm": 0.5392899583290776, "learning_rate": 1.0124064025211063e-06, "loss": 0.2338, "step": 756 }, { "epoch": 2.1505681818181817, "grad_norm": 0.40485627469450297, "learning_rate": 1.006182342990192e-06, "loss": 0.2734, "step": 757 }, { "epoch": 2.153409090909091, "grad_norm": 0.36165309778969656, "learning_rate": 9.99972650856368e-07, "loss": 0.2717, "step": 758 }, { "epoch": 2.15625, "grad_norm": 0.37054356564143653, "learning_rate": 9.937773858438677e-07, "loss": 0.2867, "step": 759 }, { "epoch": 2.159090909090909, "grad_norm": 0.3209190334600411, "learning_rate": 9.87596607538164e-07, "loss": 0.2026, "step": 760 }, { "epoch": 2.1619318181818183, "grad_norm": 0.33862908014599463, "learning_rate": 9.81430375385399e-07, "loss": 0.2589, "step": 761 }, { "epoch": 2.164772727272727, "grad_norm": 0.33768216225160724, "learning_rate": 9.752787486918108e-07, "loss": 0.2832, "step": 762 }, { "epoch": 2.1676136363636362, "grad_norm": 0.33566640920720886, "learning_rate": 9.691417866231633e-07, "loss": 0.2646, "step": 763 }, { "epoch": 2.1704545454545454, "grad_norm": 0.296999788237227, "learning_rate": 9.630195482041778e-07, "loss": 0.2405, "step": 764 }, { "epoch": 2.1732954545454546, "grad_norm": 0.36623960819597895, "learning_rate": 9.569120923179661e-07, "loss": 0.2997, "step": 765 }, { "epoch": 2.1761363636363638, "grad_norm": 0.35989187708509074, "learning_rate": 9.508194777054613e-07, "loss": 0.2627, "step": 766 }, { "epoch": 2.178977272727273, "grad_norm": 0.45558444510597795, "learning_rate": 9.447417629648542e-07, "loss": 0.2939, "step": 767 }, { "epoch": 2.1818181818181817, "grad_norm": 1.352661504436191, "learning_rate": 9.386790065510326e-07, "loss": 0.2674, "step": 768 }, { "epoch": 2.184659090909091, "grad_norm": 0.3484066310248953, "learning_rate": 9.326312667750143e-07, "loss": 0.2647, "step": 769 }, { "epoch": 2.1875, "grad_norm": 0.3372643949746599, "learning_rate": 9.265986018033887e-07, "loss": 0.2712, "step": 770 }, { "epoch": 2.190340909090909, "grad_norm": 0.45171014423025785, "learning_rate": 9.205810696577577e-07, "loss": 0.2531, "step": 771 }, { "epoch": 2.1931818181818183, "grad_norm": 0.3426033696862187, "learning_rate": 9.14578728214176e-07, "loss": 0.2594, "step": 772 }, { "epoch": 2.196022727272727, "grad_norm": 0.44139931309445984, "learning_rate": 9.085916352025983e-07, "loss": 0.2747, "step": 773 }, { "epoch": 2.1988636363636362, "grad_norm": 0.3644501914038969, "learning_rate": 9.02619848206319e-07, "loss": 0.3172, "step": 774 }, { "epoch": 2.2017045454545454, "grad_norm": 0.41216240398841364, "learning_rate": 8.966634246614208e-07, "loss": 0.2614, "step": 775 }, { "epoch": 2.2045454545454546, "grad_norm": 0.34732770899892357, "learning_rate": 8.907224218562219e-07, "loss": 0.248, "step": 776 }, { "epoch": 2.2073863636363638, "grad_norm": 0.34245013389418555, "learning_rate": 8.847968969307283e-07, "loss": 0.295, "step": 777 }, { "epoch": 2.210227272727273, "grad_norm": 0.3893001282929315, "learning_rate": 8.788869068760758e-07, "loss": 0.2967, "step": 778 }, { "epoch": 2.2130681818181817, "grad_norm": 0.48226856220499215, "learning_rate": 8.729925085339929e-07, "loss": 0.3055, "step": 779 }, { "epoch": 2.215909090909091, "grad_norm": 0.36479498548889644, "learning_rate": 8.67113758596245e-07, "loss": 0.2944, "step": 780 }, { "epoch": 2.21875, "grad_norm": 0.311563765449273, "learning_rate": 8.612507136040926e-07, "loss": 0.2208, "step": 781 }, { "epoch": 2.221590909090909, "grad_norm": 0.39153959534391375, "learning_rate": 8.554034299477506e-07, "loss": 0.2955, "step": 782 }, { "epoch": 2.2244318181818183, "grad_norm": 0.3752941766025436, "learning_rate": 8.495719638658395e-07, "loss": 0.2882, "step": 783 }, { "epoch": 2.227272727272727, "grad_norm": 0.34306207357731855, "learning_rate": 8.437563714448496e-07, "loss": 0.2855, "step": 784 }, { "epoch": 2.2301136363636362, "grad_norm": 0.2911256041409022, "learning_rate": 8.379567086185989e-07, "loss": 0.2245, "step": 785 }, { "epoch": 2.2329545454545454, "grad_norm": 0.38423726910475914, "learning_rate": 8.321730311676965e-07, "loss": 0.2881, "step": 786 }, { "epoch": 2.2357954545454546, "grad_norm": 0.28685959612362666, "learning_rate": 8.264053947190051e-07, "loss": 0.2168, "step": 787 }, { "epoch": 2.2386363636363638, "grad_norm": 0.3177020831576707, "learning_rate": 8.206538547451088e-07, "loss": 0.2392, "step": 788 }, { "epoch": 2.241477272727273, "grad_norm": 0.314674201211804, "learning_rate": 8.149184665637746e-07, "loss": 0.2244, "step": 789 }, { "epoch": 2.2443181818181817, "grad_norm": 0.34609325605203806, "learning_rate": 8.091992853374239e-07, "loss": 0.2506, "step": 790 }, { "epoch": 2.247159090909091, "grad_norm": 0.37417875469018747, "learning_rate": 8.034963660726022e-07, "loss": 0.297, "step": 791 }, { "epoch": 2.25, "grad_norm": 0.4190001624824225, "learning_rate": 7.978097636194482e-07, "loss": 0.2822, "step": 792 }, { "epoch": 2.252840909090909, "grad_norm": 0.31172594700443673, "learning_rate": 7.921395326711664e-07, "loss": 0.2277, "step": 793 }, { "epoch": 2.2556818181818183, "grad_norm": 0.35515884644954326, "learning_rate": 7.864857277635027e-07, "loss": 0.252, "step": 794 }, { "epoch": 2.2585227272727275, "grad_norm": 0.48510568393864467, "learning_rate": 7.808484032742184e-07, "loss": 0.2661, "step": 795 }, { "epoch": 2.2613636363636362, "grad_norm": 0.40576550011180185, "learning_rate": 7.75227613422567e-07, "loss": 0.2624, "step": 796 }, { "epoch": 2.2642045454545454, "grad_norm": 0.3153702935106711, "learning_rate": 7.696234122687756e-07, "loss": 0.2423, "step": 797 }, { "epoch": 2.2670454545454546, "grad_norm": 0.45813794434618704, "learning_rate": 7.640358537135214e-07, "loss": 0.2773, "step": 798 }, { "epoch": 2.2698863636363638, "grad_norm": 0.43799221687287815, "learning_rate": 7.584649914974132e-07, "loss": 0.2543, "step": 799 }, { "epoch": 2.2727272727272725, "grad_norm": 0.36099400774254925, "learning_rate": 7.5291087920048e-07, "loss": 0.2554, "step": 800 }, { "epoch": 2.2755681818181817, "grad_norm": 0.3681744190202427, "learning_rate": 7.47373570241646e-07, "loss": 0.2393, "step": 801 }, { "epoch": 2.278409090909091, "grad_norm": 0.30088848462434675, "learning_rate": 7.418531178782281e-07, "loss": 0.2443, "step": 802 }, { "epoch": 2.28125, "grad_norm": 0.36658882990515207, "learning_rate": 7.363495752054145e-07, "loss": 0.2716, "step": 803 }, { "epoch": 2.284090909090909, "grad_norm": 0.3691396379554879, "learning_rate": 7.30862995155758e-07, "loss": 0.281, "step": 804 }, { "epoch": 2.2869318181818183, "grad_norm": 0.3976865364065572, "learning_rate": 7.25393430498669e-07, "loss": 0.3126, "step": 805 }, { "epoch": 2.2897727272727275, "grad_norm": 0.34972134382431147, "learning_rate": 7.199409338399024e-07, "loss": 0.2716, "step": 806 }, { "epoch": 2.2926136363636362, "grad_norm": 0.359990470488163, "learning_rate": 7.145055576210552e-07, "loss": 0.282, "step": 807 }, { "epoch": 2.2954545454545454, "grad_norm": 0.32127716098200765, "learning_rate": 7.090873541190649e-07, "loss": 0.2537, "step": 808 }, { "epoch": 2.2982954545454546, "grad_norm": 0.3386422816466643, "learning_rate": 7.036863754456985e-07, "loss": 0.2663, "step": 809 }, { "epoch": 2.3011363636363638, "grad_norm": 0.43294818109617667, "learning_rate": 6.983026735470586e-07, "loss": 0.3144, "step": 810 }, { "epoch": 2.3039772727272725, "grad_norm": 0.3668974255373313, "learning_rate": 6.929363002030829e-07, "loss": 0.2665, "step": 811 }, { "epoch": 2.3068181818181817, "grad_norm": 0.3372045903540735, "learning_rate": 6.875873070270423e-07, "loss": 0.2291, "step": 812 }, { "epoch": 2.309659090909091, "grad_norm": 0.3686361653405783, "learning_rate": 6.822557454650472e-07, "loss": 0.3127, "step": 813 }, { "epoch": 2.3125, "grad_norm": 0.3287416264369441, "learning_rate": 6.769416667955545e-07, "loss": 0.2497, "step": 814 }, { "epoch": 2.315340909090909, "grad_norm": 0.378493696975223, "learning_rate": 6.7164512212887e-07, "loss": 0.2538, "step": 815 }, { "epoch": 2.3181818181818183, "grad_norm": 0.3527906349071735, "learning_rate": 6.6636616240666e-07, "loss": 0.2759, "step": 816 }, { "epoch": 2.3210227272727275, "grad_norm": 0.3283146351073707, "learning_rate": 6.611048384014601e-07, "loss": 0.2787, "step": 817 }, { "epoch": 2.3238636363636362, "grad_norm": 0.4262766716182643, "learning_rate": 6.558612007161876e-07, "loss": 0.3367, "step": 818 }, { "epoch": 2.3267045454545454, "grad_norm": 0.29243285573134076, "learning_rate": 6.506352997836537e-07, "loss": 0.2312, "step": 819 }, { "epoch": 2.3295454545454546, "grad_norm": 0.3708515561207515, "learning_rate": 6.454271858660816e-07, "loss": 0.2947, "step": 820 }, { "epoch": 2.3323863636363638, "grad_norm": 0.3031026424988807, "learning_rate": 6.402369090546173e-07, "loss": 0.2376, "step": 821 }, { "epoch": 2.3352272727272725, "grad_norm": 0.40063837240803074, "learning_rate": 6.350645192688531e-07, "loss": 0.2706, "step": 822 }, { "epoch": 2.3380681818181817, "grad_norm": 0.3931219211524187, "learning_rate": 6.299100662563459e-07, "loss": 0.2245, "step": 823 }, { "epoch": 2.340909090909091, "grad_norm": 0.496053050631395, "learning_rate": 6.247735995921375e-07, "loss": 0.2665, "step": 824 }, { "epoch": 2.34375, "grad_norm": 0.36983619426324377, "learning_rate": 6.19655168678279e-07, "loss": 0.2437, "step": 825 }, { "epoch": 2.346590909090909, "grad_norm": 0.31853434490396093, "learning_rate": 6.145548227433551e-07, "loss": 0.237, "step": 826 }, { "epoch": 2.3494318181818183, "grad_norm": 0.3833013165526796, "learning_rate": 6.094726108420105e-07, "loss": 0.2321, "step": 827 }, { "epoch": 2.3522727272727275, "grad_norm": 0.34709948082141423, "learning_rate": 6.044085818544807e-07, "loss": 0.2435, "step": 828 }, { "epoch": 2.3551136363636362, "grad_norm": 0.346027003213824, "learning_rate": 5.993627844861172e-07, "loss": 0.2536, "step": 829 }, { "epoch": 2.3579545454545454, "grad_norm": 0.3350399776737133, "learning_rate": 5.943352672669215e-07, "loss": 0.2403, "step": 830 }, { "epoch": 2.3607954545454546, "grad_norm": 0.32396672340715865, "learning_rate": 5.89326078551081e-07, "loss": 0.2213, "step": 831 }, { "epoch": 2.3636363636363638, "grad_norm": 0.3844292483072848, "learning_rate": 5.843352665164992e-07, "loss": 0.249, "step": 832 }, { "epoch": 2.3664772727272725, "grad_norm": 0.35019451009540753, "learning_rate": 5.793628791643327e-07, "loss": 0.2888, "step": 833 }, { "epoch": 2.3693181818181817, "grad_norm": 0.3164025713303425, "learning_rate": 5.744089643185355e-07, "loss": 0.2515, "step": 834 }, { "epoch": 2.372159090909091, "grad_norm": 0.3308520526667594, "learning_rate": 5.69473569625392e-07, "loss": 0.2587, "step": 835 }, { "epoch": 2.375, "grad_norm": 0.3378919456195333, "learning_rate": 5.645567425530607e-07, "loss": 0.2433, "step": 836 }, { "epoch": 2.377840909090909, "grad_norm": 0.3354025023866522, "learning_rate": 5.596585303911217e-07, "loss": 0.2542, "step": 837 }, { "epoch": 2.3806818181818183, "grad_norm": 0.3756871055057431, "learning_rate": 5.547789802501164e-07, "loss": 0.2755, "step": 838 }, { "epoch": 2.3835227272727275, "grad_norm": 0.3363888579054467, "learning_rate": 5.499181390610958e-07, "loss": 0.2545, "step": 839 }, { "epoch": 2.3863636363636362, "grad_norm": 0.3726730517187886, "learning_rate": 5.450760535751734e-07, "loss": 0.2679, "step": 840 }, { "epoch": 2.3892045454545454, "grad_norm": 0.34128788657594616, "learning_rate": 5.402527703630681e-07, "loss": 0.2744, "step": 841 }, { "epoch": 2.3920454545454546, "grad_norm": 0.3112695417600679, "learning_rate": 5.354483358146617e-07, "loss": 0.2231, "step": 842 }, { "epoch": 2.3948863636363638, "grad_norm": 0.34694374550516704, "learning_rate": 5.306627961385538e-07, "loss": 0.2571, "step": 843 }, { "epoch": 2.3977272727272725, "grad_norm": 0.302981543192964, "learning_rate": 5.258961973616117e-07, "loss": 0.2427, "step": 844 }, { "epoch": 2.4005681818181817, "grad_norm": 0.3008721863153869, "learning_rate": 5.211485853285314e-07, "loss": 0.2251, "step": 845 }, { "epoch": 2.403409090909091, "grad_norm": 0.33302783458473956, "learning_rate": 5.164200057013985e-07, "loss": 0.2711, "step": 846 }, { "epoch": 2.40625, "grad_norm": 0.3898327860869564, "learning_rate": 5.117105039592444e-07, "loss": 0.2869, "step": 847 }, { "epoch": 2.409090909090909, "grad_norm": 0.3770328552305208, "learning_rate": 5.070201253976115e-07, "loss": 0.2777, "step": 848 }, { "epoch": 2.4119318181818183, "grad_norm": 0.32513904942970184, "learning_rate": 5.02348915128118e-07, "loss": 0.2655, "step": 849 }, { "epoch": 2.4147727272727275, "grad_norm": 0.3173329184832482, "learning_rate": 4.976969180780225e-07, "loss": 0.2398, "step": 850 }, { "epoch": 2.4176136363636362, "grad_norm": 0.3853995789331807, "learning_rate": 4.930641789897938e-07, "loss": 0.2699, "step": 851 }, { "epoch": 2.4204545454545454, "grad_norm": 0.3880784265747346, "learning_rate": 4.884507424206788e-07, "loss": 0.2649, "step": 852 }, { "epoch": 2.4232954545454546, "grad_norm": 0.3710421332719178, "learning_rate": 4.838566527422742e-07, "loss": 0.2604, "step": 853 }, { "epoch": 2.4261363636363638, "grad_norm": 0.42114780257384915, "learning_rate": 4.792819541400998e-07, "loss": 0.2982, "step": 854 }, { "epoch": 2.4289772727272725, "grad_norm": 0.3704518376341159, "learning_rate": 4.747266906131759e-07, "loss": 0.2916, "step": 855 }, { "epoch": 2.4318181818181817, "grad_norm": 0.3664937063178789, "learning_rate": 4.7019090597359624e-07, "loss": 0.2586, "step": 856 }, { "epoch": 2.434659090909091, "grad_norm": 0.30129914419743803, "learning_rate": 4.656746438461085e-07, "loss": 0.233, "step": 857 }, { "epoch": 2.4375, "grad_norm": 0.3610260573998573, "learning_rate": 4.611779476676956e-07, "loss": 0.2295, "step": 858 }, { "epoch": 2.440340909090909, "grad_norm": 0.31555005162338934, "learning_rate": 4.5670086068715564e-07, "loss": 0.2324, "step": 859 }, { "epoch": 2.4431818181818183, "grad_norm": 0.38647155996115823, "learning_rate": 4.522434259646896e-07, "loss": 0.2509, "step": 860 }, { "epoch": 2.4460227272727275, "grad_norm": 0.3295294330692125, "learning_rate": 4.4780568637148277e-07, "loss": 0.2409, "step": 861 }, { "epoch": 2.4488636363636362, "grad_norm": 0.40919134523297795, "learning_rate": 4.4338768458929455e-07, "loss": 0.2753, "step": 862 }, { "epoch": 2.4517045454545454, "grad_norm": 0.3281509333195072, "learning_rate": 4.3898946311005054e-07, "loss": 0.2776, "step": 863 }, { "epoch": 2.4545454545454546, "grad_norm": 0.3003640118064134, "learning_rate": 4.346110642354284e-07, "loss": 0.2288, "step": 864 }, { "epoch": 2.4573863636363638, "grad_norm": 0.2856917980597871, "learning_rate": 4.30252530076454e-07, "loss": 0.2262, "step": 865 }, { "epoch": 2.4602272727272725, "grad_norm": 0.3716917156792666, "learning_rate": 4.259139025530981e-07, "loss": 0.2704, "step": 866 }, { "epoch": 2.4630681818181817, "grad_norm": 0.3646615055009088, "learning_rate": 4.2159522339387027e-07, "loss": 0.2422, "step": 867 }, { "epoch": 2.465909090909091, "grad_norm": 0.352013885171188, "learning_rate": 4.1729653413541795e-07, "loss": 0.2586, "step": 868 }, { "epoch": 2.46875, "grad_norm": 0.3341889105921635, "learning_rate": 4.13017876122129e-07, "loss": 0.2514, "step": 869 }, { "epoch": 2.471590909090909, "grad_norm": 0.3436869862214985, "learning_rate": 4.087592905057319e-07, "loss": 0.2663, "step": 870 }, { "epoch": 2.4744318181818183, "grad_norm": 0.3459285477446355, "learning_rate": 4.0452081824490007e-07, "loss": 0.2274, "step": 871 }, { "epoch": 2.4772727272727275, "grad_norm": 0.39474776701060227, "learning_rate": 4.0030250010486106e-07, "loss": 0.2635, "step": 872 }, { "epoch": 2.4801136363636362, "grad_norm": 0.3588162845171683, "learning_rate": 3.9610437665699803e-07, "loss": 0.2702, "step": 873 }, { "epoch": 2.4829545454545454, "grad_norm": 0.3055170644052573, "learning_rate": 3.919264882784662e-07, "loss": 0.2642, "step": 874 }, { "epoch": 2.4857954545454546, "grad_norm": 0.4004147388266674, "learning_rate": 3.8776887515180215e-07, "loss": 0.2673, "step": 875 }, { "epoch": 2.4886363636363638, "grad_norm": 0.3435684772838886, "learning_rate": 3.836315772645355e-07, "loss": 0.2572, "step": 876 }, { "epoch": 2.4914772727272725, "grad_norm": 0.3929983920357782, "learning_rate": 3.79514634408806e-07, "loss": 0.314, "step": 877 }, { "epoch": 2.4943181818181817, "grad_norm": 0.3402456651574272, "learning_rate": 3.7541808618098225e-07, "loss": 0.2742, "step": 878 }, { "epoch": 2.497159090909091, "grad_norm": 0.3391484648776555, "learning_rate": 3.713419719812775e-07, "loss": 0.2957, "step": 879 }, { "epoch": 2.5, "grad_norm": 0.3300372482602716, "learning_rate": 3.6728633101337283e-07, "loss": 0.2402, "step": 880 }, { "epoch": 2.502840909090909, "grad_norm": 0.3880324057454857, "learning_rate": 3.632512022840401e-07, "loss": 0.225, "step": 881 }, { "epoch": 2.5056818181818183, "grad_norm": 0.40083562156829194, "learning_rate": 3.592366246027654e-07, "loss": 0.2885, "step": 882 }, { "epoch": 2.5085227272727275, "grad_norm": 0.3898508645513151, "learning_rate": 3.552426365813791e-07, "loss": 0.279, "step": 883 }, { "epoch": 2.5113636363636362, "grad_norm": 0.34747356344583896, "learning_rate": 3.512692766336795e-07, "loss": 0.2551, "step": 884 }, { "epoch": 2.5142045454545454, "grad_norm": 0.3697878476145354, "learning_rate": 3.4731658297506717e-07, "loss": 0.2584, "step": 885 }, { "epoch": 2.5170454545454546, "grad_norm": 0.3442593343497222, "learning_rate": 3.433845936221772e-07, "loss": 0.2323, "step": 886 }, { "epoch": 2.5198863636363638, "grad_norm": 0.4052191198887473, "learning_rate": 3.394733463925115e-07, "loss": 0.2895, "step": 887 }, { "epoch": 2.5227272727272725, "grad_norm": 0.3639886136390821, "learning_rate": 3.355828789040752e-07, "loss": 0.276, "step": 888 }, { "epoch": 2.5255681818181817, "grad_norm": 0.39883666474289897, "learning_rate": 3.3171322857501796e-07, "loss": 0.2858, "step": 889 }, { "epoch": 2.528409090909091, "grad_norm": 0.40889869044433336, "learning_rate": 3.278644326232713e-07, "loss": 0.257, "step": 890 }, { "epoch": 2.53125, "grad_norm": 0.3284086126915543, "learning_rate": 3.2403652806619e-07, "loss": 0.2699, "step": 891 }, { "epoch": 2.534090909090909, "grad_norm": 0.3806103148982155, "learning_rate": 3.2022955172019947e-07, "loss": 0.2607, "step": 892 }, { "epoch": 2.5369318181818183, "grad_norm": 0.414262076377764, "learning_rate": 3.1644354020043846e-07, "loss": 0.2709, "step": 893 }, { "epoch": 2.5397727272727275, "grad_norm": 0.3564646964673218, "learning_rate": 3.1267852992040715e-07, "loss": 0.2845, "step": 894 }, { "epoch": 2.5426136363636362, "grad_norm": 0.35912306046922576, "learning_rate": 3.0893455709162023e-07, "loss": 0.2466, "step": 895 }, { "epoch": 2.5454545454545454, "grad_norm": 0.3605968532309376, "learning_rate": 3.052116577232533e-07, "loss": 0.2868, "step": 896 }, { "epoch": 2.5482954545454546, "grad_norm": 0.4129969651465434, "learning_rate": 3.015098676218009e-07, "loss": 0.2738, "step": 897 }, { "epoch": 2.5511363636363638, "grad_norm": 0.38800081862705826, "learning_rate": 2.9782922239073084e-07, "loss": 0.274, "step": 898 }, { "epoch": 2.5539772727272725, "grad_norm": 0.28725463626604075, "learning_rate": 2.9416975743014134e-07, "loss": 0.246, "step": 899 }, { "epoch": 2.5568181818181817, "grad_norm": 0.33194490572792595, "learning_rate": 2.9053150793642013e-07, "loss": 0.2418, "step": 900 }, { "epoch": 2.559659090909091, "grad_norm": 0.31927368240055043, "learning_rate": 2.8691450890190794e-07, "loss": 0.259, "step": 901 }, { "epoch": 2.5625, "grad_norm": 0.4514488260064792, "learning_rate": 2.833187951145588e-07, "loss": 0.2674, "step": 902 }, { "epoch": 2.565340909090909, "grad_norm": 0.3952590748072181, "learning_rate": 2.797444011576089e-07, "loss": 0.2764, "step": 903 }, { "epoch": 2.5681818181818183, "grad_norm": 0.3035956390116324, "learning_rate": 2.7619136140924153e-07, "loss": 0.2361, "step": 904 }, { "epoch": 2.5710227272727275, "grad_norm": 0.365463810965996, "learning_rate": 2.726597100422565e-07, "loss": 0.2955, "step": 905 }, { "epoch": 2.5738636363636362, "grad_norm": 0.37417152902560946, "learning_rate": 2.6914948102374384e-07, "loss": 0.3007, "step": 906 }, { "epoch": 2.5767045454545454, "grad_norm": 0.36872656495257466, "learning_rate": 2.656607081147547e-07, "loss": 0.2647, "step": 907 }, { "epoch": 2.5795454545454546, "grad_norm": 0.381314240650295, "learning_rate": 2.621934248699767e-07, "loss": 0.3176, "step": 908 }, { "epoch": 2.5823863636363638, "grad_norm": 0.41529466546435734, "learning_rate": 2.5874766463741263e-07, "loss": 0.2482, "step": 909 }, { "epoch": 2.5852272727272725, "grad_norm": 0.4258649726301599, "learning_rate": 2.553234605580593e-07, "loss": 0.2618, "step": 910 }, { "epoch": 2.5880681818181817, "grad_norm": 0.3762825021021476, "learning_rate": 2.5192084556558776e-07, "loss": 0.2914, "step": 911 }, { "epoch": 2.590909090909091, "grad_norm": 0.3627506684619514, "learning_rate": 2.4853985238602745e-07, "loss": 0.2875, "step": 912 }, { "epoch": 2.59375, "grad_norm": 0.3173651745814326, "learning_rate": 2.451805135374516e-07, "loss": 0.2421, "step": 913 }, { "epoch": 2.596590909090909, "grad_norm": 0.44802208559240897, "learning_rate": 2.4184286132966305e-07, "loss": 0.2803, "step": 914 }, { "epoch": 2.5994318181818183, "grad_norm": 0.36772649044669337, "learning_rate": 2.3852692786388634e-07, "loss": 0.3018, "step": 915 }, { "epoch": 2.6022727272727275, "grad_norm": 0.3473737442586536, "learning_rate": 2.3523274503245624e-07, "loss": 0.2565, "step": 916 }, { "epoch": 2.6051136363636362, "grad_norm": 0.31723371911082704, "learning_rate": 2.319603445185109e-07, "loss": 0.2769, "step": 917 }, { "epoch": 2.6079545454545454, "grad_norm": 0.36837062880150556, "learning_rate": 2.2870975779569066e-07, "loss": 0.294, "step": 918 }, { "epoch": 2.6107954545454546, "grad_norm": 0.34124708806422904, "learning_rate": 2.2548101612783147e-07, "loss": 0.2516, "step": 919 }, { "epoch": 2.6136363636363638, "grad_norm": 0.3202170151424555, "learning_rate": 2.2227415056866431e-07, "loss": 0.254, "step": 920 }, { "epoch": 2.6164772727272725, "grad_norm": 0.4260342271233267, "learning_rate": 2.1908919196152013e-07, "loss": 0.2719, "step": 921 }, { "epoch": 2.6193181818181817, "grad_norm": 0.37728441327420986, "learning_rate": 2.1592617093902978e-07, "loss": 0.2753, "step": 922 }, { "epoch": 2.622159090909091, "grad_norm": 0.39060937907330195, "learning_rate": 2.1278511792283018e-07, "loss": 0.2947, "step": 923 }, { "epoch": 2.625, "grad_norm": 0.30888479325881507, "learning_rate": 2.0966606312327303e-07, "loss": 0.2284, "step": 924 }, { "epoch": 2.627840909090909, "grad_norm": 0.40561974710485005, "learning_rate": 2.065690365391329e-07, "loss": 0.2943, "step": 925 }, { "epoch": 2.6306818181818183, "grad_norm": 0.355886681039042, "learning_rate": 2.0349406795731774e-07, "loss": 0.2462, "step": 926 }, { "epoch": 2.6335227272727275, "grad_norm": 0.37901081172880524, "learning_rate": 2.0044118695258657e-07, "loss": 0.2918, "step": 927 }, { "epoch": 2.6363636363636362, "grad_norm": 0.48522777901179487, "learning_rate": 1.9741042288725893e-07, "loss": 0.3463, "step": 928 }, { "epoch": 2.6392045454545454, "grad_norm": 0.35552067688931177, "learning_rate": 1.944018049109375e-07, "loss": 0.2589, "step": 929 }, { "epoch": 2.6420454545454546, "grad_norm": 0.3245196964527464, "learning_rate": 1.9141536196022658e-07, "loss": 0.2667, "step": 930 }, { "epoch": 2.6448863636363638, "grad_norm": 0.397373448701769, "learning_rate": 1.884511227584518e-07, "loss": 0.2635, "step": 931 }, { "epoch": 2.6477272727272725, "grad_norm": 0.3230165219575403, "learning_rate": 1.8550911581538517e-07, "loss": 0.2524, "step": 932 }, { "epoch": 2.6505681818181817, "grad_norm": 0.3201491067518106, "learning_rate": 1.825893694269723e-07, "loss": 0.2704, "step": 933 }, { "epoch": 2.653409090909091, "grad_norm": 0.3806372642940993, "learning_rate": 1.7969191167505811e-07, "loss": 0.2891, "step": 934 }, { "epoch": 2.65625, "grad_norm": 0.3315048294973883, "learning_rate": 1.7681677042711732e-07, "loss": 0.2469, "step": 935 }, { "epoch": 2.659090909090909, "grad_norm": 0.3429832481491404, "learning_rate": 1.7396397333598657e-07, "loss": 0.2344, "step": 936 }, { "epoch": 2.6619318181818183, "grad_norm": 0.31805225672924486, "learning_rate": 1.711335478395984e-07, "loss": 0.2301, "step": 937 }, { "epoch": 2.6647727272727275, "grad_norm": 0.347431193735004, "learning_rate": 1.6832552116071905e-07, "loss": 0.274, "step": 938 }, { "epoch": 2.6676136363636362, "grad_norm": 0.3276581659477082, "learning_rate": 1.6553992030668293e-07, "loss": 0.2569, "step": 939 }, { "epoch": 2.6704545454545454, "grad_norm": 0.4181936566989231, "learning_rate": 1.6277677206913588e-07, "loss": 0.2737, "step": 940 }, { "epoch": 2.6732954545454546, "grad_norm": 0.37610721012897674, "learning_rate": 1.6003610302377708e-07, "loss": 0.2999, "step": 941 }, { "epoch": 2.6761363636363638, "grad_norm": 0.33046264353939814, "learning_rate": 1.5731793953010193e-07, "loss": 0.2427, "step": 942 }, { "epoch": 2.6789772727272725, "grad_norm": 0.3494974820800891, "learning_rate": 1.5462230773115066e-07, "loss": 0.264, "step": 943 }, { "epoch": 2.6818181818181817, "grad_norm": 0.3468159326122336, "learning_rate": 1.5194923355325464e-07, "loss": 0.3076, "step": 944 }, { "epoch": 2.684659090909091, "grad_norm": 0.40045232274054987, "learning_rate": 1.492987427057893e-07, "loss": 0.3051, "step": 945 }, { "epoch": 2.6875, "grad_norm": 0.4030575958079979, "learning_rate": 1.4667086068092446e-07, "loss": 0.2437, "step": 946 }, { "epoch": 2.690340909090909, "grad_norm": 0.34082328928674294, "learning_rate": 1.440656127533821e-07, "loss": 0.2501, "step": 947 }, { "epoch": 2.6931818181818183, "grad_norm": 0.34010796962843276, "learning_rate": 1.414830239801898e-07, "loss": 0.27, "step": 948 }, { "epoch": 2.6960227272727275, "grad_norm": 0.4274695728838406, "learning_rate": 1.3892311920044282e-07, "loss": 0.2964, "step": 949 }, { "epoch": 2.6988636363636362, "grad_norm": 0.35443571450269734, "learning_rate": 1.3638592303506364e-07, "loss": 0.252, "step": 950 }, { "epoch": 2.7017045454545454, "grad_norm": 0.40737204314859, "learning_rate": 1.3387145988656537e-07, "loss": 0.2891, "step": 951 }, { "epoch": 2.7045454545454546, "grad_norm": 0.352138799387513, "learning_rate": 1.313797539388159e-07, "loss": 0.2439, "step": 952 }, { "epoch": 2.7073863636363638, "grad_norm": 0.33845536331763004, "learning_rate": 1.2891082915680864e-07, "loss": 0.2802, "step": 953 }, { "epoch": 2.7102272727272725, "grad_norm": 0.35504925601892684, "learning_rate": 1.264647092864288e-07, "loss": 0.2514, "step": 954 }, { "epoch": 2.7130681818181817, "grad_norm": 0.3609121713893806, "learning_rate": 1.2404141785422568e-07, "loss": 0.25, "step": 955 }, { "epoch": 2.715909090909091, "grad_norm": 0.3936221787085924, "learning_rate": 1.2164097816718818e-07, "loss": 0.2312, "step": 956 }, { "epoch": 2.71875, "grad_norm": 0.38365034429115125, "learning_rate": 1.1926341331251756e-07, "loss": 0.2682, "step": 957 }, { "epoch": 2.721590909090909, "grad_norm": 0.31959559051327435, "learning_rate": 1.169087461574081e-07, "loss": 0.2457, "step": 958 }, { "epoch": 2.7244318181818183, "grad_norm": 0.3799557870602865, "learning_rate": 1.1457699934882715e-07, "loss": 0.2968, "step": 959 }, { "epoch": 2.7272727272727275, "grad_norm": 0.27723093935677195, "learning_rate": 1.1226819531329342e-07, "loss": 0.2219, "step": 960 }, { "epoch": 2.7301136363636362, "grad_norm": 0.3534828660456155, "learning_rate": 1.0998235625666708e-07, "loss": 0.2433, "step": 961 }, { "epoch": 2.7329545454545454, "grad_norm": 0.35791787748576426, "learning_rate": 1.0771950416393228e-07, "loss": 0.2597, "step": 962 }, { "epoch": 2.7357954545454546, "grad_norm": 0.4475649717820448, "learning_rate": 1.0547966079898637e-07, "loss": 0.2636, "step": 963 }, { "epoch": 2.7386363636363638, "grad_norm": 0.39027504830647813, "learning_rate": 1.0326284770443063e-07, "loss": 0.2728, "step": 964 }, { "epoch": 2.7414772727272725, "grad_norm": 0.4315977251477179, "learning_rate": 1.0106908620136525e-07, "loss": 0.2588, "step": 965 }, { "epoch": 2.7443181818181817, "grad_norm": 0.32246952155074843, "learning_rate": 9.889839738918022e-08, "loss": 0.2369, "step": 966 }, { "epoch": 2.747159090909091, "grad_norm": 0.3333508436923039, "learning_rate": 9.675080214535559e-08, "loss": 0.2574, "step": 967 }, { "epoch": 2.75, "grad_norm": 0.3654840156563527, "learning_rate": 9.46263211252596e-08, "loss": 0.3222, "step": 968 }, { "epoch": 2.752840909090909, "grad_norm": 0.3366414190912868, "learning_rate": 9.252497476194972e-08, "loss": 0.2926, "step": 969 }, { "epoch": 2.7556818181818183, "grad_norm": 0.3243823618475195, "learning_rate": 9.044678326597722e-08, "loss": 0.2484, "step": 970 }, { "epoch": 2.7585227272727275, "grad_norm": 0.34777278160161157, "learning_rate": 8.839176662519155e-08, "loss": 0.2349, "step": 971 }, { "epoch": 2.7613636363636362, "grad_norm": 0.34671371366502046, "learning_rate": 8.635994460454766e-08, "loss": 0.2574, "step": 972 }, { "epoch": 2.7642045454545454, "grad_norm": 0.38617683116302787, "learning_rate": 8.435133674591922e-08, "loss": 0.3007, "step": 973 }, { "epoch": 2.7670454545454546, "grad_norm": 0.4218961579649425, "learning_rate": 8.2365962367906e-08, "loss": 0.2916, "step": 974 }, { "epoch": 2.7698863636363638, "grad_norm": 0.3971792338298757, "learning_rate": 8.040384056565098e-08, "loss": 0.2563, "step": 975 }, { "epoch": 2.7727272727272725, "grad_norm": 0.3226524769417545, "learning_rate": 7.846499021065684e-08, "loss": 0.266, "step": 976 }, { "epoch": 2.7755681818181817, "grad_norm": 0.3540519465775941, "learning_rate": 7.654942995060283e-08, "loss": 0.2616, "step": 977 }, { "epoch": 2.778409090909091, "grad_norm": 0.3781537081979966, "learning_rate": 7.465717820916624e-08, "loss": 0.2698, "step": 978 }, { "epoch": 2.78125, "grad_norm": 0.3564755050368105, "learning_rate": 7.278825318584647e-08, "loss": 0.27, "step": 979 }, { "epoch": 2.784090909090909, "grad_norm": 0.3510249393237661, "learning_rate": 7.094267285578688e-08, "loss": 0.2666, "step": 980 }, { "epoch": 2.7869318181818183, "grad_norm": 0.3998246424539849, "learning_rate": 6.912045496960507e-08, "loss": 0.2851, "step": 981 }, { "epoch": 2.7897727272727275, "grad_norm": 0.37123966300816885, "learning_rate": 6.732161705322093e-08, "loss": 0.2528, "step": 982 }, { "epoch": 2.7926136363636362, "grad_norm": 0.32607742324666744, "learning_rate": 6.554617640768674e-08, "loss": 0.2682, "step": 983 }, { "epoch": 2.7954545454545454, "grad_norm": 0.3091883263291907, "learning_rate": 6.379415010902362e-08, "loss": 0.2431, "step": 984 }, { "epoch": 2.7982954545454546, "grad_norm": 0.3896435979654701, "learning_rate": 6.206555500805455e-08, "loss": 0.2662, "step": 985 }, { "epoch": 2.8011363636363638, "grad_norm": 0.36244662485716045, "learning_rate": 6.036040773024387e-08, "loss": 0.2708, "step": 986 }, { "epoch": 2.8039772727272725, "grad_norm": 0.3558651773572941, "learning_rate": 5.867872467553715e-08, "loss": 0.3004, "step": 987 }, { "epoch": 2.8068181818181817, "grad_norm": 0.37311773304851065, "learning_rate": 5.702052201820352e-08, "loss": 0.3088, "step": 988 }, { "epoch": 2.809659090909091, "grad_norm": 0.411421481665237, "learning_rate": 5.5385815706678894e-08, "loss": 0.2923, "step": 989 }, { "epoch": 2.8125, "grad_norm": 0.3759229007631887, "learning_rate": 5.377462146341439e-08, "loss": 0.2945, "step": 990 }, { "epoch": 2.815340909090909, "grad_norm": 0.3014861546323833, "learning_rate": 5.218695478472397e-08, "loss": 0.2119, "step": 991 }, { "epoch": 2.8181818181818183, "grad_norm": 0.4021583403485505, "learning_rate": 5.062283094063536e-08, "loss": 0.2878, "step": 992 }, { "epoch": 2.8210227272727275, "grad_norm": 0.3293364475828707, "learning_rate": 4.9082264974744665e-08, "loss": 0.266, "step": 993 }, { "epoch": 2.8238636363636362, "grad_norm": 0.30933470398564117, "learning_rate": 4.756527170406922e-08, "loss": 0.2314, "step": 994 }, { "epoch": 2.8267045454545454, "grad_norm": 0.37909174739130147, "learning_rate": 4.607186571890715e-08, "loss": 0.2667, "step": 995 }, { "epoch": 2.8295454545454546, "grad_norm": 0.37878603560502083, "learning_rate": 4.46020613826964e-08, "loss": 0.2937, "step": 996 }, { "epoch": 2.8323863636363638, "grad_norm": 0.408496513297682, "learning_rate": 4.3155872831875946e-08, "loss": 0.2757, "step": 997 }, { "epoch": 2.8352272727272725, "grad_norm": 0.3566593848752578, "learning_rate": 4.1733313975750586e-08, "loss": 0.2584, "step": 998 }, { "epoch": 2.8380681818181817, "grad_norm": 0.352150696238673, "learning_rate": 4.033439849635695e-08, "loss": 0.2115, "step": 999 }, { "epoch": 2.840909090909091, "grad_norm": 0.39392089147895293, "learning_rate": 3.895913984833216e-08, "loss": 0.2816, "step": 1000 }, { "epoch": 2.84375, "grad_norm": 0.3412262767323334, "learning_rate": 3.760755125878368e-08, "loss": 0.2431, "step": 1001 }, { "epoch": 2.846590909090909, "grad_norm": 0.3325324503502811, "learning_rate": 3.627964572716331e-08, "loss": 0.264, "step": 1002 }, { "epoch": 2.8494318181818183, "grad_norm": 0.35296040111990046, "learning_rate": 3.497543602514059e-08, "loss": 0.2614, "step": 1003 }, { "epoch": 2.8522727272727275, "grad_norm": 0.2837474483774213, "learning_rate": 3.3694934696481275e-08, "loss": 0.2123, "step": 1004 }, { "epoch": 2.8551136363636362, "grad_norm": 0.34272040018575495, "learning_rate": 3.24381540569263e-08, "loss": 0.2808, "step": 1005 }, { "epoch": 2.8579545454545454, "grad_norm": 0.3498353760521046, "learning_rate": 3.120510619407324e-08, "loss": 0.251, "step": 1006 }, { "epoch": 2.8607954545454546, "grad_norm": 0.4069913912888687, "learning_rate": 2.9995802967259516e-08, "loss": 0.316, "step": 1007 }, { "epoch": 2.8636363636363638, "grad_norm": 0.3361233831001831, "learning_rate": 2.8810256007449632e-08, "loss": 0.2293, "step": 1008 }, { "epoch": 2.8664772727272725, "grad_norm": 0.4519558529396144, "learning_rate": 2.7648476717122287e-08, "loss": 0.2792, "step": 1009 }, { "epoch": 2.8693181818181817, "grad_norm": 0.409732040720535, "learning_rate": 2.651047627016068e-08, "loss": 0.2904, "step": 1010 }, { "epoch": 2.872159090909091, "grad_norm": 0.3250171306579268, "learning_rate": 2.5396265611745687e-08, "loss": 0.2463, "step": 1011 }, { "epoch": 2.875, "grad_norm": 0.3856346320602474, "learning_rate": 2.4305855458250373e-08, "loss": 0.2356, "step": 1012 }, { "epoch": 2.877840909090909, "grad_norm": 0.3526716263439721, "learning_rate": 2.3239256297136193e-08, "loss": 0.258, "step": 1013 }, { "epoch": 2.8806818181818183, "grad_norm": 0.41510762650616695, "learning_rate": 2.2196478386853624e-08, "loss": 0.3018, "step": 1014 }, { "epoch": 2.8835227272727275, "grad_norm": 0.2827340090469283, "learning_rate": 2.117753175674142e-08, "loss": 0.1949, "step": 1015 }, { "epoch": 2.8863636363636362, "grad_norm": 0.42491879871002564, "learning_rate": 2.0182426206932503e-08, "loss": 0.2607, "step": 1016 }, { "epoch": 2.8892045454545454, "grad_norm": 0.3281820518654654, "learning_rate": 1.921117130825767e-08, "loss": 0.266, "step": 1017 }, { "epoch": 2.8920454545454546, "grad_norm": 0.5241869397210815, "learning_rate": 1.82637764021551e-08, "loss": 0.2566, "step": 1018 }, { "epoch": 2.8948863636363638, "grad_norm": 0.36254656882284764, "learning_rate": 1.7340250600579588e-08, "loss": 0.2683, "step": 1019 }, { "epoch": 2.8977272727272725, "grad_norm": 0.32113348760758087, "learning_rate": 1.6440602785914584e-08, "loss": 0.2495, "step": 1020 }, { "epoch": 2.9005681818181817, "grad_norm": 0.39293475539987827, "learning_rate": 1.556484161088806e-08, "loss": 0.2673, "step": 1021 }, { "epoch": 2.903409090909091, "grad_norm": 0.3692023050105476, "learning_rate": 1.4712975498488158e-08, "loss": 0.2676, "step": 1022 }, { "epoch": 2.90625, "grad_norm": 0.3301143389304983, "learning_rate": 1.3885012641882967e-08, "loss": 0.2549, "step": 1023 }, { "epoch": 2.909090909090909, "grad_norm": 0.35000768054036296, "learning_rate": 1.3080961004340308e-08, "loss": 0.2769, "step": 1024 }, { "epoch": 2.9119318181818183, "grad_norm": 0.3257952826732702, "learning_rate": 1.2300828319153635e-08, "loss": 0.2455, "step": 1025 }, { "epoch": 2.9147727272727275, "grad_norm": 0.40990948777245817, "learning_rate": 1.1544622089565139e-08, "loss": 0.2999, "step": 1026 }, { "epoch": 2.9176136363636362, "grad_norm": 0.5767979073587421, "learning_rate": 1.0812349588694426e-08, "loss": 0.2985, "step": 1027 }, { "epoch": 2.9204545454545454, "grad_norm": 0.39277960762361686, "learning_rate": 1.010401785947024e-08, "loss": 0.3085, "step": 1028 }, { "epoch": 2.9232954545454546, "grad_norm": 0.3487911223998262, "learning_rate": 9.419633714559118e-09, "loss": 0.2771, "step": 1029 }, { "epoch": 2.9261363636363638, "grad_norm": 0.35424561590037207, "learning_rate": 8.759203736304067e-09, "loss": 0.2753, "step": 1030 }, { "epoch": 2.9289772727272725, "grad_norm": 0.29485879447799396, "learning_rate": 8.122734276657384e-09, "loss": 0.2089, "step": 1031 }, { "epoch": 2.9318181818181817, "grad_norm": 0.6294311065483419, "learning_rate": 7.51023145712293e-09, "loss": 0.3052, "step": 1032 }, { "epoch": 2.934659090909091, "grad_norm": 0.354680706106559, "learning_rate": 6.921701168694228e-09, "loss": 0.2638, "step": 1033 }, { "epoch": 2.9375, "grad_norm": 0.31404010890218703, "learning_rate": 6.357149071800628e-09, "loss": 0.2396, "step": 1034 }, { "epoch": 2.940340909090909, "grad_norm": 0.3191510959590836, "learning_rate": 5.816580596250676e-09, "loss": 0.2652, "step": 1035 }, { "epoch": 2.9431818181818183, "grad_norm": 0.33936071059872674, "learning_rate": 5.300000941180494e-09, "loss": 0.2761, "step": 1036 }, { "epoch": 2.9460227272727275, "grad_norm": 0.6694940206582203, "learning_rate": 4.807415075005206e-09, "loss": 0.2716, "step": 1037 }, { "epoch": 2.9488636363636362, "grad_norm": 0.3022654996639677, "learning_rate": 4.338827735368423e-09, "loss": 0.267, "step": 1038 }, { "epoch": 2.9517045454545454, "grad_norm": 0.31223716729746726, "learning_rate": 3.894243429098943e-09, "loss": 0.2556, "step": 1039 }, { "epoch": 2.9545454545454546, "grad_norm": 0.33999761359381697, "learning_rate": 3.4736664321671777e-09, "loss": 0.2234, "step": 1040 }, { "epoch": 2.9573863636363638, "grad_norm": 0.38818142260184346, "learning_rate": 3.0771007896424066e-09, "loss": 0.2822, "step": 1041 }, { "epoch": 2.9602272727272725, "grad_norm": 0.3915644733747401, "learning_rate": 2.7045503156555853e-09, "loss": 0.3089, "step": 1042 }, { "epoch": 2.9630681818181817, "grad_norm": 0.35070734375473045, "learning_rate": 2.3560185933621526e-09, "loss": 0.2485, "step": 1043 }, { "epoch": 2.965909090909091, "grad_norm": 0.32676962221864597, "learning_rate": 2.031508974907337e-09, "loss": 0.2564, "step": 1044 }, { "epoch": 2.96875, "grad_norm": 0.37376434665996433, "learning_rate": 1.7310245813939586e-09, "loss": 0.2843, "step": 1045 }, { "epoch": 2.971590909090909, "grad_norm": 0.3812123549505928, "learning_rate": 1.4545683028521772e-09, "loss": 0.2642, "step": 1046 }, { "epoch": 2.9744318181818183, "grad_norm": 0.40366173461812144, "learning_rate": 1.2021427982128463e-09, "loss": 0.2714, "step": 1047 }, { "epoch": 2.9772727272727275, "grad_norm": 0.38234650853272395, "learning_rate": 9.737504952803124e-10, "loss": 0.2483, "step": 1048 }, { "epoch": 2.9801136363636362, "grad_norm": 0.3581632163317752, "learning_rate": 7.693935907102102e-10, "loss": 0.2448, "step": 1049 }, { "epoch": 2.9829545454545454, "grad_norm": 0.44654505449503146, "learning_rate": 5.890740499878145e-10, "loss": 0.295, "step": 1050 }, { "epoch": 2.9857954545454546, "grad_norm": 0.33560840489821, "learning_rate": 4.3279360740972053e-10, "loss": 0.2217, "step": 1051 }, { "epoch": 2.9886363636363638, "grad_norm": 0.3283855292339783, "learning_rate": 3.005537660663582e-10, "loss": 0.219, "step": 1052 }, { "epoch": 2.9914772727272725, "grad_norm": 0.35996516047736465, "learning_rate": 1.923557978281143e-10, "loss": 0.2571, "step": 1053 }, { "epoch": 2.9943181818181817, "grad_norm": 0.30525860331677324, "learning_rate": 1.0820074333256492e-10, "loss": 0.2571, "step": 1054 }, { "epoch": 2.997159090909091, "grad_norm": 0.37025001234738963, "learning_rate": 4.808941197531614e-11, "loss": 0.269, "step": 1055 }, { "epoch": 3.0, "grad_norm": 0.3619551402376093, "learning_rate": 1.2022381901399815e-11, "loss": 0.24, "step": 1056 }, { "epoch": 3.0, "step": 1056, "total_flos": 1454552492015616.0, "train_loss": 0.28625056774101476, "train_runtime": 131042.0993, "train_samples_per_second": 0.258, "train_steps_per_second": 0.008 } ], "logging_steps": 1.0, "max_steps": 1056, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1454552492015616.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }