diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7435 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1056, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002840909090909091, + "grad_norm": 9.802746734876441, + "learning_rate": 0.0, + "loss": 0.6182, + "step": 1 + }, + { + "epoch": 0.005681818181818182, + "grad_norm": 12.388093403552265, + "learning_rate": 1.1627906976744187e-07, + "loss": 0.6649, + "step": 2 + }, + { + "epoch": 0.008522727272727272, + "grad_norm": 12.643095212909474, + "learning_rate": 2.3255813953488374e-07, + "loss": 0.6794, + "step": 3 + }, + { + "epoch": 0.011363636363636364, + "grad_norm": 9.628453582962425, + "learning_rate": 3.488372093023256e-07, + "loss": 0.5426, + "step": 4 + }, + { + "epoch": 0.014204545454545454, + "grad_norm": 12.114285873199693, + "learning_rate": 4.651162790697675e-07, + "loss": 0.6628, + "step": 5 + }, + { + "epoch": 0.017045454545454544, + "grad_norm": 12.590069642332757, + "learning_rate": 5.813953488372094e-07, + "loss": 0.6635, + "step": 6 + }, + { + "epoch": 0.019886363636363636, + "grad_norm": 11.894881604292143, + "learning_rate": 6.976744186046513e-07, + "loss": 0.6478, + "step": 7 + }, + { + "epoch": 0.022727272727272728, + "grad_norm": 10.523659604864859, + "learning_rate": 8.139534883720931e-07, + "loss": 0.6382, + "step": 8 + }, + { + "epoch": 0.02556818181818182, + "grad_norm": 9.260520595400251, + "learning_rate": 9.30232558139535e-07, + "loss": 0.5683, + "step": 9 + }, + { + "epoch": 0.028409090909090908, + "grad_norm": 8.701673712634479, + "learning_rate": 1.0465116279069768e-06, + "loss": 0.5677, + "step": 10 + }, + { + "epoch": 0.03125, + "grad_norm": 7.754246744436588, + "learning_rate": 1.1627906976744188e-06, + "loss": 0.5026, + "step": 11 + }, + { + "epoch": 0.03409090909090909, + "grad_norm": 8.663705476348797, + "learning_rate": 1.2790697674418605e-06, + "loss": 0.6104, + "step": 12 + }, + { + "epoch": 0.036931818181818184, + "grad_norm": 5.045315784322545, + "learning_rate": 1.3953488372093025e-06, + "loss": 0.4227, + "step": 13 + }, + { + "epoch": 0.03977272727272727, + "grad_norm": 4.926402953478099, + "learning_rate": 1.5116279069767443e-06, + "loss": 0.4896, + "step": 14 + }, + { + "epoch": 0.04261363636363636, + "grad_norm": 4.591926718398226, + "learning_rate": 1.6279069767441862e-06, + "loss": 0.4869, + "step": 15 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 4.197025239911461, + "learning_rate": 1.7441860465116282e-06, + "loss": 0.4637, + "step": 16 + }, + { + "epoch": 0.048295454545454544, + "grad_norm": 3.8588657903560684, + "learning_rate": 1.86046511627907e-06, + "loss": 0.4426, + "step": 17 + }, + { + "epoch": 0.05113636363636364, + "grad_norm": 1.8811670709600292, + "learning_rate": 1.976744186046512e-06, + "loss": 0.4305, + "step": 18 + }, + { + "epoch": 0.05397727272727273, + "grad_norm": 1.6752451580220031, + "learning_rate": 2.0930232558139536e-06, + "loss": 0.4529, + "step": 19 + }, + { + "epoch": 0.056818181818181816, + "grad_norm": 1.2090823975791671, + "learning_rate": 2.2093023255813954e-06, + "loss": 0.3613, + "step": 20 + }, + { + "epoch": 0.05965909090909091, + "grad_norm": 1.1814336772386804, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.4037, + "step": 21 + }, + { + "epoch": 0.0625, + "grad_norm": 0.8954725283144086, + "learning_rate": 2.4418604651162793e-06, + "loss": 0.3702, + "step": 22 + }, + { + "epoch": 0.06534090909090909, + "grad_norm": 0.8798870296631145, + "learning_rate": 2.558139534883721e-06, + "loss": 0.3973, + "step": 23 + }, + { + "epoch": 0.06818181818181818, + "grad_norm": 0.5832983194953867, + "learning_rate": 2.674418604651163e-06, + "loss": 0.3262, + "step": 24 + }, + { + "epoch": 0.07102272727272728, + "grad_norm": 0.8732475291899245, + "learning_rate": 2.790697674418605e-06, + "loss": 0.3909, + "step": 25 + }, + { + "epoch": 0.07386363636363637, + "grad_norm": 1.100897285846476, + "learning_rate": 2.9069767441860468e-06, + "loss": 0.3817, + "step": 26 + }, + { + "epoch": 0.07670454545454546, + "grad_norm": 1.0608377951702355, + "learning_rate": 3.0232558139534885e-06, + "loss": 0.3583, + "step": 27 + }, + { + "epoch": 0.07954545454545454, + "grad_norm": 1.0224952192594947, + "learning_rate": 3.1395348837209307e-06, + "loss": 0.4162, + "step": 28 + }, + { + "epoch": 0.08238636363636363, + "grad_norm": 0.8097165887156961, + "learning_rate": 3.2558139534883724e-06, + "loss": 0.3477, + "step": 29 + }, + { + "epoch": 0.08522727272727272, + "grad_norm": 0.7315228867679278, + "learning_rate": 3.372093023255814e-06, + "loss": 0.3951, + "step": 30 + }, + { + "epoch": 0.08806818181818182, + "grad_norm": 0.6032121177421607, + "learning_rate": 3.4883720930232564e-06, + "loss": 0.3414, + "step": 31 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.5651833216962348, + "learning_rate": 3.6046511627906977e-06, + "loss": 0.3635, + "step": 32 + }, + { + "epoch": 0.09375, + "grad_norm": 0.5192255380315864, + "learning_rate": 3.72093023255814e-06, + "loss": 0.3888, + "step": 33 + }, + { + "epoch": 0.09659090909090909, + "grad_norm": 0.49173473741498314, + "learning_rate": 3.837209302325582e-06, + "loss": 0.3749, + "step": 34 + }, + { + "epoch": 0.09943181818181818, + "grad_norm": 0.48300590116190206, + "learning_rate": 3.953488372093024e-06, + "loss": 0.3719, + "step": 35 + }, + { + "epoch": 0.10227272727272728, + "grad_norm": 0.47568795818970555, + "learning_rate": 4.0697674418604655e-06, + "loss": 0.3502, + "step": 36 + }, + { + "epoch": 0.10511363636363637, + "grad_norm": 0.5738976486828545, + "learning_rate": 4.186046511627907e-06, + "loss": 0.3553, + "step": 37 + }, + { + "epoch": 0.10795454545454546, + "grad_norm": 0.48281438241706864, + "learning_rate": 4.302325581395349e-06, + "loss": 0.3194, + "step": 38 + }, + { + "epoch": 0.11079545454545454, + "grad_norm": 0.6040813728082152, + "learning_rate": 4.418604651162791e-06, + "loss": 0.3753, + "step": 39 + }, + { + "epoch": 0.11363636363636363, + "grad_norm": 0.5510018703021852, + "learning_rate": 4.5348837209302326e-06, + "loss": 0.3497, + "step": 40 + }, + { + "epoch": 0.11647727272727272, + "grad_norm": 0.4265614122633672, + "learning_rate": 4.651162790697675e-06, + "loss": 0.3067, + "step": 41 + }, + { + "epoch": 0.11931818181818182, + "grad_norm": 0.3982552723726358, + "learning_rate": 4.767441860465117e-06, + "loss": 0.3166, + "step": 42 + }, + { + "epoch": 0.12215909090909091, + "grad_norm": 0.42319934937905634, + "learning_rate": 4.883720930232559e-06, + "loss": 0.3406, + "step": 43 + }, + { + "epoch": 0.125, + "grad_norm": 0.48844669962812265, + "learning_rate": 5e-06, + "loss": 0.3809, + "step": 44 + }, + { + "epoch": 0.1278409090909091, + "grad_norm": 0.4283299903892573, + "learning_rate": 4.999987977618099e-06, + "loss": 0.3487, + "step": 45 + }, + { + "epoch": 0.13068181818181818, + "grad_norm": 0.45165901843941525, + "learning_rate": 4.999951910588025e-06, + "loss": 0.3261, + "step": 46 + }, + { + "epoch": 0.13352272727272727, + "grad_norm": 0.3309060296669714, + "learning_rate": 4.999891799256668e-06, + "loss": 0.3122, + "step": 47 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 0.3836084760514636, + "learning_rate": 4.9998076442021725e-06, + "loss": 0.3001, + "step": 48 + }, + { + "epoch": 0.13920454545454544, + "grad_norm": 0.425230874245839, + "learning_rate": 4.999699446233934e-06, + "loss": 0.3341, + "step": 49 + }, + { + "epoch": 0.14204545454545456, + "grad_norm": 0.4444798732501407, + "learning_rate": 4.999567206392591e-06, + "loss": 0.3373, + "step": 50 + }, + { + "epoch": 0.14488636363636365, + "grad_norm": 0.381536539310927, + "learning_rate": 4.999410925950012e-06, + "loss": 0.3267, + "step": 51 + }, + { + "epoch": 0.14772727272727273, + "grad_norm": 0.3767650025962174, + "learning_rate": 4.99923060640929e-06, + "loss": 0.328, + "step": 52 + }, + { + "epoch": 0.15056818181818182, + "grad_norm": 0.3903203005773619, + "learning_rate": 4.99902624950472e-06, + "loss": 0.3367, + "step": 53 + }, + { + "epoch": 0.1534090909090909, + "grad_norm": 0.47731540090520985, + "learning_rate": 4.9987978572017875e-06, + "loss": 0.3749, + "step": 54 + }, + { + "epoch": 0.15625, + "grad_norm": 0.36341294567474813, + "learning_rate": 4.998545431697149e-06, + "loss": 0.2952, + "step": 55 + }, + { + "epoch": 0.1590909090909091, + "grad_norm": 0.4160548663852485, + "learning_rate": 4.998268975418606e-06, + "loss": 0.3779, + "step": 56 + }, + { + "epoch": 0.16193181818181818, + "grad_norm": 0.3664734921308225, + "learning_rate": 4.997968491025093e-06, + "loss": 0.3105, + "step": 57 + }, + { + "epoch": 0.16477272727272727, + "grad_norm": 0.35755496009312704, + "learning_rate": 4.997643981406638e-06, + "loss": 0.3508, + "step": 58 + }, + { + "epoch": 0.16761363636363635, + "grad_norm": 0.3738253178296096, + "learning_rate": 4.997295449684345e-06, + "loss": 0.349, + "step": 59 + }, + { + "epoch": 0.17045454545454544, + "grad_norm": 0.3175005755892801, + "learning_rate": 4.996922899210358e-06, + "loss": 0.2984, + "step": 60 + }, + { + "epoch": 0.17329545454545456, + "grad_norm": 0.39931619691125575, + "learning_rate": 4.996526333567833e-06, + "loss": 0.3627, + "step": 61 + }, + { + "epoch": 0.17613636363636365, + "grad_norm": 0.3726199489633269, + "learning_rate": 4.9961057565709015e-06, + "loss": 0.3274, + "step": 62 + }, + { + "epoch": 0.17897727272727273, + "grad_norm": 0.3954308613768431, + "learning_rate": 4.995661172264632e-06, + "loss": 0.34, + "step": 63 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.3814105011898473, + "learning_rate": 4.995192584924995e-06, + "loss": 0.3122, + "step": 64 + }, + { + "epoch": 0.1846590909090909, + "grad_norm": 0.3655156038716592, + "learning_rate": 4.99469999905882e-06, + "loss": 0.35, + "step": 65 + }, + { + "epoch": 0.1875, + "grad_norm": 0.39224970009402493, + "learning_rate": 4.99418341940375e-06, + "loss": 0.3057, + "step": 66 + }, + { + "epoch": 0.1903409090909091, + "grad_norm": 0.33083770067354695, + "learning_rate": 4.9936428509282e-06, + "loss": 0.3144, + "step": 67 + }, + { + "epoch": 0.19318181818181818, + "grad_norm": 0.31786460924484966, + "learning_rate": 4.9930782988313065e-06, + "loss": 0.3156, + "step": 68 + }, + { + "epoch": 0.19602272727272727, + "grad_norm": 0.3515587615165226, + "learning_rate": 4.992489768542877e-06, + "loss": 0.318, + "step": 69 + }, + { + "epoch": 0.19886363636363635, + "grad_norm": 0.39680326873271354, + "learning_rate": 4.991877265723343e-06, + "loss": 0.3319, + "step": 70 + }, + { + "epoch": 0.20170454545454544, + "grad_norm": 0.3532333123348208, + "learning_rate": 4.9912407962636965e-06, + "loss": 0.3343, + "step": 71 + }, + { + "epoch": 0.20454545454545456, + "grad_norm": 0.3684851475562903, + "learning_rate": 4.990580366285441e-06, + "loss": 0.3214, + "step": 72 + }, + { + "epoch": 0.20738636363636365, + "grad_norm": 0.34703126004025847, + "learning_rate": 4.98989598214053e-06, + "loss": 0.3497, + "step": 73 + }, + { + "epoch": 0.21022727272727273, + "grad_norm": 0.331786659705209, + "learning_rate": 4.989187650411306e-06, + "loss": 0.3119, + "step": 74 + }, + { + "epoch": 0.21306818181818182, + "grad_norm": 0.3514432926351399, + "learning_rate": 4.988455377910436e-06, + "loss": 0.3276, + "step": 75 + }, + { + "epoch": 0.2159090909090909, + "grad_norm": 0.45669134699095365, + "learning_rate": 4.987699171680846e-06, + "loss": 0.3502, + "step": 76 + }, + { + "epoch": 0.21875, + "grad_norm": 0.3799997391446089, + "learning_rate": 4.98691903899566e-06, + "loss": 0.3389, + "step": 77 + }, + { + "epoch": 0.2215909090909091, + "grad_norm": 0.32987905423731806, + "learning_rate": 4.986114987358118e-06, + "loss": 0.3154, + "step": 78 + }, + { + "epoch": 0.22443181818181818, + "grad_norm": 0.37320907794023317, + "learning_rate": 4.985287024501512e-06, + "loss": 0.2865, + "step": 79 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.3606727238448836, + "learning_rate": 4.9844351583891125e-06, + "loss": 0.3352, + "step": 80 + }, + { + "epoch": 0.23011363636363635, + "grad_norm": 0.28704484493903537, + "learning_rate": 4.983559397214086e-06, + "loss": 0.2761, + "step": 81 + }, + { + "epoch": 0.23295454545454544, + "grad_norm": 0.3395805127723043, + "learning_rate": 4.982659749399421e-06, + "loss": 0.3013, + "step": 82 + }, + { + "epoch": 0.23579545454545456, + "grad_norm": 0.32754503212231606, + "learning_rate": 4.981736223597845e-06, + "loss": 0.3291, + "step": 83 + }, + { + "epoch": 0.23863636363636365, + "grad_norm": 0.3278411182469415, + "learning_rate": 4.9807888286917425e-06, + "loss": 0.281, + "step": 84 + }, + { + "epoch": 0.24147727272727273, + "grad_norm": 0.3312034883074764, + "learning_rate": 4.979817573793068e-06, + "loss": 0.3484, + "step": 85 + }, + { + "epoch": 0.24431818181818182, + "grad_norm": 0.3001329867151946, + "learning_rate": 4.978822468243259e-06, + "loss": 0.2842, + "step": 86 + }, + { + "epoch": 0.2471590909090909, + "grad_norm": 0.3516159032278349, + "learning_rate": 4.977803521613147e-06, + "loss": 0.3084, + "step": 87 + }, + { + "epoch": 0.25, + "grad_norm": 0.3782753735314241, + "learning_rate": 4.9767607437028645e-06, + "loss": 0.3381, + "step": 88 + }, + { + "epoch": 0.2528409090909091, + "grad_norm": 0.3170089268559784, + "learning_rate": 4.97569414454175e-06, + "loss": 0.3215, + "step": 89 + }, + { + "epoch": 0.2556818181818182, + "grad_norm": 0.29420316873312097, + "learning_rate": 4.9746037343882545e-06, + "loss": 0.2998, + "step": 90 + }, + { + "epoch": 0.2585227272727273, + "grad_norm": 0.45657642279690197, + "learning_rate": 4.97348952372984e-06, + "loss": 0.3354, + "step": 91 + }, + { + "epoch": 0.26136363636363635, + "grad_norm": 0.32675165284478025, + "learning_rate": 4.972351523282878e-06, + "loss": 0.2715, + "step": 92 + }, + { + "epoch": 0.26420454545454547, + "grad_norm": 0.37411987401338476, + "learning_rate": 4.97118974399255e-06, + "loss": 0.331, + "step": 93 + }, + { + "epoch": 0.26704545454545453, + "grad_norm": 0.2906231907319114, + "learning_rate": 4.970004197032741e-06, + "loss": 0.2635, + "step": 94 + }, + { + "epoch": 0.26988636363636365, + "grad_norm": 0.42609899782651967, + "learning_rate": 4.968794893805927e-06, + "loss": 0.3662, + "step": 95 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.35277264498485456, + "learning_rate": 4.967561845943074e-06, + "loss": 0.3656, + "step": 96 + }, + { + "epoch": 0.2755681818181818, + "grad_norm": 0.33825537104063047, + "learning_rate": 4.966305065303519e-06, + "loss": 0.2949, + "step": 97 + }, + { + "epoch": 0.2784090909090909, + "grad_norm": 0.36200881129772927, + "learning_rate": 4.96502456397486e-06, + "loss": 0.3457, + "step": 98 + }, + { + "epoch": 0.28125, + "grad_norm": 0.31133758943801504, + "learning_rate": 4.963720354272837e-06, + "loss": 0.2831, + "step": 99 + }, + { + "epoch": 0.2840909090909091, + "grad_norm": 0.3398462998770164, + "learning_rate": 4.962392448741216e-06, + "loss": 0.308, + "step": 100 + }, + { + "epoch": 0.2869318181818182, + "grad_norm": 0.2825796948908475, + "learning_rate": 4.961040860151669e-06, + "loss": 0.2634, + "step": 101 + }, + { + "epoch": 0.2897727272727273, + "grad_norm": 0.38927704510942096, + "learning_rate": 4.9596656015036434e-06, + "loss": 0.2942, + "step": 102 + }, + { + "epoch": 0.29261363636363635, + "grad_norm": 0.35680520232446933, + "learning_rate": 4.95826668602425e-06, + "loss": 0.3148, + "step": 103 + }, + { + "epoch": 0.29545454545454547, + "grad_norm": 0.40848691247631896, + "learning_rate": 4.956844127168124e-06, + "loss": 0.3475, + "step": 104 + }, + { + "epoch": 0.29829545454545453, + "grad_norm": 0.3675982469780909, + "learning_rate": 4.955397938617304e-06, + "loss": 0.3223, + "step": 105 + }, + { + "epoch": 0.30113636363636365, + "grad_norm": 0.32048567892217283, + "learning_rate": 4.953928134281093e-06, + "loss": 0.316, + "step": 106 + }, + { + "epoch": 0.3039772727272727, + "grad_norm": 0.3107707861319827, + "learning_rate": 4.952434728295931e-06, + "loss": 0.3031, + "step": 107 + }, + { + "epoch": 0.3068181818181818, + "grad_norm": 0.38878643961644715, + "learning_rate": 4.950917735025256e-06, + "loss": 0.3355, + "step": 108 + }, + { + "epoch": 0.3096590909090909, + "grad_norm": 0.3735768679081344, + "learning_rate": 4.949377169059365e-06, + "loss": 0.3008, + "step": 109 + }, + { + "epoch": 0.3125, + "grad_norm": 0.3808439931809935, + "learning_rate": 4.947813045215277e-06, + "loss": 0.3002, + "step": 110 + }, + { + "epoch": 0.3153409090909091, + "grad_norm": 0.3256292929675435, + "learning_rate": 4.946225378536587e-06, + "loss": 0.2988, + "step": 111 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 0.35150877205189135, + "learning_rate": 4.944614184293321e-06, + "loss": 0.2993, + "step": 112 + }, + { + "epoch": 0.3210227272727273, + "grad_norm": 0.37494589367664166, + "learning_rate": 4.942979477981797e-06, + "loss": 0.3129, + "step": 113 + }, + { + "epoch": 0.32386363636363635, + "grad_norm": 0.3506621432286222, + "learning_rate": 4.941321275324463e-06, + "loss": 0.3015, + "step": 114 + }, + { + "epoch": 0.32670454545454547, + "grad_norm": 0.30804865814837706, + "learning_rate": 4.939639592269757e-06, + "loss": 0.2709, + "step": 115 + }, + { + "epoch": 0.32954545454545453, + "grad_norm": 0.4334401140811609, + "learning_rate": 4.9379344449919465e-06, + "loss": 0.3211, + "step": 116 + }, + { + "epoch": 0.33238636363636365, + "grad_norm": 0.4113976286859321, + "learning_rate": 4.936205849890977e-06, + "loss": 0.3486, + "step": 117 + }, + { + "epoch": 0.3352272727272727, + "grad_norm": 0.38143204868428404, + "learning_rate": 4.934453823592313e-06, + "loss": 0.3248, + "step": 118 + }, + { + "epoch": 0.3380681818181818, + "grad_norm": 0.3935231496732602, + "learning_rate": 4.9326783829467795e-06, + "loss": 0.3369, + "step": 119 + }, + { + "epoch": 0.3409090909090909, + "grad_norm": 0.3715854335519974, + "learning_rate": 4.930879545030395e-06, + "loss": 0.3162, + "step": 120 + }, + { + "epoch": 0.34375, + "grad_norm": 0.2987173708346766, + "learning_rate": 4.929057327144213e-06, + "loss": 0.2704, + "step": 121 + }, + { + "epoch": 0.3465909090909091, + "grad_norm": 0.3505876441509565, + "learning_rate": 4.927211746814155e-06, + "loss": 0.2897, + "step": 122 + }, + { + "epoch": 0.3494318181818182, + "grad_norm": 0.3808807666150658, + "learning_rate": 4.925342821790834e-06, + "loss": 0.298, + "step": 123 + }, + { + "epoch": 0.3522727272727273, + "grad_norm": 0.40265933198110954, + "learning_rate": 4.923450570049398e-06, + "loss": 0.3063, + "step": 124 + }, + { + "epoch": 0.35511363636363635, + "grad_norm": 0.329984359578131, + "learning_rate": 4.921535009789344e-06, + "loss": 0.281, + "step": 125 + }, + { + "epoch": 0.35795454545454547, + "grad_norm": 0.3327810259029677, + "learning_rate": 4.91959615943435e-06, + "loss": 0.3035, + "step": 126 + }, + { + "epoch": 0.36079545454545453, + "grad_norm": 0.33832701513333335, + "learning_rate": 4.917634037632095e-06, + "loss": 0.2817, + "step": 127 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.3446767418817894, + "learning_rate": 4.915648663254081e-06, + "loss": 0.3275, + "step": 128 + }, + { + "epoch": 0.3664772727272727, + "grad_norm": 0.4067285176470478, + "learning_rate": 4.9136400553954526e-06, + "loss": 0.2644, + "step": 129 + }, + { + "epoch": 0.3693181818181818, + "grad_norm": 0.32647438056937467, + "learning_rate": 4.91160823337481e-06, + "loss": 0.3012, + "step": 130 + }, + { + "epoch": 0.3721590909090909, + "grad_norm": 0.2641653305047082, + "learning_rate": 4.909553216734024e-06, + "loss": 0.2551, + "step": 131 + }, + { + "epoch": 0.375, + "grad_norm": 0.3587439503975781, + "learning_rate": 4.907475025238051e-06, + "loss": 0.3429, + "step": 132 + }, + { + "epoch": 0.3778409090909091, + "grad_norm": 0.39094595293189244, + "learning_rate": 4.905373678874741e-06, + "loss": 0.3428, + "step": 133 + }, + { + "epoch": 0.3806818181818182, + "grad_norm": 0.33295666810345625, + "learning_rate": 4.903249197854645e-06, + "loss": 0.3024, + "step": 134 + }, + { + "epoch": 0.3835227272727273, + "grad_norm": 0.4067834961803898, + "learning_rate": 4.90110160261082e-06, + "loss": 0.388, + "step": 135 + }, + { + "epoch": 0.38636363636363635, + "grad_norm": 0.3041105753158812, + "learning_rate": 4.898930913798635e-06, + "loss": 0.2791, + "step": 136 + }, + { + "epoch": 0.38920454545454547, + "grad_norm": 0.3854716077313248, + "learning_rate": 4.89673715229557e-06, + "loss": 0.3516, + "step": 137 + }, + { + "epoch": 0.39204545454545453, + "grad_norm": 0.41029172649451373, + "learning_rate": 4.894520339201014e-06, + "loss": 0.3221, + "step": 138 + }, + { + "epoch": 0.39488636363636365, + "grad_norm": 0.31953693308642406, + "learning_rate": 4.892280495836068e-06, + "loss": 0.3268, + "step": 139 + }, + { + "epoch": 0.3977272727272727, + "grad_norm": 0.4798811586379984, + "learning_rate": 4.890017643743334e-06, + "loss": 0.3115, + "step": 140 + }, + { + "epoch": 0.4005681818181818, + "grad_norm": 0.3603031050892597, + "learning_rate": 4.887731804686707e-06, + "loss": 0.2844, + "step": 141 + }, + { + "epoch": 0.4034090909090909, + "grad_norm": 0.40465606169589835, + "learning_rate": 4.885423000651174e-06, + "loss": 0.3573, + "step": 142 + }, + { + "epoch": 0.40625, + "grad_norm": 0.3643063680731307, + "learning_rate": 4.883091253842592e-06, + "loss": 0.2861, + "step": 143 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 0.2855806950882976, + "learning_rate": 4.8807365866874825e-06, + "loss": 0.2856, + "step": 144 + }, + { + "epoch": 0.4119318181818182, + "grad_norm": 0.43700846878534866, + "learning_rate": 4.878359021832812e-06, + "loss": 0.3025, + "step": 145 + }, + { + "epoch": 0.4147727272727273, + "grad_norm": 0.3691328488500052, + "learning_rate": 4.875958582145775e-06, + "loss": 0.3516, + "step": 146 + }, + { + "epoch": 0.41761363636363635, + "grad_norm": 0.3602263970719629, + "learning_rate": 4.873535290713571e-06, + "loss": 0.3276, + "step": 147 + }, + { + "epoch": 0.42045454545454547, + "grad_norm": 0.2873285630204768, + "learning_rate": 4.871089170843192e-06, + "loss": 0.272, + "step": 148 + }, + { + "epoch": 0.42329545454545453, + "grad_norm": 0.3275589221978115, + "learning_rate": 4.868620246061185e-06, + "loss": 0.3127, + "step": 149 + }, + { + "epoch": 0.42613636363636365, + "grad_norm": 0.3595600686315243, + "learning_rate": 4.866128540113436e-06, + "loss": 0.293, + "step": 150 + }, + { + "epoch": 0.4289772727272727, + "grad_norm": 0.39412366891247624, + "learning_rate": 4.863614076964937e-06, + "loss": 0.3105, + "step": 151 + }, + { + "epoch": 0.4318181818181818, + "grad_norm": 0.2967856642106585, + "learning_rate": 4.8610768807995575e-06, + "loss": 0.2488, + "step": 152 + }, + { + "epoch": 0.4346590909090909, + "grad_norm": 0.3353960107255814, + "learning_rate": 4.85851697601981e-06, + "loss": 0.31, + "step": 153 + }, + { + "epoch": 0.4375, + "grad_norm": 0.3293934153604414, + "learning_rate": 4.855934387246619e-06, + "loss": 0.31, + "step": 154 + }, + { + "epoch": 0.4403409090909091, + "grad_norm": 0.4020477745824599, + "learning_rate": 4.853329139319076e-06, + "loss": 0.3607, + "step": 155 + }, + { + "epoch": 0.4431818181818182, + "grad_norm": 0.40194438779646285, + "learning_rate": 4.850701257294212e-06, + "loss": 0.3194, + "step": 156 + }, + { + "epoch": 0.4460227272727273, + "grad_norm": 0.35880107189234606, + "learning_rate": 4.848050766446746e-06, + "loss": 0.3257, + "step": 157 + }, + { + "epoch": 0.44886363636363635, + "grad_norm": 0.3225921590602741, + "learning_rate": 4.84537769226885e-06, + "loss": 0.2865, + "step": 158 + }, + { + "epoch": 0.45170454545454547, + "grad_norm": 0.43105913904133064, + "learning_rate": 4.842682060469899e-06, + "loss": 0.2917, + "step": 159 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.3984098156673031, + "learning_rate": 4.839963896976223e-06, + "loss": 0.3137, + "step": 160 + }, + { + "epoch": 0.45738636363636365, + "grad_norm": 0.34203541957482897, + "learning_rate": 4.837223227930864e-06, + "loss": 0.3021, + "step": 161 + }, + { + "epoch": 0.4602272727272727, + "grad_norm": 0.3410914811625815, + "learning_rate": 4.834460079693317e-06, + "loss": 0.3197, + "step": 162 + }, + { + "epoch": 0.4630681818181818, + "grad_norm": 0.3668120756523038, + "learning_rate": 4.831674478839281e-06, + "loss": 0.3242, + "step": 163 + }, + { + "epoch": 0.4659090909090909, + "grad_norm": 0.34128762447014865, + "learning_rate": 4.828866452160402e-06, + "loss": 0.2626, + "step": 164 + }, + { + "epoch": 0.46875, + "grad_norm": 0.34134817423813496, + "learning_rate": 4.826036026664014e-06, + "loss": 0.2771, + "step": 165 + }, + { + "epoch": 0.4715909090909091, + "grad_norm": 0.3270025125687817, + "learning_rate": 4.823183229572883e-06, + "loss": 0.2921, + "step": 166 + }, + { + "epoch": 0.4744318181818182, + "grad_norm": 0.3701876487404051, + "learning_rate": 4.820308088324942e-06, + "loss": 0.3315, + "step": 167 + }, + { + "epoch": 0.4772727272727273, + "grad_norm": 0.4223541290676315, + "learning_rate": 4.8174106305730284e-06, + "loss": 0.3458, + "step": 168 + }, + { + "epoch": 0.48011363636363635, + "grad_norm": 0.36826807946452467, + "learning_rate": 4.814490884184615e-06, + "loss": 0.3098, + "step": 169 + }, + { + "epoch": 0.48295454545454547, + "grad_norm": 0.34247450811498126, + "learning_rate": 4.811548877241549e-06, + "loss": 0.2794, + "step": 170 + }, + { + "epoch": 0.48579545454545453, + "grad_norm": 0.36931394013248037, + "learning_rate": 4.808584638039774e-06, + "loss": 0.3075, + "step": 171 + }, + { + "epoch": 0.48863636363636365, + "grad_norm": 0.38654212773141833, + "learning_rate": 4.805598195089063e-06, + "loss": 0.2957, + "step": 172 + }, + { + "epoch": 0.4914772727272727, + "grad_norm": 0.327791247654709, + "learning_rate": 4.802589577112742e-06, + "loss": 0.317, + "step": 173 + }, + { + "epoch": 0.4943181818181818, + "grad_norm": 0.4180368575468772, + "learning_rate": 4.7995588130474145e-06, + "loss": 0.2873, + "step": 174 + }, + { + "epoch": 0.4971590909090909, + "grad_norm": 0.41772200012858535, + "learning_rate": 4.7965059320426825e-06, + "loss": 0.3365, + "step": 175 + }, + { + "epoch": 0.5, + "grad_norm": 0.3622810863279747, + "learning_rate": 4.7934309634608676e-06, + "loss": 0.3406, + "step": 176 + }, + { + "epoch": 0.5028409090909091, + "grad_norm": 0.33039829085718986, + "learning_rate": 4.790333936876727e-06, + "loss": 0.2582, + "step": 177 + }, + { + "epoch": 0.5056818181818182, + "grad_norm": 0.2963847161562058, + "learning_rate": 4.78721488207717e-06, + "loss": 0.2621, + "step": 178 + }, + { + "epoch": 0.5085227272727273, + "grad_norm": 0.3688579036529526, + "learning_rate": 4.7840738290609714e-06, + "loss": 0.3106, + "step": 179 + }, + { + "epoch": 0.5113636363636364, + "grad_norm": 0.3882009236138182, + "learning_rate": 4.78091080803848e-06, + "loss": 0.2615, + "step": 180 + }, + { + "epoch": 0.5142045454545454, + "grad_norm": 0.35367280178437593, + "learning_rate": 4.777725849431336e-06, + "loss": 0.3045, + "step": 181 + }, + { + "epoch": 0.5170454545454546, + "grad_norm": 0.3874603305325755, + "learning_rate": 4.774518983872169e-06, + "loss": 0.3151, + "step": 182 + }, + { + "epoch": 0.5198863636363636, + "grad_norm": 0.3089601400335368, + "learning_rate": 4.77129024220431e-06, + "loss": 0.2565, + "step": 183 + }, + { + "epoch": 0.5227272727272727, + "grad_norm": 0.3741939570187776, + "learning_rate": 4.7680396554814886e-06, + "loss": 0.2824, + "step": 184 + }, + { + "epoch": 0.5255681818181818, + "grad_norm": 0.3684238808190501, + "learning_rate": 4.764767254967544e-06, + "loss": 0.2717, + "step": 185 + }, + { + "epoch": 0.5284090909090909, + "grad_norm": 0.34181925499552346, + "learning_rate": 4.761473072136114e-06, + "loss": 0.2984, + "step": 186 + }, + { + "epoch": 0.53125, + "grad_norm": 0.44267647661167453, + "learning_rate": 4.758157138670337e-06, + "loss": 0.3472, + "step": 187 + }, + { + "epoch": 0.5340909090909091, + "grad_norm": 0.3887831736377981, + "learning_rate": 4.75481948646255e-06, + "loss": 0.3111, + "step": 188 + }, + { + "epoch": 0.5369318181818182, + "grad_norm": 0.3683856304101638, + "learning_rate": 4.751460147613973e-06, + "loss": 0.3146, + "step": 189 + }, + { + "epoch": 0.5397727272727273, + "grad_norm": 0.38527593119976, + "learning_rate": 4.748079154434413e-06, + "loss": 0.3314, + "step": 190 + }, + { + "epoch": 0.5426136363636364, + "grad_norm": 0.4031772051747187, + "learning_rate": 4.744676539441941e-06, + "loss": 0.315, + "step": 191 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.3353722780310112, + "learning_rate": 4.741252335362588e-06, + "loss": 0.269, + "step": 192 + }, + { + "epoch": 0.5482954545454546, + "grad_norm": 0.3394618273632171, + "learning_rate": 4.737806575130024e-06, + "loss": 0.2745, + "step": 193 + }, + { + "epoch": 0.5511363636363636, + "grad_norm": 0.4450532210463518, + "learning_rate": 4.734339291885246e-06, + "loss": 0.3188, + "step": 194 + }, + { + "epoch": 0.5539772727272727, + "grad_norm": 0.397975066441739, + "learning_rate": 4.7308505189762565e-06, + "loss": 0.2985, + "step": 195 + }, + { + "epoch": 0.5568181818181818, + "grad_norm": 0.3440535351319966, + "learning_rate": 4.727340289957744e-06, + "loss": 0.2809, + "step": 196 + }, + { + "epoch": 0.5596590909090909, + "grad_norm": 0.6446149440778554, + "learning_rate": 4.723808638590759e-06, + "loss": 0.3218, + "step": 197 + }, + { + "epoch": 0.5625, + "grad_norm": 0.37606508969708213, + "learning_rate": 4.720255598842392e-06, + "loss": 0.3176, + "step": 198 + }, + { + "epoch": 0.5653409090909091, + "grad_norm": 0.43147254520622674, + "learning_rate": 4.716681204885442e-06, + "loss": 0.3268, + "step": 199 + }, + { + "epoch": 0.5681818181818182, + "grad_norm": 0.41993041372097106, + "learning_rate": 4.713085491098093e-06, + "loss": 0.2804, + "step": 200 + }, + { + "epoch": 0.5710227272727273, + "grad_norm": 0.48960282010679945, + "learning_rate": 4.70946849206358e-06, + "loss": 0.3996, + "step": 201 + }, + { + "epoch": 0.5738636363636364, + "grad_norm": 0.3375570582028718, + "learning_rate": 4.705830242569859e-06, + "loss": 0.2914, + "step": 202 + }, + { + "epoch": 0.5767045454545454, + "grad_norm": 0.33067898836626264, + "learning_rate": 4.70217077760927e-06, + "loss": 0.2717, + "step": 203 + }, + { + "epoch": 0.5795454545454546, + "grad_norm": 0.3919628586280393, + "learning_rate": 4.6984901323781996e-06, + "loss": 0.2758, + "step": 204 + }, + { + "epoch": 0.5823863636363636, + "grad_norm": 0.37621132131624546, + "learning_rate": 4.6947883422767475e-06, + "loss": 0.2927, + "step": 205 + }, + { + "epoch": 0.5852272727272727, + "grad_norm": 0.3588621280506994, + "learning_rate": 4.69106544290838e-06, + "loss": 0.3202, + "step": 206 + }, + { + "epoch": 0.5880681818181818, + "grad_norm": 0.36135048731331515, + "learning_rate": 4.687321470079593e-06, + "loss": 0.3075, + "step": 207 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 0.3804960320633388, + "learning_rate": 4.683556459799562e-06, + "loss": 0.304, + "step": 208 + }, + { + "epoch": 0.59375, + "grad_norm": 0.32482777456644224, + "learning_rate": 4.679770448279801e-06, + "loss": 0.2333, + "step": 209 + }, + { + "epoch": 0.5965909090909091, + "grad_norm": 0.38423666885394503, + "learning_rate": 4.6759634719338106e-06, + "loss": 0.3079, + "step": 210 + }, + { + "epoch": 0.5994318181818182, + "grad_norm": 0.3584077009643052, + "learning_rate": 4.672135567376729e-06, + "loss": 0.3078, + "step": 211 + }, + { + "epoch": 0.6022727272727273, + "grad_norm": 0.43190228684358967, + "learning_rate": 4.668286771424982e-06, + "loss": 0.3693, + "step": 212 + }, + { + "epoch": 0.6051136363636364, + "grad_norm": 0.3335333217535499, + "learning_rate": 4.664417121095925e-06, + "loss": 0.2978, + "step": 213 + }, + { + "epoch": 0.6079545454545454, + "grad_norm": 0.3343126694937098, + "learning_rate": 4.660526653607489e-06, + "loss": 0.2654, + "step": 214 + }, + { + "epoch": 0.6107954545454546, + "grad_norm": 0.400588578067547, + "learning_rate": 4.656615406377824e-06, + "loss": 0.3541, + "step": 215 + }, + { + "epoch": 0.6136363636363636, + "grad_norm": 0.28366454469863744, + "learning_rate": 4.652683417024933e-06, + "loss": 0.2595, + "step": 216 + }, + { + "epoch": 0.6164772727272727, + "grad_norm": 0.3333388085745537, + "learning_rate": 4.648730723366321e-06, + "loss": 0.3034, + "step": 217 + }, + { + "epoch": 0.6193181818181818, + "grad_norm": 0.3802324883963107, + "learning_rate": 4.644757363418622e-06, + "loss": 0.3149, + "step": 218 + }, + { + "epoch": 0.6221590909090909, + "grad_norm": 0.3323209944938239, + "learning_rate": 4.640763375397235e-06, + "loss": 0.2831, + "step": 219 + }, + { + "epoch": 0.625, + "grad_norm": 0.3816473948946037, + "learning_rate": 4.636748797715961e-06, + "loss": 0.2901, + "step": 220 + }, + { + "epoch": 0.6278409090909091, + "grad_norm": 0.45087508944423654, + "learning_rate": 4.632713668986628e-06, + "loss": 0.2668, + "step": 221 + }, + { + "epoch": 0.6306818181818182, + "grad_norm": 0.3277834281020941, + "learning_rate": 4.628658028018723e-06, + "loss": 0.3115, + "step": 222 + }, + { + "epoch": 0.6335227272727273, + "grad_norm": 0.4149700033604779, + "learning_rate": 4.624581913819019e-06, + "loss": 0.3049, + "step": 223 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.2986911926260575, + "learning_rate": 4.6204853655911945e-06, + "loss": 0.2828, + "step": 224 + }, + { + "epoch": 0.6392045454545454, + "grad_norm": 0.38662077935688544, + "learning_rate": 4.6163684227354656e-06, + "loss": 0.3019, + "step": 225 + }, + { + "epoch": 0.6420454545454546, + "grad_norm": 0.3670137115048512, + "learning_rate": 4.612231124848199e-06, + "loss": 0.2998, + "step": 226 + }, + { + "epoch": 0.6448863636363636, + "grad_norm": 0.3820920011764151, + "learning_rate": 4.608073511721534e-06, + "loss": 0.3627, + "step": 227 + }, + { + "epoch": 0.6477272727272727, + "grad_norm": 0.26469955866368194, + "learning_rate": 4.6038956233430034e-06, + "loss": 0.2419, + "step": 228 + }, + { + "epoch": 0.6505681818181818, + "grad_norm": 0.32240469660709375, + "learning_rate": 4.59969749989514e-06, + "loss": 0.2692, + "step": 229 + }, + { + "epoch": 0.6534090909090909, + "grad_norm": 0.3896277142098736, + "learning_rate": 4.5954791817551e-06, + "loss": 0.2789, + "step": 230 + }, + { + "epoch": 0.65625, + "grad_norm": 0.3510490299412409, + "learning_rate": 4.591240709494269e-06, + "loss": 0.281, + "step": 231 + }, + { + "epoch": 0.6590909090909091, + "grad_norm": 0.3636438474583087, + "learning_rate": 4.586982123877871e-06, + "loss": 0.2998, + "step": 232 + }, + { + "epoch": 0.6619318181818182, + "grad_norm": 0.3274578399993675, + "learning_rate": 4.582703465864582e-06, + "loss": 0.2758, + "step": 233 + }, + { + "epoch": 0.6647727272727273, + "grad_norm": 0.3205713499503409, + "learning_rate": 4.5784047766061305e-06, + "loss": 0.2716, + "step": 234 + }, + { + "epoch": 0.6676136363636364, + "grad_norm": 0.47159005981022434, + "learning_rate": 4.574086097446903e-06, + "loss": 0.3236, + "step": 235 + }, + { + "epoch": 0.6704545454545454, + "grad_norm": 0.3617567220761258, + "learning_rate": 4.569747469923547e-06, + "loss": 0.2863, + "step": 236 + }, + { + "epoch": 0.6732954545454546, + "grad_norm": 0.32166940611651096, + "learning_rate": 4.565388935764572e-06, + "loss": 0.31, + "step": 237 + }, + { + "epoch": 0.6761363636363636, + "grad_norm": 0.3982166865116622, + "learning_rate": 4.56101053688995e-06, + "loss": 0.2874, + "step": 238 + }, + { + "epoch": 0.6789772727272727, + "grad_norm": 0.4339388465917976, + "learning_rate": 4.5566123154107055e-06, + "loss": 0.3374, + "step": 239 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.36030799942916975, + "learning_rate": 4.552194313628518e-06, + "loss": 0.2668, + "step": 240 + }, + { + "epoch": 0.6846590909090909, + "grad_norm": 0.3940718141510353, + "learning_rate": 4.547756574035311e-06, + "loss": 0.3277, + "step": 241 + }, + { + "epoch": 0.6875, + "grad_norm": 0.4326472723953054, + "learning_rate": 4.5432991393128446e-06, + "loss": 0.3227, + "step": 242 + }, + { + "epoch": 0.6903409090909091, + "grad_norm": 0.41998189617141085, + "learning_rate": 4.538822052332306e-06, + "loss": 0.339, + "step": 243 + }, + { + "epoch": 0.6931818181818182, + "grad_norm": 0.36510653915186314, + "learning_rate": 4.534325356153892e-06, + "loss": 0.2637, + "step": 244 + }, + { + "epoch": 0.6960227272727273, + "grad_norm": 0.4748073641254545, + "learning_rate": 4.529809094026404e-06, + "loss": 0.3226, + "step": 245 + }, + { + "epoch": 0.6988636363636364, + "grad_norm": 0.3848777680236735, + "learning_rate": 4.525273309386825e-06, + "loss": 0.3401, + "step": 246 + }, + { + "epoch": 0.7017045454545454, + "grad_norm": 0.286675785535149, + "learning_rate": 4.5207180458599e-06, + "loss": 0.2495, + "step": 247 + }, + { + "epoch": 0.7045454545454546, + "grad_norm": 0.3770143744991594, + "learning_rate": 4.516143347257726e-06, + "loss": 0.2923, + "step": 248 + }, + { + "epoch": 0.7073863636363636, + "grad_norm": 0.37240976329747977, + "learning_rate": 4.511549257579322e-06, + "loss": 0.2968, + "step": 249 + }, + { + "epoch": 0.7102272727272727, + "grad_norm": 0.53790018713925, + "learning_rate": 4.506935821010206e-06, + "loss": 0.298, + "step": 250 + }, + { + "epoch": 0.7130681818181818, + "grad_norm": 0.3896643010491094, + "learning_rate": 4.502303081921978e-06, + "loss": 0.3125, + "step": 251 + }, + { + "epoch": 0.7159090909090909, + "grad_norm": 0.32770126981260167, + "learning_rate": 4.497651084871883e-06, + "loss": 0.2781, + "step": 252 + }, + { + "epoch": 0.71875, + "grad_norm": 0.3541924637393212, + "learning_rate": 4.492979874602389e-06, + "loss": 0.3023, + "step": 253 + }, + { + "epoch": 0.7215909090909091, + "grad_norm": 0.3735099253437524, + "learning_rate": 4.4882894960407566e-06, + "loss": 0.3225, + "step": 254 + }, + { + "epoch": 0.7244318181818182, + "grad_norm": 0.3853359485269271, + "learning_rate": 4.483579994298602e-06, + "loss": 0.3119, + "step": 255 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.4232262055395998, + "learning_rate": 4.478851414671469e-06, + "loss": 0.2996, + "step": 256 + }, + { + "epoch": 0.7301136363636364, + "grad_norm": 0.3403475343187684, + "learning_rate": 4.474103802638389e-06, + "loss": 0.2948, + "step": 257 + }, + { + "epoch": 0.7329545454545454, + "grad_norm": 0.4197482437210073, + "learning_rate": 4.469337203861447e-06, + "loss": 0.2999, + "step": 258 + }, + { + "epoch": 0.7357954545454546, + "grad_norm": 0.33941700168906186, + "learning_rate": 4.464551664185339e-06, + "loss": 0.2636, + "step": 259 + }, + { + "epoch": 0.7386363636363636, + "grad_norm": 0.35067662494508334, + "learning_rate": 4.459747229636933e-06, + "loss": 0.3153, + "step": 260 + }, + { + "epoch": 0.7414772727272727, + "grad_norm": 0.33432839847763335, + "learning_rate": 4.454923946424827e-06, + "loss": 0.2646, + "step": 261 + }, + { + "epoch": 0.7443181818181818, + "grad_norm": 0.3486384565640427, + "learning_rate": 4.450081860938904e-06, + "loss": 0.3026, + "step": 262 + }, + { + "epoch": 0.7471590909090909, + "grad_norm": 0.3647193452879592, + "learning_rate": 4.4452210197498845e-06, + "loss": 0.3208, + "step": 263 + }, + { + "epoch": 0.75, + "grad_norm": 0.3621939393169193, + "learning_rate": 4.440341469608879e-06, + "loss": 0.3042, + "step": 264 + }, + { + "epoch": 0.7528409090909091, + "grad_norm": 0.2856803312521231, + "learning_rate": 4.43544325744694e-06, + "loss": 0.2548, + "step": 265 + }, + { + "epoch": 0.7556818181818182, + "grad_norm": 0.41636147550676134, + "learning_rate": 4.4305264303746085e-06, + "loss": 0.2743, + "step": 266 + }, + { + "epoch": 0.7585227272727273, + "grad_norm": 0.3149004485762251, + "learning_rate": 4.425591035681465e-06, + "loss": 0.2768, + "step": 267 + }, + { + "epoch": 0.7613636363636364, + "grad_norm": 0.39793987625802313, + "learning_rate": 4.420637120835668e-06, + "loss": 0.3055, + "step": 268 + }, + { + "epoch": 0.7642045454545454, + "grad_norm": 0.4058178861459375, + "learning_rate": 4.415664733483502e-06, + "loss": 0.3168, + "step": 269 + }, + { + "epoch": 0.7670454545454546, + "grad_norm": 0.3732878019312248, + "learning_rate": 4.4106739214489195e-06, + "loss": 0.2935, + "step": 270 + }, + { + "epoch": 0.7698863636363636, + "grad_norm": 0.31801887671340195, + "learning_rate": 4.405664732733079e-06, + "loss": 0.2768, + "step": 271 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 0.43538965465048635, + "learning_rate": 4.400637215513883e-06, + "loss": 0.2644, + "step": 272 + }, + { + "epoch": 0.7755681818181818, + "grad_norm": 0.3619890541849985, + "learning_rate": 4.395591418145519e-06, + "loss": 0.2671, + "step": 273 + }, + { + "epoch": 0.7784090909090909, + "grad_norm": 0.43611885998338823, + "learning_rate": 4.390527389157989e-06, + "loss": 0.3481, + "step": 274 + }, + { + "epoch": 0.78125, + "grad_norm": 0.411038314679305, + "learning_rate": 4.385445177256646e-06, + "loss": 0.3283, + "step": 275 + }, + { + "epoch": 0.7840909090909091, + "grad_norm": 0.4004177118376606, + "learning_rate": 4.380344831321722e-06, + "loss": 0.3421, + "step": 276 + }, + { + "epoch": 0.7869318181818182, + "grad_norm": 0.31547958031028983, + "learning_rate": 4.375226400407863e-06, + "loss": 0.2541, + "step": 277 + }, + { + "epoch": 0.7897727272727273, + "grad_norm": 0.36900762280860266, + "learning_rate": 4.370089933743654e-06, + "loss": 0.3097, + "step": 278 + }, + { + "epoch": 0.7926136363636364, + "grad_norm": 0.4686945698836896, + "learning_rate": 4.364935480731147e-06, + "loss": 0.2918, + "step": 279 + }, + { + "epoch": 0.7954545454545454, + "grad_norm": 0.3509902009735286, + "learning_rate": 4.3597630909453835e-06, + "loss": 0.2646, + "step": 280 + }, + { + "epoch": 0.7982954545454546, + "grad_norm": 0.30875325359327965, + "learning_rate": 4.35457281413392e-06, + "loss": 0.2349, + "step": 281 + }, + { + "epoch": 0.8011363636363636, + "grad_norm": 0.3943745151294021, + "learning_rate": 4.349364700216346e-06, + "loss": 0.2764, + "step": 282 + }, + { + "epoch": 0.8039772727272727, + "grad_norm": 0.35558604531483284, + "learning_rate": 4.344138799283814e-06, + "loss": 0.2442, + "step": 283 + }, + { + "epoch": 0.8068181818181818, + "grad_norm": 0.38278211936173095, + "learning_rate": 4.338895161598541e-06, + "loss": 0.3294, + "step": 284 + }, + { + "epoch": 0.8096590909090909, + "grad_norm": 0.3932974746294698, + "learning_rate": 4.333633837593341e-06, + "loss": 0.2951, + "step": 285 + }, + { + "epoch": 0.8125, + "grad_norm": 0.31762648150994005, + "learning_rate": 4.328354877871131e-06, + "loss": 0.2612, + "step": 286 + }, + { + "epoch": 0.8153409090909091, + "grad_norm": 0.3405862130473983, + "learning_rate": 4.323058333204446e-06, + "loss": 0.2833, + "step": 287 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 0.31883855959276614, + "learning_rate": 4.317744254534954e-06, + "loss": 0.2609, + "step": 288 + }, + { + "epoch": 0.8210227272727273, + "grad_norm": 0.39913277335187336, + "learning_rate": 4.312412692972959e-06, + "loss": 0.2758, + "step": 289 + }, + { + "epoch": 0.8238636363636364, + "grad_norm": 0.39064418227258985, + "learning_rate": 4.307063699796918e-06, + "loss": 0.2664, + "step": 290 + }, + { + "epoch": 0.8267045454545454, + "grad_norm": 0.3126978473531618, + "learning_rate": 4.301697326452942e-06, + "loss": 0.2572, + "step": 291 + }, + { + "epoch": 0.8295454545454546, + "grad_norm": 0.3641340405050646, + "learning_rate": 4.296313624554303e-06, + "loss": 0.286, + "step": 292 + }, + { + "epoch": 0.8323863636363636, + "grad_norm": 0.4168496899263259, + "learning_rate": 4.290912645880936e-06, + "loss": 0.3035, + "step": 293 + }, + { + "epoch": 0.8352272727272727, + "grad_norm": 0.3466683321895305, + "learning_rate": 4.285494442378945e-06, + "loss": 0.2853, + "step": 294 + }, + { + "epoch": 0.8380681818181818, + "grad_norm": 0.3572355149237221, + "learning_rate": 4.280059066160098e-06, + "loss": 0.3021, + "step": 295 + }, + { + "epoch": 0.8409090909090909, + "grad_norm": 0.36054386776426756, + "learning_rate": 4.274606569501332e-06, + "loss": 0.3041, + "step": 296 + }, + { + "epoch": 0.84375, + "grad_norm": 0.3220431488871405, + "learning_rate": 4.269137004844242e-06, + "loss": 0.2542, + "step": 297 + }, + { + "epoch": 0.8465909090909091, + "grad_norm": 0.4103185848899213, + "learning_rate": 4.2636504247945865e-06, + "loss": 0.2859, + "step": 298 + }, + { + "epoch": 0.8494318181818182, + "grad_norm": 0.3444474167498623, + "learning_rate": 4.258146882121772e-06, + "loss": 0.3082, + "step": 299 + }, + { + "epoch": 0.8522727272727273, + "grad_norm": 0.35145064032825696, + "learning_rate": 4.252626429758354e-06, + "loss": 0.2679, + "step": 300 + }, + { + "epoch": 0.8551136363636364, + "grad_norm": 0.39931471518127176, + "learning_rate": 4.247089120799521e-06, + "loss": 0.3486, + "step": 301 + }, + { + "epoch": 0.8579545454545454, + "grad_norm": 0.2860970262972797, + "learning_rate": 4.241535008502587e-06, + "loss": 0.23, + "step": 302 + }, + { + "epoch": 0.8607954545454546, + "grad_norm": 0.4649020596412495, + "learning_rate": 4.235964146286479e-06, + "loss": 0.3252, + "step": 303 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 0.3482820070437071, + "learning_rate": 4.230376587731225e-06, + "loss": 0.2854, + "step": 304 + }, + { + "epoch": 0.8664772727272727, + "grad_norm": 0.3269410990279316, + "learning_rate": 4.2247723865774336e-06, + "loss": 0.2563, + "step": 305 + }, + { + "epoch": 0.8693181818181818, + "grad_norm": 0.31949294830520775, + "learning_rate": 4.219151596725782e-06, + "loss": 0.2688, + "step": 306 + }, + { + "epoch": 0.8721590909090909, + "grad_norm": 0.43502447469171057, + "learning_rate": 4.213514272236499e-06, + "loss": 0.3386, + "step": 307 + }, + { + "epoch": 0.875, + "grad_norm": 0.3797117211719601, + "learning_rate": 4.207860467328835e-06, + "loss": 0.2855, + "step": 308 + }, + { + "epoch": 0.8778409090909091, + "grad_norm": 0.3799062699361923, + "learning_rate": 4.202190236380552e-06, + "loss": 0.2545, + "step": 309 + }, + { + "epoch": 0.8806818181818182, + "grad_norm": 0.3360385792661154, + "learning_rate": 4.196503633927398e-06, + "loss": 0.2909, + "step": 310 + }, + { + "epoch": 0.8835227272727273, + "grad_norm": 0.4188943106281552, + "learning_rate": 4.190800714662576e-06, + "loss": 0.3291, + "step": 311 + }, + { + "epoch": 0.8863636363636364, + "grad_norm": 0.43183074366157487, + "learning_rate": 4.185081533436226e-06, + "loss": 0.3303, + "step": 312 + }, + { + "epoch": 0.8892045454545454, + "grad_norm": 0.35087669084397133, + "learning_rate": 4.179346145254892e-06, + "loss": 0.3152, + "step": 313 + }, + { + "epoch": 0.8920454545454546, + "grad_norm": 0.34360678080641915, + "learning_rate": 4.173594605280995e-06, + "loss": 0.2726, + "step": 314 + }, + { + "epoch": 0.8948863636363636, + "grad_norm": 0.39638626020449463, + "learning_rate": 4.1678269688323045e-06, + "loss": 0.3369, + "step": 315 + }, + { + "epoch": 0.8977272727272727, + "grad_norm": 0.3566510725505037, + "learning_rate": 4.1620432913814026e-06, + "loss": 0.2469, + "step": 316 + }, + { + "epoch": 0.9005681818181818, + "grad_norm": 0.32842562735745623, + "learning_rate": 4.156243628555151e-06, + "loss": 0.3018, + "step": 317 + }, + { + "epoch": 0.9034090909090909, + "grad_norm": 0.30679142774263857, + "learning_rate": 4.150428036134161e-06, + "loss": 0.2476, + "step": 318 + }, + { + "epoch": 0.90625, + "grad_norm": 0.38736943533330265, + "learning_rate": 4.144596570052249e-06, + "loss": 0.279, + "step": 319 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.3461038914128392, + "learning_rate": 4.1387492863959076e-06, + "loss": 0.262, + "step": 320 + }, + { + "epoch": 0.9119318181818182, + "grad_norm": 0.3328949084965424, + "learning_rate": 4.132886241403756e-06, + "loss": 0.2841, + "step": 321 + }, + { + "epoch": 0.9147727272727273, + "grad_norm": 0.3684252037786764, + "learning_rate": 4.127007491466008e-06, + "loss": 0.3032, + "step": 322 + }, + { + "epoch": 0.9176136363636364, + "grad_norm": 0.44163540100916987, + "learning_rate": 4.121113093123925e-06, + "loss": 0.3164, + "step": 323 + }, + { + "epoch": 0.9204545454545454, + "grad_norm": 0.49048074141989995, + "learning_rate": 4.115203103069273e-06, + "loss": 0.2623, + "step": 324 + }, + { + "epoch": 0.9232954545454546, + "grad_norm": 0.34827477492871306, + "learning_rate": 4.109277578143779e-06, + "loss": 0.2717, + "step": 325 + }, + { + "epoch": 0.9261363636363636, + "grad_norm": 0.3603610666299997, + "learning_rate": 4.10333657533858e-06, + "loss": 0.2783, + "step": 326 + }, + { + "epoch": 0.9289772727272727, + "grad_norm": 0.3901080384564019, + "learning_rate": 4.097380151793681e-06, + "loss": 0.286, + "step": 327 + }, + { + "epoch": 0.9318181818181818, + "grad_norm": 0.3598672604385207, + "learning_rate": 4.0914083647974025e-06, + "loss": 0.3375, + "step": 328 + }, + { + "epoch": 0.9346590909090909, + "grad_norm": 0.32775404856254314, + "learning_rate": 4.085421271785824e-06, + "loss": 0.2904, + "step": 329 + }, + { + "epoch": 0.9375, + "grad_norm": 0.29442351680114387, + "learning_rate": 4.079418930342243e-06, + "loss": 0.2629, + "step": 330 + }, + { + "epoch": 0.9403409090909091, + "grad_norm": 0.4405796100351076, + "learning_rate": 4.0734013981966125e-06, + "loss": 0.3665, + "step": 331 + }, + { + "epoch": 0.9431818181818182, + "grad_norm": 0.3334068109525356, + "learning_rate": 4.0673687332249866e-06, + "loss": 0.3079, + "step": 332 + }, + { + "epoch": 0.9460227272727273, + "grad_norm": 0.32669985590044703, + "learning_rate": 4.061320993448968e-06, + "loss": 0.2904, + "step": 333 + }, + { + "epoch": 0.9488636363636364, + "grad_norm": 0.3442146928076968, + "learning_rate": 4.055258237035146e-06, + "loss": 0.3146, + "step": 334 + }, + { + "epoch": 0.9517045454545454, + "grad_norm": 0.4309052746676042, + "learning_rate": 4.04918052229454e-06, + "loss": 0.3446, + "step": 335 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 0.35908542610160016, + "learning_rate": 4.043087907682035e-06, + "loss": 0.2534, + "step": 336 + }, + { + "epoch": 0.9573863636363636, + "grad_norm": 0.3894188962377372, + "learning_rate": 4.036980451795822e-06, + "loss": 0.3262, + "step": 337 + }, + { + "epoch": 0.9602272727272727, + "grad_norm": 0.37392061032103363, + "learning_rate": 4.030858213376838e-06, + "loss": 0.3158, + "step": 338 + }, + { + "epoch": 0.9630681818181818, + "grad_norm": 0.3880624083667109, + "learning_rate": 4.02472125130819e-06, + "loss": 0.2908, + "step": 339 + }, + { + "epoch": 0.9659090909090909, + "grad_norm": 0.4031632690009814, + "learning_rate": 4.018569624614602e-06, + "loss": 0.3279, + "step": 340 + }, + { + "epoch": 0.96875, + "grad_norm": 0.38583919245780574, + "learning_rate": 4.012403392461837e-06, + "loss": 0.2657, + "step": 341 + }, + { + "epoch": 0.9715909090909091, + "grad_norm": 0.4657940346556613, + "learning_rate": 4.006222614156132e-06, + "loss": 0.3176, + "step": 342 + }, + { + "epoch": 0.9744318181818182, + "grad_norm": 0.28406132307929355, + "learning_rate": 4.000027349143633e-06, + "loss": 0.2261, + "step": 343 + }, + { + "epoch": 0.9772727272727273, + "grad_norm": 0.3809447081607224, + "learning_rate": 3.993817657009808e-06, + "loss": 0.291, + "step": 344 + }, + { + "epoch": 0.9801136363636364, + "grad_norm": 0.37276416289236974, + "learning_rate": 3.987593597478894e-06, + "loss": 0.3229, + "step": 345 + }, + { + "epoch": 0.9829545454545454, + "grad_norm": 0.36213806018136363, + "learning_rate": 3.981355230413305e-06, + "loss": 0.2785, + "step": 346 + }, + { + "epoch": 0.9857954545454546, + "grad_norm": 0.3774008729788378, + "learning_rate": 3.975102615813068e-06, + "loss": 0.272, + "step": 347 + }, + { + "epoch": 0.9886363636363636, + "grad_norm": 0.3268419464248498, + "learning_rate": 3.968835813815236e-06, + "loss": 0.2468, + "step": 348 + }, + { + "epoch": 0.9914772727272727, + "grad_norm": 0.401670934547313, + "learning_rate": 3.962554884693323e-06, + "loss": 0.2953, + "step": 349 + }, + { + "epoch": 0.9943181818181818, + "grad_norm": 0.40169610324443583, + "learning_rate": 3.956259888856708e-06, + "loss": 0.2939, + "step": 350 + }, + { + "epoch": 0.9971590909090909, + "grad_norm": 0.2891600640815435, + "learning_rate": 3.949950886850069e-06, + "loss": 0.2805, + "step": 351 + }, + { + "epoch": 1.0, + "grad_norm": 0.3279215818041681, + "learning_rate": 3.943627939352789e-06, + "loss": 0.2598, + "step": 352 + }, + { + "epoch": 1.0028409090909092, + "grad_norm": 0.3533913319935541, + "learning_rate": 3.9372911071783805e-06, + "loss": 0.2673, + "step": 353 + }, + { + "epoch": 1.0056818181818181, + "grad_norm": 0.38416565428145066, + "learning_rate": 3.930940451273898e-06, + "loss": 0.2933, + "step": 354 + }, + { + "epoch": 1.0085227272727273, + "grad_norm": 0.41220420942768127, + "learning_rate": 3.924576032719349e-06, + "loss": 0.2952, + "step": 355 + }, + { + "epoch": 1.0113636363636365, + "grad_norm": 0.4096268298831798, + "learning_rate": 3.9181979127271076e-06, + "loss": 0.2575, + "step": 356 + }, + { + "epoch": 1.0142045454545454, + "grad_norm": 0.45379315898269595, + "learning_rate": 3.911806152641333e-06, + "loss": 0.2717, + "step": 357 + }, + { + "epoch": 1.0170454545454546, + "grad_norm": 0.32770827000624236, + "learning_rate": 3.9054008139373675e-06, + "loss": 0.266, + "step": 358 + }, + { + "epoch": 1.0198863636363635, + "grad_norm": 0.2965104343367262, + "learning_rate": 3.8989819582211555e-06, + "loss": 0.2548, + "step": 359 + }, + { + "epoch": 1.0227272727272727, + "grad_norm": 0.4054461782711258, + "learning_rate": 3.892549647228642e-06, + "loss": 0.3398, + "step": 360 + }, + { + "epoch": 1.0255681818181819, + "grad_norm": 0.39022556113460055, + "learning_rate": 3.886103942825189e-06, + "loss": 0.2826, + "step": 361 + }, + { + "epoch": 1.0284090909090908, + "grad_norm": 0.3374532413491821, + "learning_rate": 3.879644907004972e-06, + "loss": 0.2644, + "step": 362 + }, + { + "epoch": 1.03125, + "grad_norm": 0.337718457045594, + "learning_rate": 3.873172601890386e-06, + "loss": 0.2545, + "step": 363 + }, + { + "epoch": 1.0340909090909092, + "grad_norm": 0.3729922751436951, + "learning_rate": 3.86668708973145e-06, + "loss": 0.2951, + "step": 364 + }, + { + "epoch": 1.0369318181818181, + "grad_norm": 0.31238473142978845, + "learning_rate": 3.860188432905209e-06, + "loss": 0.2537, + "step": 365 + }, + { + "epoch": 1.0397727272727273, + "grad_norm": 0.37350151083829397, + "learning_rate": 3.853676693915129e-06, + "loss": 0.2614, + "step": 366 + }, + { + "epoch": 1.0426136363636365, + "grad_norm": 0.3575634359205247, + "learning_rate": 3.8471519353905025e-06, + "loss": 0.2437, + "step": 367 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 0.3537757819725644, + "learning_rate": 3.840614220085837e-06, + "loss": 0.2747, + "step": 368 + }, + { + "epoch": 1.0482954545454546, + "grad_norm": 0.34943668518465093, + "learning_rate": 3.834063610880263e-06, + "loss": 0.2844, + "step": 369 + }, + { + "epoch": 1.0511363636363635, + "grad_norm": 0.32611370130766987, + "learning_rate": 3.827500170776921e-06, + "loss": 0.2578, + "step": 370 + }, + { + "epoch": 1.0539772727272727, + "grad_norm": 0.29743321074762596, + "learning_rate": 3.8209239629023565e-06, + "loss": 0.2361, + "step": 371 + }, + { + "epoch": 1.0568181818181819, + "grad_norm": 0.3317934285561481, + "learning_rate": 3.814335050505916e-06, + "loss": 0.2645, + "step": 372 + }, + { + "epoch": 1.0596590909090908, + "grad_norm": 0.40729226447208133, + "learning_rate": 3.8077334969591377e-06, + "loss": 0.2929, + "step": 373 + }, + { + "epoch": 1.0625, + "grad_norm": 0.35583822537265253, + "learning_rate": 3.801119365755138e-06, + "loss": 0.3036, + "step": 374 + }, + { + "epoch": 1.0653409090909092, + "grad_norm": 0.47116931222172215, + "learning_rate": 3.7944927205080073e-06, + "loss": 0.2962, + "step": 375 + }, + { + "epoch": 1.0681818181818181, + "grad_norm": 0.4620500786524589, + "learning_rate": 3.7878536249521935e-06, + "loss": 0.3186, + "step": 376 + }, + { + "epoch": 1.0710227272727273, + "grad_norm": 0.4310223125222202, + "learning_rate": 3.7812021429418886e-06, + "loss": 0.305, + "step": 377 + }, + { + "epoch": 1.0738636363636365, + "grad_norm": 0.35860375920691345, + "learning_rate": 3.77453833845042e-06, + "loss": 0.3124, + "step": 378 + }, + { + "epoch": 1.0767045454545454, + "grad_norm": 0.40493909967111513, + "learning_rate": 3.7678622755696292e-06, + "loss": 0.2649, + "step": 379 + }, + { + "epoch": 1.0795454545454546, + "grad_norm": 0.3699164344949677, + "learning_rate": 3.7611740185092587e-06, + "loss": 0.3346, + "step": 380 + }, + { + "epoch": 1.0823863636363635, + "grad_norm": 0.5931781411606138, + "learning_rate": 3.754473631596332e-06, + "loss": 0.2729, + "step": 381 + }, + { + "epoch": 1.0852272727272727, + "grad_norm": 0.3122039055630976, + "learning_rate": 3.7477611792745384e-06, + "loss": 0.2816, + "step": 382 + }, + { + "epoch": 1.0880681818181819, + "grad_norm": 0.35273556528651445, + "learning_rate": 3.7410367261036094e-06, + "loss": 0.2765, + "step": 383 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.361323677115818, + "learning_rate": 3.7343003367587e-06, + "loss": 0.2831, + "step": 384 + }, + { + "epoch": 1.09375, + "grad_norm": 0.3776789429609578, + "learning_rate": 3.727552076029767e-06, + "loss": 0.3006, + "step": 385 + }, + { + "epoch": 1.0965909090909092, + "grad_norm": 0.4049848534001206, + "learning_rate": 3.7207920088209454e-06, + "loss": 0.3213, + "step": 386 + }, + { + "epoch": 1.0994318181818181, + "grad_norm": 0.3541711790485223, + "learning_rate": 3.7140202001499214e-06, + "loss": 0.2902, + "step": 387 + }, + { + "epoch": 1.1022727272727273, + "grad_norm": 0.3501668624619801, + "learning_rate": 3.707236715147312e-06, + "loss": 0.2809, + "step": 388 + }, + { + "epoch": 1.1051136363636365, + "grad_norm": 0.38321621491594765, + "learning_rate": 3.700441619056035e-06, + "loss": 0.3163, + "step": 389 + }, + { + "epoch": 1.1079545454545454, + "grad_norm": 0.4044457614031915, + "learning_rate": 3.693634977230681e-06, + "loss": 0.2862, + "step": 390 + }, + { + "epoch": 1.1107954545454546, + "grad_norm": 0.40951036198359486, + "learning_rate": 3.686816855136891e-06, + "loss": 0.28, + "step": 391 + }, + { + "epoch": 1.1136363636363635, + "grad_norm": 0.29410719311103134, + "learning_rate": 3.679987318350717e-06, + "loss": 0.2299, + "step": 392 + }, + { + "epoch": 1.1164772727272727, + "grad_norm": 0.3819079818809885, + "learning_rate": 3.673146432557998e-06, + "loss": 0.32, + "step": 393 + }, + { + "epoch": 1.1193181818181819, + "grad_norm": 0.3628245440460693, + "learning_rate": 3.666294263553729e-06, + "loss": 0.2724, + "step": 394 + }, + { + "epoch": 1.1221590909090908, + "grad_norm": 0.34928329721642853, + "learning_rate": 3.659430877241423e-06, + "loss": 0.248, + "step": 395 + }, + { + "epoch": 1.125, + "grad_norm": 0.442925717620733, + "learning_rate": 3.6525563396324826e-06, + "loss": 0.2942, + "step": 396 + }, + { + "epoch": 1.1278409090909092, + "grad_norm": 0.4525323331322651, + "learning_rate": 3.6456707168455584e-06, + "loss": 0.3258, + "step": 397 + }, + { + "epoch": 1.1306818181818181, + "grad_norm": 0.39153087965891287, + "learning_rate": 3.6387740751059218e-06, + "loss": 0.3072, + "step": 398 + }, + { + "epoch": 1.1335227272727273, + "grad_norm": 0.3886102447660378, + "learning_rate": 3.6318664807448218e-06, + "loss": 0.3415, + "step": 399 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.3642339507412296, + "learning_rate": 3.6249480001988463e-06, + "loss": 0.2691, + "step": 400 + }, + { + "epoch": 1.1392045454545454, + "grad_norm": 0.3380651370346197, + "learning_rate": 3.6180187000092894e-06, + "loss": 0.2791, + "step": 401 + }, + { + "epoch": 1.1420454545454546, + "grad_norm": 0.3193544491508243, + "learning_rate": 3.611078646821505e-06, + "loss": 0.2326, + "step": 402 + }, + { + "epoch": 1.1448863636363638, + "grad_norm": 0.30524333443799656, + "learning_rate": 3.6041279073842684e-06, + "loss": 0.2489, + "step": 403 + }, + { + "epoch": 1.1477272727272727, + "grad_norm": 0.39683144371135337, + "learning_rate": 3.597166548549136e-06, + "loss": 0.2656, + "step": 404 + }, + { + "epoch": 1.1505681818181819, + "grad_norm": 0.39975422805218463, + "learning_rate": 3.590194637269798e-06, + "loss": 0.2823, + "step": 405 + }, + { + "epoch": 1.1534090909090908, + "grad_norm": 0.3781718281788356, + "learning_rate": 3.5832122406014398e-06, + "loss": 0.2545, + "step": 406 + }, + { + "epoch": 1.15625, + "grad_norm": 0.39633632407524205, + "learning_rate": 3.576219425700092e-06, + "loss": 0.2656, + "step": 407 + }, + { + "epoch": 1.1590909090909092, + "grad_norm": 0.503126670284463, + "learning_rate": 3.5692162598219877e-06, + "loss": 0.3106, + "step": 408 + }, + { + "epoch": 1.1619318181818181, + "grad_norm": 0.3803993289484403, + "learning_rate": 3.5622028103229154e-06, + "loss": 0.2777, + "step": 409 + }, + { + "epoch": 1.1647727272727273, + "grad_norm": 0.32896270814306483, + "learning_rate": 3.555179144657568e-06, + "loss": 0.2681, + "step": 410 + }, + { + "epoch": 1.1676136363636362, + "grad_norm": 0.45079184347220275, + "learning_rate": 3.548145330378901e-06, + "loss": 0.298, + "step": 411 + }, + { + "epoch": 1.1704545454545454, + "grad_norm": 0.3409745563125651, + "learning_rate": 3.5411014351374735e-06, + "loss": 0.2829, + "step": 412 + }, + { + "epoch": 1.1732954545454546, + "grad_norm": 0.3524051821269997, + "learning_rate": 3.5340475266808046e-06, + "loss": 0.2897, + "step": 413 + }, + { + "epoch": 1.1761363636363638, + "grad_norm": 0.31354296956532873, + "learning_rate": 3.5269836728527194e-06, + "loss": 0.2512, + "step": 414 + }, + { + "epoch": 1.1789772727272727, + "grad_norm": 0.2819333444591201, + "learning_rate": 3.5199099415926985e-06, + "loss": 0.2336, + "step": 415 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 0.3667062945127836, + "learning_rate": 3.5128264009352177e-06, + "loss": 0.2797, + "step": 416 + }, + { + "epoch": 1.1846590909090908, + "grad_norm": 0.3717065816803459, + "learning_rate": 3.5057331190091036e-06, + "loss": 0.2625, + "step": 417 + }, + { + "epoch": 1.1875, + "grad_norm": 0.34247191523071263, + "learning_rate": 3.4986301640368726e-06, + "loss": 0.2915, + "step": 418 + }, + { + "epoch": 1.1903409090909092, + "grad_norm": 0.28055115946196074, + "learning_rate": 3.4915176043340726e-06, + "loss": 0.2323, + "step": 419 + }, + { + "epoch": 1.1931818181818181, + "grad_norm": 0.3512617852047132, + "learning_rate": 3.4843955083086315e-06, + "loss": 0.276, + "step": 420 + }, + { + "epoch": 1.1960227272727273, + "grad_norm": 0.3402592655838616, + "learning_rate": 3.477263944460196e-06, + "loss": 0.258, + "step": 421 + }, + { + "epoch": 1.1988636363636362, + "grad_norm": 0.3440775197912379, + "learning_rate": 3.4701229813794744e-06, + "loss": 0.2686, + "step": 422 + }, + { + "epoch": 1.2017045454545454, + "grad_norm": 0.32159613738142184, + "learning_rate": 3.4629726877475733e-06, + "loss": 0.2775, + "step": 423 + }, + { + "epoch": 1.2045454545454546, + "grad_norm": 0.3405153808986929, + "learning_rate": 3.4558131323353423e-06, + "loss": 0.2947, + "step": 424 + }, + { + "epoch": 1.2073863636363638, + "grad_norm": 0.4111884872726661, + "learning_rate": 3.4486443840027084e-06, + "loss": 0.2427, + "step": 425 + }, + { + "epoch": 1.2102272727272727, + "grad_norm": 0.38692560086654654, + "learning_rate": 3.4414665116980167e-06, + "loss": 0.3084, + "step": 426 + }, + { + "epoch": 1.2130681818181819, + "grad_norm": 0.4000466884476275, + "learning_rate": 3.4342795844573634e-06, + "loss": 0.2933, + "step": 427 + }, + { + "epoch": 1.2159090909090908, + "grad_norm": 0.3605831840618787, + "learning_rate": 3.427083671403937e-06, + "loss": 0.2892, + "step": 428 + }, + { + "epoch": 1.21875, + "grad_norm": 0.3225439729294941, + "learning_rate": 3.4198788417473485e-06, + "loss": 0.2579, + "step": 429 + }, + { + "epoch": 1.2215909090909092, + "grad_norm": 0.3869565428112392, + "learning_rate": 3.41266516478297e-06, + "loss": 0.3349, + "step": 430 + }, + { + "epoch": 1.2244318181818181, + "grad_norm": 0.3790938940448294, + "learning_rate": 3.4054427098912636e-06, + "loss": 0.2836, + "step": 431 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 0.33485764653621325, + "learning_rate": 3.3982115465371185e-06, + "loss": 0.2465, + "step": 432 + }, + { + "epoch": 1.2301136363636362, + "grad_norm": 0.3421027182025914, + "learning_rate": 3.390971744269181e-06, + "loss": 0.2436, + "step": 433 + }, + { + "epoch": 1.2329545454545454, + "grad_norm": 0.3343569283936874, + "learning_rate": 3.3837233727191856e-06, + "loss": 0.2533, + "step": 434 + }, + { + "epoch": 1.2357954545454546, + "grad_norm": 0.3490337805677148, + "learning_rate": 3.3764665016012842e-06, + "loss": 0.2401, + "step": 435 + }, + { + "epoch": 1.2386363636363638, + "grad_norm": 0.3116736362955648, + "learning_rate": 3.3692012007113776e-06, + "loss": 0.2482, + "step": 436 + }, + { + "epoch": 1.2414772727272727, + "grad_norm": 0.3963218536576595, + "learning_rate": 3.3619275399264444e-06, + "loss": 0.2944, + "step": 437 + }, + { + "epoch": 1.2443181818181819, + "grad_norm": 0.39432480274886955, + "learning_rate": 3.3546455892038666e-06, + "loss": 0.2918, + "step": 438 + }, + { + "epoch": 1.2471590909090908, + "grad_norm": 0.3775480283393243, + "learning_rate": 3.3473554185807573e-06, + "loss": 0.2771, + "step": 439 + }, + { + "epoch": 1.25, + "grad_norm": 0.34490450741107803, + "learning_rate": 3.340057098173288e-06, + "loss": 0.2756, + "step": 440 + }, + { + "epoch": 1.2528409090909092, + "grad_norm": 0.3324905873722346, + "learning_rate": 3.3327506981760183e-06, + "loss": 0.2608, + "step": 441 + }, + { + "epoch": 1.2556818181818181, + "grad_norm": 0.47138267546166734, + "learning_rate": 3.32543628886121e-06, + "loss": 0.3077, + "step": 442 + }, + { + "epoch": 1.2585227272727273, + "grad_norm": 0.2953842775844083, + "learning_rate": 3.3181139405781616e-06, + "loss": 0.2377, + "step": 443 + }, + { + "epoch": 1.2613636363636362, + "grad_norm": 0.3612627525520785, + "learning_rate": 3.3107837237525274e-06, + "loss": 0.2427, + "step": 444 + }, + { + "epoch": 1.2642045454545454, + "grad_norm": 0.3653963278501932, + "learning_rate": 3.3034457088856396e-06, + "loss": 0.2559, + "step": 445 + }, + { + "epoch": 1.2670454545454546, + "grad_norm": 0.3129568330696853, + "learning_rate": 3.2960999665538335e-06, + "loss": 0.2534, + "step": 446 + }, + { + "epoch": 1.2698863636363638, + "grad_norm": 0.3510947430261117, + "learning_rate": 3.288746567407763e-06, + "loss": 0.2502, + "step": 447 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.3437157582636368, + "learning_rate": 3.281385582171727e-06, + "loss": 0.2525, + "step": 448 + }, + { + "epoch": 1.2755681818181819, + "grad_norm": 0.3888446263801318, + "learning_rate": 3.274017081642986e-06, + "loss": 0.2885, + "step": 449 + }, + { + "epoch": 1.2784090909090908, + "grad_norm": 0.35942811400817226, + "learning_rate": 3.2666411366910827e-06, + "loss": 0.2571, + "step": 450 + }, + { + "epoch": 1.28125, + "grad_norm": 0.41674090701769867, + "learning_rate": 3.2592578182571583e-06, + "loss": 0.2973, + "step": 451 + }, + { + "epoch": 1.2840909090909092, + "grad_norm": 0.3702323179560626, + "learning_rate": 3.2518671973532704e-06, + "loss": 0.2415, + "step": 452 + }, + { + "epoch": 1.2869318181818181, + "grad_norm": 0.36007563550430505, + "learning_rate": 3.244469345061715e-06, + "loss": 0.2277, + "step": 453 + }, + { + "epoch": 1.2897727272727273, + "grad_norm": 0.3914691699646844, + "learning_rate": 3.237064332534336e-06, + "loss": 0.2828, + "step": 454 + }, + { + "epoch": 1.2926136363636362, + "grad_norm": 0.3522104855581335, + "learning_rate": 3.229652230991843e-06, + "loss": 0.2671, + "step": 455 + }, + { + "epoch": 1.2954545454545454, + "grad_norm": 0.3553148108185653, + "learning_rate": 3.2222331117231283e-06, + "loss": 0.2817, + "step": 456 + }, + { + "epoch": 1.2982954545454546, + "grad_norm": 0.3771227330111479, + "learning_rate": 3.2148070460845814e-06, + "loss": 0.274, + "step": 457 + }, + { + "epoch": 1.3011363636363638, + "grad_norm": 0.41388528735027136, + "learning_rate": 3.2073741054994e-06, + "loss": 0.3181, + "step": 458 + }, + { + "epoch": 1.3039772727272727, + "grad_norm": 0.33865063205260826, + "learning_rate": 3.199934361456903e-06, + "loss": 0.2634, + "step": 459 + }, + { + "epoch": 1.3068181818181819, + "grad_norm": 0.3520115660135833, + "learning_rate": 3.1924878855118475e-06, + "loss": 0.2618, + "step": 460 + }, + { + "epoch": 1.3096590909090908, + "grad_norm": 0.40034402955639337, + "learning_rate": 3.185034749283734e-06, + "loss": 0.2837, + "step": 461 + }, + { + "epoch": 1.3125, + "grad_norm": 0.34422942980117177, + "learning_rate": 3.1775750244561233e-06, + "loss": 0.2638, + "step": 462 + }, + { + "epoch": 1.3153409090909092, + "grad_norm": 0.38963794033279253, + "learning_rate": 3.1701087827759434e-06, + "loss": 0.294, + "step": 463 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 0.4262376192411251, + "learning_rate": 3.162636096052803e-06, + "loss": 0.3342, + "step": 464 + }, + { + "epoch": 1.3210227272727273, + "grad_norm": 0.38196782588004025, + "learning_rate": 3.155157036158295e-06, + "loss": 0.281, + "step": 465 + }, + { + "epoch": 1.3238636363636362, + "grad_norm": 0.39128577037723217, + "learning_rate": 3.147671675025313e-06, + "loss": 0.2864, + "step": 466 + }, + { + "epoch": 1.3267045454545454, + "grad_norm": 0.3622238856754979, + "learning_rate": 3.1401800846473506e-06, + "loss": 0.2742, + "step": 467 + }, + { + "epoch": 1.3295454545454546, + "grad_norm": 0.3187408313823274, + "learning_rate": 3.132682337077818e-06, + "loss": 0.2549, + "step": 468 + }, + { + "epoch": 1.3323863636363638, + "grad_norm": 0.33256196577073566, + "learning_rate": 3.1251785044293425e-06, + "loss": 0.2921, + "step": 469 + }, + { + "epoch": 1.3352272727272727, + "grad_norm": 0.377119549478706, + "learning_rate": 3.117668658873078e-06, + "loss": 0.2722, + "step": 470 + }, + { + "epoch": 1.3380681818181819, + "grad_norm": 0.31419013026351733, + "learning_rate": 3.1101528726380085e-06, + "loss": 0.2519, + "step": 471 + }, + { + "epoch": 1.3409090909090908, + "grad_norm": 0.3471415869479363, + "learning_rate": 3.102631218010257e-06, + "loss": 0.2817, + "step": 472 + }, + { + "epoch": 1.34375, + "grad_norm": 0.37953158089107286, + "learning_rate": 3.0951037673323863e-06, + "loss": 0.2642, + "step": 473 + }, + { + "epoch": 1.3465909090909092, + "grad_norm": 0.34488245509452714, + "learning_rate": 3.0875705930027065e-06, + "loss": 0.2499, + "step": 474 + }, + { + "epoch": 1.3494318181818181, + "grad_norm": 0.29818790329911665, + "learning_rate": 3.0800317674745755e-06, + "loss": 0.2572, + "step": 475 + }, + { + "epoch": 1.3522727272727273, + "grad_norm": 0.35582979006101406, + "learning_rate": 3.0724873632557068e-06, + "loss": 0.2806, + "step": 476 + }, + { + "epoch": 1.3551136363636362, + "grad_norm": 0.3886707765043663, + "learning_rate": 3.064937452907465e-06, + "loss": 0.2395, + "step": 477 + }, + { + "epoch": 1.3579545454545454, + "grad_norm": 0.39452409132776717, + "learning_rate": 3.057382109044177e-06, + "loss": 0.2748, + "step": 478 + }, + { + "epoch": 1.3607954545454546, + "grad_norm": 0.34362558608870675, + "learning_rate": 3.049821404332424e-06, + "loss": 0.2664, + "step": 479 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.3923547533127044, + "learning_rate": 3.0422554114903514e-06, + "loss": 0.3134, + "step": 480 + }, + { + "epoch": 1.3664772727272727, + "grad_norm": 0.42311598203108824, + "learning_rate": 3.0346842032869624e-06, + "loss": 0.3227, + "step": 481 + }, + { + "epoch": 1.3693181818181819, + "grad_norm": 0.49341501720924236, + "learning_rate": 3.0271078525414234e-06, + "loss": 0.2789, + "step": 482 + }, + { + "epoch": 1.3721590909090908, + "grad_norm": 0.3923870792288359, + "learning_rate": 3.0195264321223584e-06, + "loss": 0.3003, + "step": 483 + }, + { + "epoch": 1.375, + "grad_norm": 0.5047411107384405, + "learning_rate": 3.0119400149471535e-06, + "loss": 0.2835, + "step": 484 + }, + { + "epoch": 1.3778409090909092, + "grad_norm": 0.3431083613633404, + "learning_rate": 3.004348673981252e-06, + "loss": 0.2744, + "step": 485 + }, + { + "epoch": 1.3806818181818181, + "grad_norm": 0.3370392701002557, + "learning_rate": 2.996752482237456e-06, + "loss": 0.2503, + "step": 486 + }, + { + "epoch": 1.3835227272727273, + "grad_norm": 0.35789574905836263, + "learning_rate": 2.9891515127752172e-06, + "loss": 0.2558, + "step": 487 + }, + { + "epoch": 1.3863636363636362, + "grad_norm": 0.39542709664531145, + "learning_rate": 2.981545838699943e-06, + "loss": 0.2499, + "step": 488 + }, + { + "epoch": 1.3892045454545454, + "grad_norm": 0.4799271866705037, + "learning_rate": 2.9739355331622886e-06, + "loss": 0.2845, + "step": 489 + }, + { + "epoch": 1.3920454545454546, + "grad_norm": 0.30250300604212543, + "learning_rate": 2.966320669357453e-06, + "loss": 0.2428, + "step": 490 + }, + { + "epoch": 1.3948863636363638, + "grad_norm": 0.27928557627455064, + "learning_rate": 2.9587013205244767e-06, + "loss": 0.2354, + "step": 491 + }, + { + "epoch": 1.3977272727272727, + "grad_norm": 0.3254689902299252, + "learning_rate": 2.951077559945538e-06, + "loss": 0.2719, + "step": 492 + }, + { + "epoch": 1.4005681818181819, + "grad_norm": 0.38918459975286523, + "learning_rate": 2.943449460945244e-06, + "loss": 0.2726, + "step": 493 + }, + { + "epoch": 1.4034090909090908, + "grad_norm": 0.29871192903714955, + "learning_rate": 2.9358170968899323e-06, + "loss": 0.263, + "step": 494 + }, + { + "epoch": 1.40625, + "grad_norm": 0.3943630183447143, + "learning_rate": 2.9281805411869573e-06, + "loss": 0.2931, + "step": 495 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 0.34932644595142737, + "learning_rate": 2.920539867283992e-06, + "loss": 0.2577, + "step": 496 + }, + { + "epoch": 1.4119318181818181, + "grad_norm": 0.36296363929883135, + "learning_rate": 2.9128951486683144e-06, + "loss": 0.2884, + "step": 497 + }, + { + "epoch": 1.4147727272727273, + "grad_norm": 0.3536090241186941, + "learning_rate": 2.9052464588661076e-06, + "loss": 0.2518, + "step": 498 + }, + { + "epoch": 1.4176136363636362, + "grad_norm": 0.4071123114766137, + "learning_rate": 2.8975938714417466e-06, + "loss": 0.2955, + "step": 499 + }, + { + "epoch": 1.4204545454545454, + "grad_norm": 0.36319240545094117, + "learning_rate": 2.8899374599970943e-06, + "loss": 0.2933, + "step": 500 + }, + { + "epoch": 1.4232954545454546, + "grad_norm": 0.33541538203913807, + "learning_rate": 2.882277298170792e-06, + "loss": 0.2693, + "step": 501 + }, + { + "epoch": 1.4261363636363638, + "grad_norm": 0.42293889077814073, + "learning_rate": 2.8746134596375534e-06, + "loss": 0.2907, + "step": 502 + }, + { + "epoch": 1.4289772727272727, + "grad_norm": 0.3702782961580686, + "learning_rate": 2.866946018107453e-06, + "loss": 0.2701, + "step": 503 + }, + { + "epoch": 1.4318181818181819, + "grad_norm": 0.3454390175085058, + "learning_rate": 2.8592750473252197e-06, + "loss": 0.2612, + "step": 504 + }, + { + "epoch": 1.4346590909090908, + "grad_norm": 0.33107307095308464, + "learning_rate": 2.8516006210695244e-06, + "loss": 0.239, + "step": 505 + }, + { + "epoch": 1.4375, + "grad_norm": 0.3569062909249772, + "learning_rate": 2.843922813152275e-06, + "loss": 0.2755, + "step": 506 + }, + { + "epoch": 1.4403409090909092, + "grad_norm": 0.37131837135922086, + "learning_rate": 2.836241697417902e-06, + "loss": 0.2623, + "step": 507 + }, + { + "epoch": 1.4431818181818181, + "grad_norm": 0.3699557028893426, + "learning_rate": 2.8285573477426504e-06, + "loss": 0.2811, + "step": 508 + }, + { + "epoch": 1.4460227272727273, + "grad_norm": 0.33561480648358855, + "learning_rate": 2.820869838033871e-06, + "loss": 0.2686, + "step": 509 + }, + { + "epoch": 1.4488636363636362, + "grad_norm": 0.4711840304366533, + "learning_rate": 2.813179242229304e-06, + "loss": 0.2946, + "step": 510 + }, + { + "epoch": 1.4517045454545454, + "grad_norm": 0.382672820843295, + "learning_rate": 2.805485634296374e-06, + "loss": 0.2945, + "step": 511 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.3264806302650397, + "learning_rate": 2.7977890882314763e-06, + "loss": 0.2658, + "step": 512 + }, + { + "epoch": 1.4573863636363638, + "grad_norm": 0.3590459125833833, + "learning_rate": 2.7900896780592616e-06, + "loss": 0.2675, + "step": 513 + }, + { + "epoch": 1.4602272727272727, + "grad_norm": 0.41777977412669154, + "learning_rate": 2.7823874778319316e-06, + "loss": 0.3133, + "step": 514 + }, + { + "epoch": 1.4630681818181819, + "grad_norm": 0.3700743186678299, + "learning_rate": 2.774682561628519e-06, + "loss": 0.2781, + "step": 515 + }, + { + "epoch": 1.4659090909090908, + "grad_norm": 0.3586139592020702, + "learning_rate": 2.7669750035541798e-06, + "loss": 0.2709, + "step": 516 + }, + { + "epoch": 1.46875, + "grad_norm": 0.32668952210259083, + "learning_rate": 2.759264877739481e-06, + "loss": 0.2628, + "step": 517 + }, + { + "epoch": 1.4715909090909092, + "grad_norm": 0.3304970455370839, + "learning_rate": 2.7515522583396825e-06, + "loss": 0.2859, + "step": 518 + }, + { + "epoch": 1.4744318181818181, + "grad_norm": 0.3188860297893081, + "learning_rate": 2.74383721953403e-06, + "loss": 0.2435, + "step": 519 + }, + { + "epoch": 1.4772727272727273, + "grad_norm": 0.3701340525867732, + "learning_rate": 2.736119835525037e-06, + "loss": 0.2571, + "step": 520 + }, + { + "epoch": 1.4801136363636362, + "grad_norm": 0.4888553988204271, + "learning_rate": 2.728400180537775e-06, + "loss": 0.2956, + "step": 521 + }, + { + "epoch": 1.4829545454545454, + "grad_norm": 0.4110586830757001, + "learning_rate": 2.720678328819155e-06, + "loss": 0.2396, + "step": 522 + }, + { + "epoch": 1.4857954545454546, + "grad_norm": 0.3828799651532281, + "learning_rate": 2.712954354637218e-06, + "loss": 0.2701, + "step": 523 + }, + { + "epoch": 1.4886363636363638, + "grad_norm": 0.359763211121689, + "learning_rate": 2.705228332280418e-06, + "loss": 0.2387, + "step": 524 + }, + { + "epoch": 1.4914772727272727, + "grad_norm": 0.3785795319364518, + "learning_rate": 2.6975003360569087e-06, + "loss": 0.2761, + "step": 525 + }, + { + "epoch": 1.4943181818181819, + "grad_norm": 0.34255573500581615, + "learning_rate": 2.689770440293825e-06, + "loss": 0.267, + "step": 526 + }, + { + "epoch": 1.4971590909090908, + "grad_norm": 0.37025650452574843, + "learning_rate": 2.6820387193365764e-06, + "loss": 0.2781, + "step": 527 + }, + { + "epoch": 1.5, + "grad_norm": 0.35002281689988746, + "learning_rate": 2.674305247548125e-06, + "loss": 0.2947, + "step": 528 + }, + { + "epoch": 1.5028409090909092, + "grad_norm": 0.34143779580523753, + "learning_rate": 2.6665700993082705e-06, + "loss": 0.2658, + "step": 529 + }, + { + "epoch": 1.5056818181818183, + "grad_norm": 0.3560924867441854, + "learning_rate": 2.6588333490129376e-06, + "loss": 0.2742, + "step": 530 + }, + { + "epoch": 1.5085227272727273, + "grad_norm": 0.32295396334903814, + "learning_rate": 2.65109507107346e-06, + "loss": 0.2382, + "step": 531 + }, + { + "epoch": 1.5113636363636362, + "grad_norm": 0.33859114158227865, + "learning_rate": 2.6433553399158652e-06, + "loss": 0.2937, + "step": 532 + }, + { + "epoch": 1.5142045454545454, + "grad_norm": 0.35244369608972004, + "learning_rate": 2.6356142299801544e-06, + "loss": 0.3037, + "step": 533 + }, + { + "epoch": 1.5170454545454546, + "grad_norm": 0.3336662584141403, + "learning_rate": 2.6278718157195924e-06, + "loss": 0.2844, + "step": 534 + }, + { + "epoch": 1.5198863636363638, + "grad_norm": 0.35862845558521106, + "learning_rate": 2.620128171599989e-06, + "loss": 0.246, + "step": 535 + }, + { + "epoch": 1.5227272727272727, + "grad_norm": 0.31358277794725126, + "learning_rate": 2.6123833720989796e-06, + "loss": 0.2653, + "step": 536 + }, + { + "epoch": 1.5255681818181817, + "grad_norm": 0.36029376106362876, + "learning_rate": 2.6046374917053156e-06, + "loss": 0.2785, + "step": 537 + }, + { + "epoch": 1.5284090909090908, + "grad_norm": 0.3512123146788697, + "learning_rate": 2.5968906049181425e-06, + "loss": 0.2723, + "step": 538 + }, + { + "epoch": 1.53125, + "grad_norm": 0.35559911829983626, + "learning_rate": 2.5891427862462853e-06, + "loss": 0.2939, + "step": 539 + }, + { + "epoch": 1.5340909090909092, + "grad_norm": 0.3774459233336894, + "learning_rate": 2.581394110207532e-06, + "loss": 0.2593, + "step": 540 + }, + { + "epoch": 1.5369318181818183, + "grad_norm": 0.3213295704503383, + "learning_rate": 2.5736446513279166e-06, + "loss": 0.2615, + "step": 541 + }, + { + "epoch": 1.5397727272727273, + "grad_norm": 0.33894998490392014, + "learning_rate": 2.5658944841410032e-06, + "loss": 0.2856, + "step": 542 + }, + { + "epoch": 1.5426136363636362, + "grad_norm": 0.4085808452620872, + "learning_rate": 2.5581436831871666e-06, + "loss": 0.2611, + "step": 543 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 0.3377548562078041, + "learning_rate": 2.5503923230128787e-06, + "loss": 0.2445, + "step": 544 + }, + { + "epoch": 1.5482954545454546, + "grad_norm": 0.2986016210832829, + "learning_rate": 2.5426404781699886e-06, + "loss": 0.2345, + "step": 545 + }, + { + "epoch": 1.5511363636363638, + "grad_norm": 0.3130189679053128, + "learning_rate": 2.534888223215008e-06, + "loss": 0.2648, + "step": 546 + }, + { + "epoch": 1.5539772727272727, + "grad_norm": 0.29362772394820585, + "learning_rate": 2.5271356327083927e-06, + "loss": 0.2231, + "step": 547 + }, + { + "epoch": 1.5568181818181817, + "grad_norm": 0.3371287342113354, + "learning_rate": 2.5193827812138268e-06, + "loss": 0.2801, + "step": 548 + }, + { + "epoch": 1.5596590909090908, + "grad_norm": 0.438680590348071, + "learning_rate": 2.511629743297502e-06, + "loss": 0.3117, + "step": 549 + }, + { + "epoch": 1.5625, + "grad_norm": 0.3623332826643985, + "learning_rate": 2.5038765935274038e-06, + "loss": 0.2582, + "step": 550 + }, + { + "epoch": 1.5653409090909092, + "grad_norm": 0.3611764461964591, + "learning_rate": 2.4961234064725966e-06, + "loss": 0.2606, + "step": 551 + }, + { + "epoch": 1.5681818181818183, + "grad_norm": 0.6683755911265977, + "learning_rate": 2.488370256702499e-06, + "loss": 0.2686, + "step": 552 + }, + { + "epoch": 1.5710227272727273, + "grad_norm": 0.3699878510363697, + "learning_rate": 2.4806172187861736e-06, + "loss": 0.2823, + "step": 553 + }, + { + "epoch": 1.5738636363636362, + "grad_norm": 0.3603575134404355, + "learning_rate": 2.4728643672916073e-06, + "loss": 0.2696, + "step": 554 + }, + { + "epoch": 1.5767045454545454, + "grad_norm": 0.5708462895257692, + "learning_rate": 2.465111776784993e-06, + "loss": 0.3003, + "step": 555 + }, + { + "epoch": 1.5795454545454546, + "grad_norm": 0.414861092800249, + "learning_rate": 2.4573595218300127e-06, + "loss": 0.2878, + "step": 556 + }, + { + "epoch": 1.5823863636363638, + "grad_norm": 0.36176025431242964, + "learning_rate": 2.4496076769871226e-06, + "loss": 0.2614, + "step": 557 + }, + { + "epoch": 1.5852272727272727, + "grad_norm": 0.4170474058146532, + "learning_rate": 2.4418563168128346e-06, + "loss": 0.2868, + "step": 558 + }, + { + "epoch": 1.5880681818181817, + "grad_norm": 0.3270649689091589, + "learning_rate": 2.4341055158589976e-06, + "loss": 0.2699, + "step": 559 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.3807070125410976, + "learning_rate": 2.4263553486720838e-06, + "loss": 0.303, + "step": 560 + }, + { + "epoch": 1.59375, + "grad_norm": 0.3848553762149162, + "learning_rate": 2.4186058897924685e-06, + "loss": 0.2748, + "step": 561 + }, + { + "epoch": 1.5965909090909092, + "grad_norm": 0.3232840810454203, + "learning_rate": 2.410857213753715e-06, + "loss": 0.2445, + "step": 562 + }, + { + "epoch": 1.5994318181818183, + "grad_norm": 0.3092676360533537, + "learning_rate": 2.4031093950818583e-06, + "loss": 0.2356, + "step": 563 + }, + { + "epoch": 1.6022727272727273, + "grad_norm": 0.45118596036379494, + "learning_rate": 2.3953625082946856e-06, + "loss": 0.2837, + "step": 564 + }, + { + "epoch": 1.6051136363636362, + "grad_norm": 0.34970482571526373, + "learning_rate": 2.3876166279010212e-06, + "loss": 0.2973, + "step": 565 + }, + { + "epoch": 1.6079545454545454, + "grad_norm": 0.3364465296058301, + "learning_rate": 2.379871828400012e-06, + "loss": 0.2423, + "step": 566 + }, + { + "epoch": 1.6107954545454546, + "grad_norm": 0.363328151622841, + "learning_rate": 2.372128184280408e-06, + "loss": 0.269, + "step": 567 + }, + { + "epoch": 1.6136363636363638, + "grad_norm": 0.26766248199292697, + "learning_rate": 2.364385770019846e-06, + "loss": 0.2346, + "step": 568 + }, + { + "epoch": 1.6164772727272727, + "grad_norm": 0.3913465078730921, + "learning_rate": 2.356644660084135e-06, + "loss": 0.2866, + "step": 569 + }, + { + "epoch": 1.6193181818181817, + "grad_norm": 0.31905393138162685, + "learning_rate": 2.34890492892654e-06, + "loss": 0.2666, + "step": 570 + }, + { + "epoch": 1.6221590909090908, + "grad_norm": 0.3432468450311117, + "learning_rate": 2.341166650987064e-06, + "loss": 0.2443, + "step": 571 + }, + { + "epoch": 1.625, + "grad_norm": 0.34070598347786063, + "learning_rate": 2.333429900691731e-06, + "loss": 0.2968, + "step": 572 + }, + { + "epoch": 1.6278409090909092, + "grad_norm": 0.4257323783577944, + "learning_rate": 2.3256947524518756e-06, + "loss": 0.275, + "step": 573 + }, + { + "epoch": 1.6306818181818183, + "grad_norm": 0.35120372623976087, + "learning_rate": 2.317961280663424e-06, + "loss": 0.2779, + "step": 574 + }, + { + "epoch": 1.6335227272727273, + "grad_norm": 0.3288834361465399, + "learning_rate": 2.3102295597061757e-06, + "loss": 0.262, + "step": 575 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.3781083785525166, + "learning_rate": 2.3024996639430925e-06, + "loss": 0.2705, + "step": 576 + }, + { + "epoch": 1.6392045454545454, + "grad_norm": 0.3309675255585671, + "learning_rate": 2.2947716677195823e-06, + "loss": 0.2607, + "step": 577 + }, + { + "epoch": 1.6420454545454546, + "grad_norm": 0.4097606078658523, + "learning_rate": 2.2870456453627823e-06, + "loss": 0.3267, + "step": 578 + }, + { + "epoch": 1.6448863636363638, + "grad_norm": 0.25572751310886616, + "learning_rate": 2.2793216711808456e-06, + "loss": 0.2278, + "step": 579 + }, + { + "epoch": 1.6477272727272727, + "grad_norm": 0.3060607584281395, + "learning_rate": 2.2715998194622257e-06, + "loss": 0.2517, + "step": 580 + }, + { + "epoch": 1.6505681818181817, + "grad_norm": 0.37963871119456877, + "learning_rate": 2.2638801644749636e-06, + "loss": 0.2634, + "step": 581 + }, + { + "epoch": 1.6534090909090908, + "grad_norm": 0.3762574705485531, + "learning_rate": 2.2561627804659704e-06, + "loss": 0.2534, + "step": 582 + }, + { + "epoch": 1.65625, + "grad_norm": 0.34282885282451137, + "learning_rate": 2.2484477416603183e-06, + "loss": 0.2666, + "step": 583 + }, + { + "epoch": 1.6590909090909092, + "grad_norm": 0.3508691585265268, + "learning_rate": 2.24073512226052e-06, + "loss": 0.2589, + "step": 584 + }, + { + "epoch": 1.6619318181818183, + "grad_norm": 0.38903092342578377, + "learning_rate": 2.2330249964458202e-06, + "loss": 0.2853, + "step": 585 + }, + { + "epoch": 1.6647727272727273, + "grad_norm": 0.3466002683474289, + "learning_rate": 2.2253174383714816e-06, + "loss": 0.2812, + "step": 586 + }, + { + "epoch": 1.6676136363636362, + "grad_norm": 0.46395674632161, + "learning_rate": 2.21761252216807e-06, + "loss": 0.2692, + "step": 587 + }, + { + "epoch": 1.6704545454545454, + "grad_norm": 0.3699824822038089, + "learning_rate": 2.2099103219407392e-06, + "loss": 0.2699, + "step": 588 + }, + { + "epoch": 1.6732954545454546, + "grad_norm": 0.3805031596017454, + "learning_rate": 2.2022109117685246e-06, + "loss": 0.2953, + "step": 589 + }, + { + "epoch": 1.6761363636363638, + "grad_norm": 0.37764726137134685, + "learning_rate": 2.1945143657036267e-06, + "loss": 0.2753, + "step": 590 + }, + { + "epoch": 1.6789772727272727, + "grad_norm": 0.3304479070305256, + "learning_rate": 2.1868207577706964e-06, + "loss": 0.2524, + "step": 591 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 0.3587520279737923, + "learning_rate": 2.1791301619661297e-06, + "loss": 0.2602, + "step": 592 + }, + { + "epoch": 1.6846590909090908, + "grad_norm": 0.3323465218687911, + "learning_rate": 2.17144265225735e-06, + "loss": 0.2692, + "step": 593 + }, + { + "epoch": 1.6875, + "grad_norm": 0.3572276587914552, + "learning_rate": 2.1637583025820985e-06, + "loss": 0.2858, + "step": 594 + }, + { + "epoch": 1.6903409090909092, + "grad_norm": 0.37800630772529514, + "learning_rate": 2.156077186847726e-06, + "loss": 0.294, + "step": 595 + }, + { + "epoch": 1.6931818181818183, + "grad_norm": 0.3421660175170903, + "learning_rate": 2.148399378930476e-06, + "loss": 0.2573, + "step": 596 + }, + { + "epoch": 1.6960227272727273, + "grad_norm": 0.34254475964042214, + "learning_rate": 2.1407249526747816e-06, + "loss": 0.275, + "step": 597 + }, + { + "epoch": 1.6988636363636362, + "grad_norm": 0.3715201904697272, + "learning_rate": 2.133053981892547e-06, + "loss": 0.2833, + "step": 598 + }, + { + "epoch": 1.7017045454545454, + "grad_norm": 0.36015289752626467, + "learning_rate": 2.125386540362447e-06, + "loss": 0.2828, + "step": 599 + }, + { + "epoch": 1.7045454545454546, + "grad_norm": 0.40367397113055686, + "learning_rate": 2.1177227018292086e-06, + "loss": 0.2621, + "step": 600 + }, + { + "epoch": 1.7073863636363638, + "grad_norm": 0.32129619035430856, + "learning_rate": 2.110062540002906e-06, + "loss": 0.2757, + "step": 601 + }, + { + "epoch": 1.7102272727272727, + "grad_norm": 0.3137451287766472, + "learning_rate": 2.1024061285582546e-06, + "loss": 0.2535, + "step": 602 + }, + { + "epoch": 1.7130681818181817, + "grad_norm": 0.4280343421587481, + "learning_rate": 2.0947535411338936e-06, + "loss": 0.2559, + "step": 603 + }, + { + "epoch": 1.7159090909090908, + "grad_norm": 0.38561258389624026, + "learning_rate": 2.087104851331686e-06, + "loss": 0.339, + "step": 604 + }, + { + "epoch": 1.71875, + "grad_norm": 0.3187139343663328, + "learning_rate": 2.0794601327160083e-06, + "loss": 0.224, + "step": 605 + }, + { + "epoch": 1.7215909090909092, + "grad_norm": 0.4058807325173988, + "learning_rate": 2.0718194588130435e-06, + "loss": 0.2743, + "step": 606 + }, + { + "epoch": 1.7244318181818183, + "grad_norm": 0.3501025253129524, + "learning_rate": 2.0641829031100685e-06, + "loss": 0.2534, + "step": 607 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.34621897515864436, + "learning_rate": 2.0565505390547558e-06, + "loss": 0.2565, + "step": 608 + }, + { + "epoch": 1.7301136363636362, + "grad_norm": 0.2972165110796837, + "learning_rate": 2.0489224400544626e-06, + "loss": 0.2472, + "step": 609 + }, + { + "epoch": 1.7329545454545454, + "grad_norm": 0.28430139406095895, + "learning_rate": 2.041298679475524e-06, + "loss": 0.2278, + "step": 610 + }, + { + "epoch": 1.7357954545454546, + "grad_norm": 0.3424108937746101, + "learning_rate": 2.033679330642548e-06, + "loss": 0.2708, + "step": 611 + }, + { + "epoch": 1.7386363636363638, + "grad_norm": 0.34689691643105225, + "learning_rate": 2.026064466837712e-06, + "loss": 0.2489, + "step": 612 + }, + { + "epoch": 1.7414772727272727, + "grad_norm": 0.36538604704717154, + "learning_rate": 2.018454161300058e-06, + "loss": 0.2959, + "step": 613 + }, + { + "epoch": 1.7443181818181817, + "grad_norm": 0.3914980478603566, + "learning_rate": 2.0108484872247836e-06, + "loss": 0.2877, + "step": 614 + }, + { + "epoch": 1.7471590909090908, + "grad_norm": 0.3460591534025964, + "learning_rate": 2.003247517762545e-06, + "loss": 0.2392, + "step": 615 + }, + { + "epoch": 1.75, + "grad_norm": 0.35201168894909723, + "learning_rate": 1.995651326018748e-06, + "loss": 0.2775, + "step": 616 + }, + { + "epoch": 1.7528409090909092, + "grad_norm": 0.3907457148602396, + "learning_rate": 1.988059985052847e-06, + "loss": 0.2649, + "step": 617 + }, + { + "epoch": 1.7556818181818183, + "grad_norm": 0.31089272434312254, + "learning_rate": 1.980473567877643e-06, + "loss": 0.2717, + "step": 618 + }, + { + "epoch": 1.7585227272727273, + "grad_norm": 0.39029862965581613, + "learning_rate": 1.9728921474585783e-06, + "loss": 0.2996, + "step": 619 + }, + { + "epoch": 1.7613636363636362, + "grad_norm": 0.37522254054472837, + "learning_rate": 1.965315796713038e-06, + "loss": 0.3206, + "step": 620 + }, + { + "epoch": 1.7642045454545454, + "grad_norm": 0.37421333571503007, + "learning_rate": 1.957744588509649e-06, + "loss": 0.2953, + "step": 621 + }, + { + "epoch": 1.7670454545454546, + "grad_norm": 0.4113713231201874, + "learning_rate": 1.9501785956675767e-06, + "loss": 0.2587, + "step": 622 + }, + { + "epoch": 1.7698863636363638, + "grad_norm": 0.3775256295092349, + "learning_rate": 1.942617890955824e-06, + "loss": 0.2706, + "step": 623 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 0.361676860315546, + "learning_rate": 1.935062547092535e-06, + "loss": 0.2573, + "step": 624 + }, + { + "epoch": 1.7755681818181817, + "grad_norm": 0.3828484280989141, + "learning_rate": 1.927512636744294e-06, + "loss": 0.2635, + "step": 625 + }, + { + "epoch": 1.7784090909090908, + "grad_norm": 0.3194894627210845, + "learning_rate": 1.9199682325254258e-06, + "loss": 0.2412, + "step": 626 + }, + { + "epoch": 1.78125, + "grad_norm": 0.3467465431720772, + "learning_rate": 1.9124294069972947e-06, + "loss": 0.2558, + "step": 627 + }, + { + "epoch": 1.7840909090909092, + "grad_norm": 0.40591415428499084, + "learning_rate": 1.9048962326676145e-06, + "loss": 0.2591, + "step": 628 + }, + { + "epoch": 1.7869318181818183, + "grad_norm": 0.324247081690912, + "learning_rate": 1.897368781989744e-06, + "loss": 0.2525, + "step": 629 + }, + { + "epoch": 1.7897727272727273, + "grad_norm": 0.30168524950243947, + "learning_rate": 1.889847127361992e-06, + "loss": 0.2414, + "step": 630 + }, + { + "epoch": 1.7926136363636362, + "grad_norm": 0.3391445741041072, + "learning_rate": 1.8823313411269226e-06, + "loss": 0.2666, + "step": 631 + }, + { + "epoch": 1.7954545454545454, + "grad_norm": 0.3695919372425977, + "learning_rate": 1.874821495570658e-06, + "loss": 0.2738, + "step": 632 + }, + { + "epoch": 1.7982954545454546, + "grad_norm": 0.41985233793486193, + "learning_rate": 1.8673176629221824e-06, + "loss": 0.2843, + "step": 633 + }, + { + "epoch": 1.8011363636363638, + "grad_norm": 0.34508550168400526, + "learning_rate": 1.8598199153526502e-06, + "loss": 0.2762, + "step": 634 + }, + { + "epoch": 1.8039772727272727, + "grad_norm": 0.34432258391495646, + "learning_rate": 1.852328324974688e-06, + "loss": 0.2746, + "step": 635 + }, + { + "epoch": 1.8068181818181817, + "grad_norm": 0.432219335772206, + "learning_rate": 1.8448429638417053e-06, + "loss": 0.293, + "step": 636 + }, + { + "epoch": 1.8096590909090908, + "grad_norm": 0.30494323840811877, + "learning_rate": 1.8373639039471974e-06, + "loss": 0.2483, + "step": 637 + }, + { + "epoch": 1.8125, + "grad_norm": 0.38979888807881874, + "learning_rate": 1.8298912172240568e-06, + "loss": 0.2665, + "step": 638 + }, + { + "epoch": 1.8153409090909092, + "grad_norm": 0.4409357967627925, + "learning_rate": 1.8224249755438773e-06, + "loss": 0.2979, + "step": 639 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.40058073253392457, + "learning_rate": 1.8149652507162662e-06, + "loss": 0.2402, + "step": 640 + }, + { + "epoch": 1.8210227272727273, + "grad_norm": 0.3781947301668901, + "learning_rate": 1.807512114488153e-06, + "loss": 0.2815, + "step": 641 + }, + { + "epoch": 1.8238636363636362, + "grad_norm": 0.32982880475917153, + "learning_rate": 1.8000656385430978e-06, + "loss": 0.274, + "step": 642 + }, + { + "epoch": 1.8267045454545454, + "grad_norm": 0.34588388650165885, + "learning_rate": 1.7926258945006008e-06, + "loss": 0.2415, + "step": 643 + }, + { + "epoch": 1.8295454545454546, + "grad_norm": 0.46509270816531234, + "learning_rate": 1.7851929539154188e-06, + "loss": 0.2352, + "step": 644 + }, + { + "epoch": 1.8323863636363638, + "grad_norm": 0.3949892127680776, + "learning_rate": 1.7777668882768723e-06, + "loss": 0.2731, + "step": 645 + }, + { + "epoch": 1.8352272727272727, + "grad_norm": 0.33118981202064834, + "learning_rate": 1.7703477690081584e-06, + "loss": 0.2062, + "step": 646 + }, + { + "epoch": 1.8380681818181817, + "grad_norm": 0.41123429927968475, + "learning_rate": 1.762935667465665e-06, + "loss": 0.2603, + "step": 647 + }, + { + "epoch": 1.8409090909090908, + "grad_norm": 0.4086985175493265, + "learning_rate": 1.7555306549382853e-06, + "loss": 0.2633, + "step": 648 + }, + { + "epoch": 1.84375, + "grad_norm": 0.3829776136552432, + "learning_rate": 1.7481328026467292e-06, + "loss": 0.2645, + "step": 649 + }, + { + "epoch": 1.8465909090909092, + "grad_norm": 0.36580249698143114, + "learning_rate": 1.7407421817428432e-06, + "loss": 0.2907, + "step": 650 + }, + { + "epoch": 1.8494318181818183, + "grad_norm": 0.5114322764325684, + "learning_rate": 1.733358863308918e-06, + "loss": 0.2491, + "step": 651 + }, + { + "epoch": 1.8522727272727273, + "grad_norm": 0.3758211802363351, + "learning_rate": 1.7259829183570146e-06, + "loss": 0.275, + "step": 652 + }, + { + "epoch": 1.8551136363636362, + "grad_norm": 0.44005362349975546, + "learning_rate": 1.7186144178282735e-06, + "loss": 0.2759, + "step": 653 + }, + { + "epoch": 1.8579545454545454, + "grad_norm": 0.41121803130231066, + "learning_rate": 1.7112534325922381e-06, + "loss": 0.2835, + "step": 654 + }, + { + "epoch": 1.8607954545454546, + "grad_norm": 0.37656111256141905, + "learning_rate": 1.7039000334461673e-06, + "loss": 0.2808, + "step": 655 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 0.3651987202447528, + "learning_rate": 1.6965542911143601e-06, + "loss": 0.3218, + "step": 656 + }, + { + "epoch": 1.8664772727272727, + "grad_norm": 0.40004844795530625, + "learning_rate": 1.6892162762474732e-06, + "loss": 0.2945, + "step": 657 + }, + { + "epoch": 1.8693181818181817, + "grad_norm": 0.33043091198634184, + "learning_rate": 1.6818860594218396e-06, + "loss": 0.2277, + "step": 658 + }, + { + "epoch": 1.8721590909090908, + "grad_norm": 0.3346497899463932, + "learning_rate": 1.674563711138791e-06, + "loss": 0.2324, + "step": 659 + }, + { + "epoch": 1.875, + "grad_norm": 0.32658486289094646, + "learning_rate": 1.6672493018239828e-06, + "loss": 0.242, + "step": 660 + }, + { + "epoch": 1.8778409090909092, + "grad_norm": 0.3483520142042606, + "learning_rate": 1.659942901826712e-06, + "loss": 0.2724, + "step": 661 + }, + { + "epoch": 1.8806818181818183, + "grad_norm": 0.3447989906256544, + "learning_rate": 1.6526445814192437e-06, + "loss": 0.2522, + "step": 662 + }, + { + "epoch": 1.8835227272727273, + "grad_norm": 0.3745982582543309, + "learning_rate": 1.6453544107961338e-06, + "loss": 0.268, + "step": 663 + }, + { + "epoch": 1.8863636363636362, + "grad_norm": 0.47460009049304464, + "learning_rate": 1.638072460073556e-06, + "loss": 0.3004, + "step": 664 + }, + { + "epoch": 1.8892045454545454, + "grad_norm": 0.38922747831910864, + "learning_rate": 1.6307987992886221e-06, + "loss": 0.2923, + "step": 665 + }, + { + "epoch": 1.8920454545454546, + "grad_norm": 0.3619334724335469, + "learning_rate": 1.6235334983987166e-06, + "loss": 0.2929, + "step": 666 + }, + { + "epoch": 1.8948863636363638, + "grad_norm": 0.4134447223169521, + "learning_rate": 1.6162766272808153e-06, + "loss": 0.2443, + "step": 667 + }, + { + "epoch": 1.8977272727272727, + "grad_norm": 0.37827695457409233, + "learning_rate": 1.6090282557308199e-06, + "loss": 0.2634, + "step": 668 + }, + { + "epoch": 1.9005681818181817, + "grad_norm": 0.37553439336248, + "learning_rate": 1.6017884534628821e-06, + "loss": 0.2624, + "step": 669 + }, + { + "epoch": 1.9034090909090908, + "grad_norm": 0.30503546597237136, + "learning_rate": 1.594557290108737e-06, + "loss": 0.2448, + "step": 670 + }, + { + "epoch": 1.90625, + "grad_norm": 0.33139361815750534, + "learning_rate": 1.5873348352170309e-06, + "loss": 0.2344, + "step": 671 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.4071705047497215, + "learning_rate": 1.5801211582526515e-06, + "loss": 0.2972, + "step": 672 + }, + { + "epoch": 1.9119318181818183, + "grad_norm": 0.3520108684037794, + "learning_rate": 1.5729163285960636e-06, + "loss": 0.3064, + "step": 673 + }, + { + "epoch": 1.9147727272727273, + "grad_norm": 1.044294639450523, + "learning_rate": 1.5657204155426372e-06, + "loss": 0.2764, + "step": 674 + }, + { + "epoch": 1.9176136363636362, + "grad_norm": 0.2733575442921981, + "learning_rate": 1.5585334883019845e-06, + "loss": 0.2115, + "step": 675 + }, + { + "epoch": 1.9204545454545454, + "grad_norm": 0.3436043484209694, + "learning_rate": 1.551355615997292e-06, + "loss": 0.2613, + "step": 676 + }, + { + "epoch": 1.9232954545454546, + "grad_norm": 0.347973792440035, + "learning_rate": 1.5441868676646588e-06, + "loss": 0.2984, + "step": 677 + }, + { + "epoch": 1.9261363636363638, + "grad_norm": 0.3714627621893232, + "learning_rate": 1.537027312252427e-06, + "loss": 0.2939, + "step": 678 + }, + { + "epoch": 1.9289772727272727, + "grad_norm": 0.36946185178466473, + "learning_rate": 1.5298770186205262e-06, + "loss": 0.3133, + "step": 679 + }, + { + "epoch": 1.9318181818181817, + "grad_norm": 0.4445653274012168, + "learning_rate": 1.522736055539804e-06, + "loss": 0.2638, + "step": 680 + }, + { + "epoch": 1.9346590909090908, + "grad_norm": 0.36757539498984404, + "learning_rate": 1.5156044916913687e-06, + "loss": 0.2594, + "step": 681 + }, + { + "epoch": 1.9375, + "grad_norm": 0.3740375047815139, + "learning_rate": 1.5084823956659284e-06, + "loss": 0.2816, + "step": 682 + }, + { + "epoch": 1.9403409090909092, + "grad_norm": 0.3691390978665013, + "learning_rate": 1.5013698359631284e-06, + "loss": 0.3269, + "step": 683 + }, + { + "epoch": 1.9431818181818183, + "grad_norm": 0.3882033860276734, + "learning_rate": 1.4942668809908966e-06, + "loss": 0.2926, + "step": 684 + }, + { + "epoch": 1.9460227272727273, + "grad_norm": 0.3303389257139215, + "learning_rate": 1.487173599064783e-06, + "loss": 0.2813, + "step": 685 + }, + { + "epoch": 1.9488636363636362, + "grad_norm": 0.385716686112769, + "learning_rate": 1.4800900584073025e-06, + "loss": 0.3027, + "step": 686 + }, + { + "epoch": 1.9517045454545454, + "grad_norm": 0.30969234063219786, + "learning_rate": 1.4730163271472808e-06, + "loss": 0.2848, + "step": 687 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 0.39460846418007084, + "learning_rate": 1.465952473319196e-06, + "loss": 0.2638, + "step": 688 + }, + { + "epoch": 1.9573863636363638, + "grad_norm": 0.38043423948555954, + "learning_rate": 1.458898564862528e-06, + "loss": 0.3017, + "step": 689 + }, + { + "epoch": 1.9602272727272727, + "grad_norm": 0.344190102552331, + "learning_rate": 1.4518546696211003e-06, + "loss": 0.2475, + "step": 690 + }, + { + "epoch": 1.9630681818181817, + "grad_norm": 0.3584182768945062, + "learning_rate": 1.4448208553424318e-06, + "loss": 0.2599, + "step": 691 + }, + { + "epoch": 1.9659090909090908, + "grad_norm": 0.4193998956615056, + "learning_rate": 1.4377971896770854e-06, + "loss": 0.2932, + "step": 692 + }, + { + "epoch": 1.96875, + "grad_norm": 0.3183638489077071, + "learning_rate": 1.4307837401780129e-06, + "loss": 0.2353, + "step": 693 + }, + { + "epoch": 1.9715909090909092, + "grad_norm": 0.551291367904842, + "learning_rate": 1.4237805742999078e-06, + "loss": 0.2888, + "step": 694 + }, + { + "epoch": 1.9744318181818183, + "grad_norm": 0.3836625936106596, + "learning_rate": 1.4167877593985604e-06, + "loss": 0.2606, + "step": 695 + }, + { + "epoch": 1.9772727272727273, + "grad_norm": 0.33238753899253215, + "learning_rate": 1.4098053627302021e-06, + "loss": 0.2511, + "step": 696 + }, + { + "epoch": 1.9801136363636362, + "grad_norm": 0.3697431181915492, + "learning_rate": 1.402833451450865e-06, + "loss": 0.2592, + "step": 697 + }, + { + "epoch": 1.9829545454545454, + "grad_norm": 0.39546306881879256, + "learning_rate": 1.3958720926157326e-06, + "loss": 0.2867, + "step": 698 + }, + { + "epoch": 1.9857954545454546, + "grad_norm": 0.37081182355768993, + "learning_rate": 1.3889213531784967e-06, + "loss": 0.2774, + "step": 699 + }, + { + "epoch": 1.9886363636363638, + "grad_norm": 0.31730996135018236, + "learning_rate": 1.3819812999907112e-06, + "loss": 0.2558, + "step": 700 + }, + { + "epoch": 1.9914772727272727, + "grad_norm": 0.48697102294004946, + "learning_rate": 1.3750519998011545e-06, + "loss": 0.2807, + "step": 701 + }, + { + "epoch": 1.9943181818181817, + "grad_norm": 0.32660834038500147, + "learning_rate": 1.3681335192551795e-06, + "loss": 0.266, + "step": 702 + }, + { + "epoch": 1.9971590909090908, + "grad_norm": 0.37088856838391165, + "learning_rate": 1.3612259248940778e-06, + "loss": 0.3023, + "step": 703 + }, + { + "epoch": 2.0, + "grad_norm": 0.46910986149494815, + "learning_rate": 1.354329283154442e-06, + "loss": 0.354, + "step": 704 + }, + { + "epoch": 2.002840909090909, + "grad_norm": 0.3578196229806462, + "learning_rate": 1.3474436603675195e-06, + "loss": 0.2863, + "step": 705 + }, + { + "epoch": 2.0056818181818183, + "grad_norm": 0.3320147175830239, + "learning_rate": 1.3405691227585774e-06, + "loss": 0.2791, + "step": 706 + }, + { + "epoch": 2.008522727272727, + "grad_norm": 0.4104267883722151, + "learning_rate": 1.333705736446272e-06, + "loss": 0.2599, + "step": 707 + }, + { + "epoch": 2.0113636363636362, + "grad_norm": 0.44025732665188794, + "learning_rate": 1.326853567442003e-06, + "loss": 0.2648, + "step": 708 + }, + { + "epoch": 2.0142045454545454, + "grad_norm": 0.4463091829454087, + "learning_rate": 1.320012681649284e-06, + "loss": 0.3235, + "step": 709 + }, + { + "epoch": 2.0170454545454546, + "grad_norm": 0.3977418006694515, + "learning_rate": 1.3131831448631099e-06, + "loss": 0.2494, + "step": 710 + }, + { + "epoch": 2.0198863636363638, + "grad_norm": 0.30294420075479717, + "learning_rate": 1.3063650227693192e-06, + "loss": 0.2274, + "step": 711 + }, + { + "epoch": 2.022727272727273, + "grad_norm": 0.3580935126068431, + "learning_rate": 1.2995583809439655e-06, + "loss": 0.2641, + "step": 712 + }, + { + "epoch": 2.0255681818181817, + "grad_norm": 0.3633999760316955, + "learning_rate": 1.2927632848526892e-06, + "loss": 0.2664, + "step": 713 + }, + { + "epoch": 2.028409090909091, + "grad_norm": 0.39362626572566367, + "learning_rate": 1.285979799850079e-06, + "loss": 0.3028, + "step": 714 + }, + { + "epoch": 2.03125, + "grad_norm": 0.3732307387516034, + "learning_rate": 1.2792079911790554e-06, + "loss": 0.2903, + "step": 715 + }, + { + "epoch": 2.034090909090909, + "grad_norm": 0.348231549102206, + "learning_rate": 1.2724479239702334e-06, + "loss": 0.2776, + "step": 716 + }, + { + "epoch": 2.0369318181818183, + "grad_norm": 0.32154175294270404, + "learning_rate": 1.2656996632413e-06, + "loss": 0.2363, + "step": 717 + }, + { + "epoch": 2.039772727272727, + "grad_norm": 0.3738689076803405, + "learning_rate": 1.2589632738963915e-06, + "loss": 0.2747, + "step": 718 + }, + { + "epoch": 2.0426136363636362, + "grad_norm": 0.370533612023648, + "learning_rate": 1.2522388207254624e-06, + "loss": 0.2568, + "step": 719 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.3839434235801676, + "learning_rate": 1.2455263684036687e-06, + "loss": 0.2792, + "step": 720 + }, + { + "epoch": 2.0482954545454546, + "grad_norm": 0.5003341324574189, + "learning_rate": 1.2388259814907421e-06, + "loss": 0.2769, + "step": 721 + }, + { + "epoch": 2.0511363636363638, + "grad_norm": 0.3351671952514299, + "learning_rate": 1.2321377244303718e-06, + "loss": 0.2296, + "step": 722 + }, + { + "epoch": 2.053977272727273, + "grad_norm": 0.2999985412422647, + "learning_rate": 1.22546166154958e-06, + "loss": 0.2284, + "step": 723 + }, + { + "epoch": 2.0568181818181817, + "grad_norm": 0.3135859144132813, + "learning_rate": 1.2187978570581118e-06, + "loss": 0.251, + "step": 724 + }, + { + "epoch": 2.059659090909091, + "grad_norm": 0.4125239171099722, + "learning_rate": 1.212146375047808e-06, + "loss": 0.2569, + "step": 725 + }, + { + "epoch": 2.0625, + "grad_norm": 0.5126461046016878, + "learning_rate": 1.2055072794919927e-06, + "loss": 0.2867, + "step": 726 + }, + { + "epoch": 2.065340909090909, + "grad_norm": 0.32428865281600694, + "learning_rate": 1.198880634244862e-06, + "loss": 0.2526, + "step": 727 + }, + { + "epoch": 2.0681818181818183, + "grad_norm": 0.5892083787676873, + "learning_rate": 1.192266503040863e-06, + "loss": 0.2827, + "step": 728 + }, + { + "epoch": 2.071022727272727, + "grad_norm": 0.2947475596312562, + "learning_rate": 1.1856649494940842e-06, + "loss": 0.2288, + "step": 729 + }, + { + "epoch": 2.0738636363636362, + "grad_norm": 0.35972607487628616, + "learning_rate": 1.1790760370976445e-06, + "loss": 0.268, + "step": 730 + }, + { + "epoch": 2.0767045454545454, + "grad_norm": 0.36619988601771414, + "learning_rate": 1.1724998292230804e-06, + "loss": 0.2832, + "step": 731 + }, + { + "epoch": 2.0795454545454546, + "grad_norm": 0.3733558388597783, + "learning_rate": 1.1659363891197373e-06, + "loss": 0.2723, + "step": 732 + }, + { + "epoch": 2.0823863636363638, + "grad_norm": 0.39404340487463446, + "learning_rate": 1.1593857799141635e-06, + "loss": 0.2823, + "step": 733 + }, + { + "epoch": 2.085227272727273, + "grad_norm": 0.39535002691603904, + "learning_rate": 1.152848064609499e-06, + "loss": 0.2765, + "step": 734 + }, + { + "epoch": 2.0880681818181817, + "grad_norm": 0.4562125910263655, + "learning_rate": 1.1463233060848701e-06, + "loss": 0.2229, + "step": 735 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 0.34157106543064586, + "learning_rate": 1.139811567094791e-06, + "loss": 0.251, + "step": 736 + }, + { + "epoch": 2.09375, + "grad_norm": 0.3975912471137775, + "learning_rate": 1.1333129102685504e-06, + "loss": 0.2953, + "step": 737 + }, + { + "epoch": 2.096590909090909, + "grad_norm": 0.4344936348962993, + "learning_rate": 1.1268273981096154e-06, + "loss": 0.2481, + "step": 738 + }, + { + "epoch": 2.0994318181818183, + "grad_norm": 0.40663820339750667, + "learning_rate": 1.1203550929950296e-06, + "loss": 0.2704, + "step": 739 + }, + { + "epoch": 2.102272727272727, + "grad_norm": 0.4525407147079834, + "learning_rate": 1.1138960571748122e-06, + "loss": 0.2308, + "step": 740 + }, + { + "epoch": 2.1051136363636362, + "grad_norm": 0.36101599924638966, + "learning_rate": 1.107450352771358e-06, + "loss": 0.3198, + "step": 741 + }, + { + "epoch": 2.1079545454545454, + "grad_norm": 0.4132570992405224, + "learning_rate": 1.1010180417788458e-06, + "loss": 0.3157, + "step": 742 + }, + { + "epoch": 2.1107954545454546, + "grad_norm": 0.4296796806025471, + "learning_rate": 1.094599186062633e-06, + "loss": 0.2719, + "step": 743 + }, + { + "epoch": 2.1136363636363638, + "grad_norm": 0.4115860705303619, + "learning_rate": 1.0881938473586672e-06, + "loss": 0.2588, + "step": 744 + }, + { + "epoch": 2.116477272727273, + "grad_norm": 0.3341390354972397, + "learning_rate": 1.0818020872728935e-06, + "loss": 0.2803, + "step": 745 + }, + { + "epoch": 2.1193181818181817, + "grad_norm": 0.386666143661149, + "learning_rate": 1.0754239672806526e-06, + "loss": 0.2954, + "step": 746 + }, + { + "epoch": 2.122159090909091, + "grad_norm": 0.39729795109834065, + "learning_rate": 1.0690595487261032e-06, + "loss": 0.292, + "step": 747 + }, + { + "epoch": 2.125, + "grad_norm": 0.4632063849794996, + "learning_rate": 1.0627088928216203e-06, + "loss": 0.3011, + "step": 748 + }, + { + "epoch": 2.127840909090909, + "grad_norm": 0.364788422480122, + "learning_rate": 1.0563720606472116e-06, + "loss": 0.2887, + "step": 749 + }, + { + "epoch": 2.1306818181818183, + "grad_norm": 0.3613800764493521, + "learning_rate": 1.050049113149932e-06, + "loss": 0.2698, + "step": 750 + }, + { + "epoch": 2.133522727272727, + "grad_norm": 0.4840054604670755, + "learning_rate": 1.0437401111432928e-06, + "loss": 0.2671, + "step": 751 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 0.35647589283664843, + "learning_rate": 1.0374451153066773e-06, + "loss": 0.277, + "step": 752 + }, + { + "epoch": 2.1392045454545454, + "grad_norm": 0.3070617647042118, + "learning_rate": 1.0311641861847644e-06, + "loss": 0.2262, + "step": 753 + }, + { + "epoch": 2.1420454545454546, + "grad_norm": 0.36421008528422827, + "learning_rate": 1.0248973841869336e-06, + "loss": 0.2541, + "step": 754 + }, + { + "epoch": 2.1448863636363638, + "grad_norm": 0.36442145568995793, + "learning_rate": 1.018644769586695e-06, + "loss": 0.2968, + "step": 755 + }, + { + "epoch": 2.147727272727273, + "grad_norm": 0.5392899583290776, + "learning_rate": 1.0124064025211063e-06, + "loss": 0.2338, + "step": 756 + }, + { + "epoch": 2.1505681818181817, + "grad_norm": 0.40485627469450297, + "learning_rate": 1.006182342990192e-06, + "loss": 0.2734, + "step": 757 + }, + { + "epoch": 2.153409090909091, + "grad_norm": 0.36165309778969656, + "learning_rate": 9.99972650856368e-07, + "loss": 0.2717, + "step": 758 + }, + { + "epoch": 2.15625, + "grad_norm": 0.37054356564143653, + "learning_rate": 9.937773858438677e-07, + "loss": 0.2867, + "step": 759 + }, + { + "epoch": 2.159090909090909, + "grad_norm": 0.3209190334600411, + "learning_rate": 9.87596607538164e-07, + "loss": 0.2026, + "step": 760 + }, + { + "epoch": 2.1619318181818183, + "grad_norm": 0.33862908014599463, + "learning_rate": 9.81430375385399e-07, + "loss": 0.2589, + "step": 761 + }, + { + "epoch": 2.164772727272727, + "grad_norm": 0.33768216225160724, + "learning_rate": 9.752787486918108e-07, + "loss": 0.2832, + "step": 762 + }, + { + "epoch": 2.1676136363636362, + "grad_norm": 0.33566640920720886, + "learning_rate": 9.691417866231633e-07, + "loss": 0.2646, + "step": 763 + }, + { + "epoch": 2.1704545454545454, + "grad_norm": 0.296999788237227, + "learning_rate": 9.630195482041778e-07, + "loss": 0.2405, + "step": 764 + }, + { + "epoch": 2.1732954545454546, + "grad_norm": 0.36623960819597895, + "learning_rate": 9.569120923179661e-07, + "loss": 0.2997, + "step": 765 + }, + { + "epoch": 2.1761363636363638, + "grad_norm": 0.35989187708509074, + "learning_rate": 9.508194777054613e-07, + "loss": 0.2627, + "step": 766 + }, + { + "epoch": 2.178977272727273, + "grad_norm": 0.45558444510597795, + "learning_rate": 9.447417629648542e-07, + "loss": 0.2939, + "step": 767 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 1.352661504436191, + "learning_rate": 9.386790065510326e-07, + "loss": 0.2674, + "step": 768 + }, + { + "epoch": 2.184659090909091, + "grad_norm": 0.3484066310248953, + "learning_rate": 9.326312667750143e-07, + "loss": 0.2647, + "step": 769 + }, + { + "epoch": 2.1875, + "grad_norm": 0.3372643949746599, + "learning_rate": 9.265986018033887e-07, + "loss": 0.2712, + "step": 770 + }, + { + "epoch": 2.190340909090909, + "grad_norm": 0.45171014423025785, + "learning_rate": 9.205810696577577e-07, + "loss": 0.2531, + "step": 771 + }, + { + "epoch": 2.1931818181818183, + "grad_norm": 0.3426033696862187, + "learning_rate": 9.14578728214176e-07, + "loss": 0.2594, + "step": 772 + }, + { + "epoch": 2.196022727272727, + "grad_norm": 0.44139931309445984, + "learning_rate": 9.085916352025983e-07, + "loss": 0.2747, + "step": 773 + }, + { + "epoch": 2.1988636363636362, + "grad_norm": 0.3644501914038969, + "learning_rate": 9.02619848206319e-07, + "loss": 0.3172, + "step": 774 + }, + { + "epoch": 2.2017045454545454, + "grad_norm": 0.41216240398841364, + "learning_rate": 8.966634246614208e-07, + "loss": 0.2614, + "step": 775 + }, + { + "epoch": 2.2045454545454546, + "grad_norm": 0.34732770899892357, + "learning_rate": 8.907224218562219e-07, + "loss": 0.248, + "step": 776 + }, + { + "epoch": 2.2073863636363638, + "grad_norm": 0.34245013389418555, + "learning_rate": 8.847968969307283e-07, + "loss": 0.295, + "step": 777 + }, + { + "epoch": 2.210227272727273, + "grad_norm": 0.3893001282929315, + "learning_rate": 8.788869068760758e-07, + "loss": 0.2967, + "step": 778 + }, + { + "epoch": 2.2130681818181817, + "grad_norm": 0.48226856220499215, + "learning_rate": 8.729925085339929e-07, + "loss": 0.3055, + "step": 779 + }, + { + "epoch": 2.215909090909091, + "grad_norm": 0.36479498548889644, + "learning_rate": 8.67113758596245e-07, + "loss": 0.2944, + "step": 780 + }, + { + "epoch": 2.21875, + "grad_norm": 0.311563765449273, + "learning_rate": 8.612507136040926e-07, + "loss": 0.2208, + "step": 781 + }, + { + "epoch": 2.221590909090909, + "grad_norm": 0.39153959534391375, + "learning_rate": 8.554034299477506e-07, + "loss": 0.2955, + "step": 782 + }, + { + "epoch": 2.2244318181818183, + "grad_norm": 0.3752941766025436, + "learning_rate": 8.495719638658395e-07, + "loss": 0.2882, + "step": 783 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 0.34306207357731855, + "learning_rate": 8.437563714448496e-07, + "loss": 0.2855, + "step": 784 + }, + { + "epoch": 2.2301136363636362, + "grad_norm": 0.2911256041409022, + "learning_rate": 8.379567086185989e-07, + "loss": 0.2245, + "step": 785 + }, + { + "epoch": 2.2329545454545454, + "grad_norm": 0.38423726910475914, + "learning_rate": 8.321730311676965e-07, + "loss": 0.2881, + "step": 786 + }, + { + "epoch": 2.2357954545454546, + "grad_norm": 0.28685959612362666, + "learning_rate": 8.264053947190051e-07, + "loss": 0.2168, + "step": 787 + }, + { + "epoch": 2.2386363636363638, + "grad_norm": 0.3177020831576707, + "learning_rate": 8.206538547451088e-07, + "loss": 0.2392, + "step": 788 + }, + { + "epoch": 2.241477272727273, + "grad_norm": 0.314674201211804, + "learning_rate": 8.149184665637746e-07, + "loss": 0.2244, + "step": 789 + }, + { + "epoch": 2.2443181818181817, + "grad_norm": 0.34609325605203806, + "learning_rate": 8.091992853374239e-07, + "loss": 0.2506, + "step": 790 + }, + { + "epoch": 2.247159090909091, + "grad_norm": 0.37417875469018747, + "learning_rate": 8.034963660726022e-07, + "loss": 0.297, + "step": 791 + }, + { + "epoch": 2.25, + "grad_norm": 0.4190001624824225, + "learning_rate": 7.978097636194482e-07, + "loss": 0.2822, + "step": 792 + }, + { + "epoch": 2.252840909090909, + "grad_norm": 0.31172594700443673, + "learning_rate": 7.921395326711664e-07, + "loss": 0.2277, + "step": 793 + }, + { + "epoch": 2.2556818181818183, + "grad_norm": 0.35515884644954326, + "learning_rate": 7.864857277635027e-07, + "loss": 0.252, + "step": 794 + }, + { + "epoch": 2.2585227272727275, + "grad_norm": 0.48510568393864467, + "learning_rate": 7.808484032742184e-07, + "loss": 0.2661, + "step": 795 + }, + { + "epoch": 2.2613636363636362, + "grad_norm": 0.40576550011180185, + "learning_rate": 7.75227613422567e-07, + "loss": 0.2624, + "step": 796 + }, + { + "epoch": 2.2642045454545454, + "grad_norm": 0.3153702935106711, + "learning_rate": 7.696234122687756e-07, + "loss": 0.2423, + "step": 797 + }, + { + "epoch": 2.2670454545454546, + "grad_norm": 0.45813794434618704, + "learning_rate": 7.640358537135214e-07, + "loss": 0.2773, + "step": 798 + }, + { + "epoch": 2.2698863636363638, + "grad_norm": 0.43799221687287815, + "learning_rate": 7.584649914974132e-07, + "loss": 0.2543, + "step": 799 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.36099400774254925, + "learning_rate": 7.5291087920048e-07, + "loss": 0.2554, + "step": 800 + }, + { + "epoch": 2.2755681818181817, + "grad_norm": 0.3681744190202427, + "learning_rate": 7.47373570241646e-07, + "loss": 0.2393, + "step": 801 + }, + { + "epoch": 2.278409090909091, + "grad_norm": 0.30088848462434675, + "learning_rate": 7.418531178782281e-07, + "loss": 0.2443, + "step": 802 + }, + { + "epoch": 2.28125, + "grad_norm": 0.36658882990515207, + "learning_rate": 7.363495752054145e-07, + "loss": 0.2716, + "step": 803 + }, + { + "epoch": 2.284090909090909, + "grad_norm": 0.3691396379554879, + "learning_rate": 7.30862995155758e-07, + "loss": 0.281, + "step": 804 + }, + { + "epoch": 2.2869318181818183, + "grad_norm": 0.3976865364065572, + "learning_rate": 7.25393430498669e-07, + "loss": 0.3126, + "step": 805 + }, + { + "epoch": 2.2897727272727275, + "grad_norm": 0.34972134382431147, + "learning_rate": 7.199409338399024e-07, + "loss": 0.2716, + "step": 806 + }, + { + "epoch": 2.2926136363636362, + "grad_norm": 0.359990470488163, + "learning_rate": 7.145055576210552e-07, + "loss": 0.282, + "step": 807 + }, + { + "epoch": 2.2954545454545454, + "grad_norm": 0.32127716098200765, + "learning_rate": 7.090873541190649e-07, + "loss": 0.2537, + "step": 808 + }, + { + "epoch": 2.2982954545454546, + "grad_norm": 0.3386422816466643, + "learning_rate": 7.036863754456985e-07, + "loss": 0.2663, + "step": 809 + }, + { + "epoch": 2.3011363636363638, + "grad_norm": 0.43294818109617667, + "learning_rate": 6.983026735470586e-07, + "loss": 0.3144, + "step": 810 + }, + { + "epoch": 2.3039772727272725, + "grad_norm": 0.3668974255373313, + "learning_rate": 6.929363002030829e-07, + "loss": 0.2665, + "step": 811 + }, + { + "epoch": 2.3068181818181817, + "grad_norm": 0.3372045903540735, + "learning_rate": 6.875873070270423e-07, + "loss": 0.2291, + "step": 812 + }, + { + "epoch": 2.309659090909091, + "grad_norm": 0.3686361653405783, + "learning_rate": 6.822557454650472e-07, + "loss": 0.3127, + "step": 813 + }, + { + "epoch": 2.3125, + "grad_norm": 0.3287416264369441, + "learning_rate": 6.769416667955545e-07, + "loss": 0.2497, + "step": 814 + }, + { + "epoch": 2.315340909090909, + "grad_norm": 0.378493696975223, + "learning_rate": 6.7164512212887e-07, + "loss": 0.2538, + "step": 815 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 0.3527906349071735, + "learning_rate": 6.6636616240666e-07, + "loss": 0.2759, + "step": 816 + }, + { + "epoch": 2.3210227272727275, + "grad_norm": 0.3283146351073707, + "learning_rate": 6.611048384014601e-07, + "loss": 0.2787, + "step": 817 + }, + { + "epoch": 2.3238636363636362, + "grad_norm": 0.4262766716182643, + "learning_rate": 6.558612007161876e-07, + "loss": 0.3367, + "step": 818 + }, + { + "epoch": 2.3267045454545454, + "grad_norm": 0.29243285573134076, + "learning_rate": 6.506352997836537e-07, + "loss": 0.2312, + "step": 819 + }, + { + "epoch": 2.3295454545454546, + "grad_norm": 0.3708515561207515, + "learning_rate": 6.454271858660816e-07, + "loss": 0.2947, + "step": 820 + }, + { + "epoch": 2.3323863636363638, + "grad_norm": 0.3031026424988807, + "learning_rate": 6.402369090546173e-07, + "loss": 0.2376, + "step": 821 + }, + { + "epoch": 2.3352272727272725, + "grad_norm": 0.40063837240803074, + "learning_rate": 6.350645192688531e-07, + "loss": 0.2706, + "step": 822 + }, + { + "epoch": 2.3380681818181817, + "grad_norm": 0.3931219211524187, + "learning_rate": 6.299100662563459e-07, + "loss": 0.2245, + "step": 823 + }, + { + "epoch": 2.340909090909091, + "grad_norm": 0.496053050631395, + "learning_rate": 6.247735995921375e-07, + "loss": 0.2665, + "step": 824 + }, + { + "epoch": 2.34375, + "grad_norm": 0.36983619426324377, + "learning_rate": 6.19655168678279e-07, + "loss": 0.2437, + "step": 825 + }, + { + "epoch": 2.346590909090909, + "grad_norm": 0.31853434490396093, + "learning_rate": 6.145548227433551e-07, + "loss": 0.237, + "step": 826 + }, + { + "epoch": 2.3494318181818183, + "grad_norm": 0.3833013165526796, + "learning_rate": 6.094726108420105e-07, + "loss": 0.2321, + "step": 827 + }, + { + "epoch": 2.3522727272727275, + "grad_norm": 0.34709948082141423, + "learning_rate": 6.044085818544807e-07, + "loss": 0.2435, + "step": 828 + }, + { + "epoch": 2.3551136363636362, + "grad_norm": 0.346027003213824, + "learning_rate": 5.993627844861172e-07, + "loss": 0.2536, + "step": 829 + }, + { + "epoch": 2.3579545454545454, + "grad_norm": 0.3350399776737133, + "learning_rate": 5.943352672669215e-07, + "loss": 0.2403, + "step": 830 + }, + { + "epoch": 2.3607954545454546, + "grad_norm": 0.32396672340715865, + "learning_rate": 5.89326078551081e-07, + "loss": 0.2213, + "step": 831 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.3844292483072848, + "learning_rate": 5.843352665164992e-07, + "loss": 0.249, + "step": 832 + }, + { + "epoch": 2.3664772727272725, + "grad_norm": 0.35019451009540753, + "learning_rate": 5.793628791643327e-07, + "loss": 0.2888, + "step": 833 + }, + { + "epoch": 2.3693181818181817, + "grad_norm": 0.3164025713303425, + "learning_rate": 5.744089643185355e-07, + "loss": 0.2515, + "step": 834 + }, + { + "epoch": 2.372159090909091, + "grad_norm": 0.3308520526667594, + "learning_rate": 5.69473569625392e-07, + "loss": 0.2587, + "step": 835 + }, + { + "epoch": 2.375, + "grad_norm": 0.3378919456195333, + "learning_rate": 5.645567425530607e-07, + "loss": 0.2433, + "step": 836 + }, + { + "epoch": 2.377840909090909, + "grad_norm": 0.3354025023866522, + "learning_rate": 5.596585303911217e-07, + "loss": 0.2542, + "step": 837 + }, + { + "epoch": 2.3806818181818183, + "grad_norm": 0.3756871055057431, + "learning_rate": 5.547789802501164e-07, + "loss": 0.2755, + "step": 838 + }, + { + "epoch": 2.3835227272727275, + "grad_norm": 0.3363888579054467, + "learning_rate": 5.499181390610958e-07, + "loss": 0.2545, + "step": 839 + }, + { + "epoch": 2.3863636363636362, + "grad_norm": 0.3726730517187886, + "learning_rate": 5.450760535751734e-07, + "loss": 0.2679, + "step": 840 + }, + { + "epoch": 2.3892045454545454, + "grad_norm": 0.34128788657594616, + "learning_rate": 5.402527703630681e-07, + "loss": 0.2744, + "step": 841 + }, + { + "epoch": 2.3920454545454546, + "grad_norm": 0.3112695417600679, + "learning_rate": 5.354483358146617e-07, + "loss": 0.2231, + "step": 842 + }, + { + "epoch": 2.3948863636363638, + "grad_norm": 0.34694374550516704, + "learning_rate": 5.306627961385538e-07, + "loss": 0.2571, + "step": 843 + }, + { + "epoch": 2.3977272727272725, + "grad_norm": 0.302981543192964, + "learning_rate": 5.258961973616117e-07, + "loss": 0.2427, + "step": 844 + }, + { + "epoch": 2.4005681818181817, + "grad_norm": 0.3008721863153869, + "learning_rate": 5.211485853285314e-07, + "loss": 0.2251, + "step": 845 + }, + { + "epoch": 2.403409090909091, + "grad_norm": 0.33302783458473956, + "learning_rate": 5.164200057013985e-07, + "loss": 0.2711, + "step": 846 + }, + { + "epoch": 2.40625, + "grad_norm": 0.3898327860869564, + "learning_rate": 5.117105039592444e-07, + "loss": 0.2869, + "step": 847 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 0.3770328552305208, + "learning_rate": 5.070201253976115e-07, + "loss": 0.2777, + "step": 848 + }, + { + "epoch": 2.4119318181818183, + "grad_norm": 0.32513904942970184, + "learning_rate": 5.02348915128118e-07, + "loss": 0.2655, + "step": 849 + }, + { + "epoch": 2.4147727272727275, + "grad_norm": 0.3173329184832482, + "learning_rate": 4.976969180780225e-07, + "loss": 0.2398, + "step": 850 + }, + { + "epoch": 2.4176136363636362, + "grad_norm": 0.3853995789331807, + "learning_rate": 4.930641789897938e-07, + "loss": 0.2699, + "step": 851 + }, + { + "epoch": 2.4204545454545454, + "grad_norm": 0.3880784265747346, + "learning_rate": 4.884507424206788e-07, + "loss": 0.2649, + "step": 852 + }, + { + "epoch": 2.4232954545454546, + "grad_norm": 0.3710421332719178, + "learning_rate": 4.838566527422742e-07, + "loss": 0.2604, + "step": 853 + }, + { + "epoch": 2.4261363636363638, + "grad_norm": 0.42114780257384915, + "learning_rate": 4.792819541400998e-07, + "loss": 0.2982, + "step": 854 + }, + { + "epoch": 2.4289772727272725, + "grad_norm": 0.3704518376341159, + "learning_rate": 4.747266906131759e-07, + "loss": 0.2916, + "step": 855 + }, + { + "epoch": 2.4318181818181817, + "grad_norm": 0.3664937063178789, + "learning_rate": 4.7019090597359624e-07, + "loss": 0.2586, + "step": 856 + }, + { + "epoch": 2.434659090909091, + "grad_norm": 0.30129914419743803, + "learning_rate": 4.656746438461085e-07, + "loss": 0.233, + "step": 857 + }, + { + "epoch": 2.4375, + "grad_norm": 0.3610260573998573, + "learning_rate": 4.611779476676956e-07, + "loss": 0.2295, + "step": 858 + }, + { + "epoch": 2.440340909090909, + "grad_norm": 0.31555005162338934, + "learning_rate": 4.5670086068715564e-07, + "loss": 0.2324, + "step": 859 + }, + { + "epoch": 2.4431818181818183, + "grad_norm": 0.38647155996115823, + "learning_rate": 4.522434259646896e-07, + "loss": 0.2509, + "step": 860 + }, + { + "epoch": 2.4460227272727275, + "grad_norm": 0.3295294330692125, + "learning_rate": 4.4780568637148277e-07, + "loss": 0.2409, + "step": 861 + }, + { + "epoch": 2.4488636363636362, + "grad_norm": 0.40919134523297795, + "learning_rate": 4.4338768458929455e-07, + "loss": 0.2753, + "step": 862 + }, + { + "epoch": 2.4517045454545454, + "grad_norm": 0.3281509333195072, + "learning_rate": 4.3898946311005054e-07, + "loss": 0.2776, + "step": 863 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 0.3003640118064134, + "learning_rate": 4.346110642354284e-07, + "loss": 0.2288, + "step": 864 + }, + { + "epoch": 2.4573863636363638, + "grad_norm": 0.2856917980597871, + "learning_rate": 4.30252530076454e-07, + "loss": 0.2262, + "step": 865 + }, + { + "epoch": 2.4602272727272725, + "grad_norm": 0.3716917156792666, + "learning_rate": 4.259139025530981e-07, + "loss": 0.2704, + "step": 866 + }, + { + "epoch": 2.4630681818181817, + "grad_norm": 0.3646615055009088, + "learning_rate": 4.2159522339387027e-07, + "loss": 0.2422, + "step": 867 + }, + { + "epoch": 2.465909090909091, + "grad_norm": 0.352013885171188, + "learning_rate": 4.1729653413541795e-07, + "loss": 0.2586, + "step": 868 + }, + { + "epoch": 2.46875, + "grad_norm": 0.3341889105921635, + "learning_rate": 4.13017876122129e-07, + "loss": 0.2514, + "step": 869 + }, + { + "epoch": 2.471590909090909, + "grad_norm": 0.3436869862214985, + "learning_rate": 4.087592905057319e-07, + "loss": 0.2663, + "step": 870 + }, + { + "epoch": 2.4744318181818183, + "grad_norm": 0.3459285477446355, + "learning_rate": 4.0452081824490007e-07, + "loss": 0.2274, + "step": 871 + }, + { + "epoch": 2.4772727272727275, + "grad_norm": 0.39474776701060227, + "learning_rate": 4.0030250010486106e-07, + "loss": 0.2635, + "step": 872 + }, + { + "epoch": 2.4801136363636362, + "grad_norm": 0.3588162845171683, + "learning_rate": 3.9610437665699803e-07, + "loss": 0.2702, + "step": 873 + }, + { + "epoch": 2.4829545454545454, + "grad_norm": 0.3055170644052573, + "learning_rate": 3.919264882784662e-07, + "loss": 0.2642, + "step": 874 + }, + { + "epoch": 2.4857954545454546, + "grad_norm": 0.4004147388266674, + "learning_rate": 3.8776887515180215e-07, + "loss": 0.2673, + "step": 875 + }, + { + "epoch": 2.4886363636363638, + "grad_norm": 0.3435684772838886, + "learning_rate": 3.836315772645355e-07, + "loss": 0.2572, + "step": 876 + }, + { + "epoch": 2.4914772727272725, + "grad_norm": 0.3929983920357782, + "learning_rate": 3.79514634408806e-07, + "loss": 0.314, + "step": 877 + }, + { + "epoch": 2.4943181818181817, + "grad_norm": 0.3402456651574272, + "learning_rate": 3.7541808618098225e-07, + "loss": 0.2742, + "step": 878 + }, + { + "epoch": 2.497159090909091, + "grad_norm": 0.3391484648776555, + "learning_rate": 3.713419719812775e-07, + "loss": 0.2957, + "step": 879 + }, + { + "epoch": 2.5, + "grad_norm": 0.3300372482602716, + "learning_rate": 3.6728633101337283e-07, + "loss": 0.2402, + "step": 880 + }, + { + "epoch": 2.502840909090909, + "grad_norm": 0.3880324057454857, + "learning_rate": 3.632512022840401e-07, + "loss": 0.225, + "step": 881 + }, + { + "epoch": 2.5056818181818183, + "grad_norm": 0.40083562156829194, + "learning_rate": 3.592366246027654e-07, + "loss": 0.2885, + "step": 882 + }, + { + "epoch": 2.5085227272727275, + "grad_norm": 0.3898508645513151, + "learning_rate": 3.552426365813791e-07, + "loss": 0.279, + "step": 883 + }, + { + "epoch": 2.5113636363636362, + "grad_norm": 0.34747356344583896, + "learning_rate": 3.512692766336795e-07, + "loss": 0.2551, + "step": 884 + }, + { + "epoch": 2.5142045454545454, + "grad_norm": 0.3697878476145354, + "learning_rate": 3.4731658297506717e-07, + "loss": 0.2584, + "step": 885 + }, + { + "epoch": 2.5170454545454546, + "grad_norm": 0.3442593343497222, + "learning_rate": 3.433845936221772e-07, + "loss": 0.2323, + "step": 886 + }, + { + "epoch": 2.5198863636363638, + "grad_norm": 0.4052191198887473, + "learning_rate": 3.394733463925115e-07, + "loss": 0.2895, + "step": 887 + }, + { + "epoch": 2.5227272727272725, + "grad_norm": 0.3639886136390821, + "learning_rate": 3.355828789040752e-07, + "loss": 0.276, + "step": 888 + }, + { + "epoch": 2.5255681818181817, + "grad_norm": 0.39883666474289897, + "learning_rate": 3.3171322857501796e-07, + "loss": 0.2858, + "step": 889 + }, + { + "epoch": 2.528409090909091, + "grad_norm": 0.40889869044433336, + "learning_rate": 3.278644326232713e-07, + "loss": 0.257, + "step": 890 + }, + { + "epoch": 2.53125, + "grad_norm": 0.3284086126915543, + "learning_rate": 3.2403652806619e-07, + "loss": 0.2699, + "step": 891 + }, + { + "epoch": 2.534090909090909, + "grad_norm": 0.3806103148982155, + "learning_rate": 3.2022955172019947e-07, + "loss": 0.2607, + "step": 892 + }, + { + "epoch": 2.5369318181818183, + "grad_norm": 0.414262076377764, + "learning_rate": 3.1644354020043846e-07, + "loss": 0.2709, + "step": 893 + }, + { + "epoch": 2.5397727272727275, + "grad_norm": 0.3564646964673218, + "learning_rate": 3.1267852992040715e-07, + "loss": 0.2845, + "step": 894 + }, + { + "epoch": 2.5426136363636362, + "grad_norm": 0.35912306046922576, + "learning_rate": 3.0893455709162023e-07, + "loss": 0.2466, + "step": 895 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.3605968532309376, + "learning_rate": 3.052116577232533e-07, + "loss": 0.2868, + "step": 896 + }, + { + "epoch": 2.5482954545454546, + "grad_norm": 0.4129969651465434, + "learning_rate": 3.015098676218009e-07, + "loss": 0.2738, + "step": 897 + }, + { + "epoch": 2.5511363636363638, + "grad_norm": 0.38800081862705826, + "learning_rate": 2.9782922239073084e-07, + "loss": 0.274, + "step": 898 + }, + { + "epoch": 2.5539772727272725, + "grad_norm": 0.28725463626604075, + "learning_rate": 2.9416975743014134e-07, + "loss": 0.246, + "step": 899 + }, + { + "epoch": 2.5568181818181817, + "grad_norm": 0.33194490572792595, + "learning_rate": 2.9053150793642013e-07, + "loss": 0.2418, + "step": 900 + }, + { + "epoch": 2.559659090909091, + "grad_norm": 0.31927368240055043, + "learning_rate": 2.8691450890190794e-07, + "loss": 0.259, + "step": 901 + }, + { + "epoch": 2.5625, + "grad_norm": 0.4514488260064792, + "learning_rate": 2.833187951145588e-07, + "loss": 0.2674, + "step": 902 + }, + { + "epoch": 2.565340909090909, + "grad_norm": 0.3952590748072181, + "learning_rate": 2.797444011576089e-07, + "loss": 0.2764, + "step": 903 + }, + { + "epoch": 2.5681818181818183, + "grad_norm": 0.3035956390116324, + "learning_rate": 2.7619136140924153e-07, + "loss": 0.2361, + "step": 904 + }, + { + "epoch": 2.5710227272727275, + "grad_norm": 0.365463810965996, + "learning_rate": 2.726597100422565e-07, + "loss": 0.2955, + "step": 905 + }, + { + "epoch": 2.5738636363636362, + "grad_norm": 0.37417152902560946, + "learning_rate": 2.6914948102374384e-07, + "loss": 0.3007, + "step": 906 + }, + { + "epoch": 2.5767045454545454, + "grad_norm": 0.36872656495257466, + "learning_rate": 2.656607081147547e-07, + "loss": 0.2647, + "step": 907 + }, + { + "epoch": 2.5795454545454546, + "grad_norm": 0.381314240650295, + "learning_rate": 2.621934248699767e-07, + "loss": 0.3176, + "step": 908 + }, + { + "epoch": 2.5823863636363638, + "grad_norm": 0.41529466546435734, + "learning_rate": 2.5874766463741263e-07, + "loss": 0.2482, + "step": 909 + }, + { + "epoch": 2.5852272727272725, + "grad_norm": 0.4258649726301599, + "learning_rate": 2.553234605580593e-07, + "loss": 0.2618, + "step": 910 + }, + { + "epoch": 2.5880681818181817, + "grad_norm": 0.3762825021021476, + "learning_rate": 2.5192084556558776e-07, + "loss": 0.2914, + "step": 911 + }, + { + "epoch": 2.590909090909091, + "grad_norm": 0.3627506684619514, + "learning_rate": 2.4853985238602745e-07, + "loss": 0.2875, + "step": 912 + }, + { + "epoch": 2.59375, + "grad_norm": 0.3173651745814326, + "learning_rate": 2.451805135374516e-07, + "loss": 0.2421, + "step": 913 + }, + { + "epoch": 2.596590909090909, + "grad_norm": 0.44802208559240897, + "learning_rate": 2.4184286132966305e-07, + "loss": 0.2803, + "step": 914 + }, + { + "epoch": 2.5994318181818183, + "grad_norm": 0.36772649044669337, + "learning_rate": 2.3852692786388634e-07, + "loss": 0.3018, + "step": 915 + }, + { + "epoch": 2.6022727272727275, + "grad_norm": 0.3473737442586536, + "learning_rate": 2.3523274503245624e-07, + "loss": 0.2565, + "step": 916 + }, + { + "epoch": 2.6051136363636362, + "grad_norm": 0.31723371911082704, + "learning_rate": 2.319603445185109e-07, + "loss": 0.2769, + "step": 917 + }, + { + "epoch": 2.6079545454545454, + "grad_norm": 0.36837062880150556, + "learning_rate": 2.2870975779569066e-07, + "loss": 0.294, + "step": 918 + }, + { + "epoch": 2.6107954545454546, + "grad_norm": 0.34124708806422904, + "learning_rate": 2.2548101612783147e-07, + "loss": 0.2516, + "step": 919 + }, + { + "epoch": 2.6136363636363638, + "grad_norm": 0.3202170151424555, + "learning_rate": 2.2227415056866431e-07, + "loss": 0.254, + "step": 920 + }, + { + "epoch": 2.6164772727272725, + "grad_norm": 0.4260342271233267, + "learning_rate": 2.1908919196152013e-07, + "loss": 0.2719, + "step": 921 + }, + { + "epoch": 2.6193181818181817, + "grad_norm": 0.37728441327420986, + "learning_rate": 2.1592617093902978e-07, + "loss": 0.2753, + "step": 922 + }, + { + "epoch": 2.622159090909091, + "grad_norm": 0.39060937907330195, + "learning_rate": 2.1278511792283018e-07, + "loss": 0.2947, + "step": 923 + }, + { + "epoch": 2.625, + "grad_norm": 0.30888479325881507, + "learning_rate": 2.0966606312327303e-07, + "loss": 0.2284, + "step": 924 + }, + { + "epoch": 2.627840909090909, + "grad_norm": 0.40561974710485005, + "learning_rate": 2.065690365391329e-07, + "loss": 0.2943, + "step": 925 + }, + { + "epoch": 2.6306818181818183, + "grad_norm": 0.355886681039042, + "learning_rate": 2.0349406795731774e-07, + "loss": 0.2462, + "step": 926 + }, + { + "epoch": 2.6335227272727275, + "grad_norm": 0.37901081172880524, + "learning_rate": 2.0044118695258657e-07, + "loss": 0.2918, + "step": 927 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 0.48522777901179487, + "learning_rate": 1.9741042288725893e-07, + "loss": 0.3463, + "step": 928 + }, + { + "epoch": 2.6392045454545454, + "grad_norm": 0.35552067688931177, + "learning_rate": 1.944018049109375e-07, + "loss": 0.2589, + "step": 929 + }, + { + "epoch": 2.6420454545454546, + "grad_norm": 0.3245196964527464, + "learning_rate": 1.9141536196022658e-07, + "loss": 0.2667, + "step": 930 + }, + { + "epoch": 2.6448863636363638, + "grad_norm": 0.397373448701769, + "learning_rate": 1.884511227584518e-07, + "loss": 0.2635, + "step": 931 + }, + { + "epoch": 2.6477272727272725, + "grad_norm": 0.3230165219575403, + "learning_rate": 1.8550911581538517e-07, + "loss": 0.2524, + "step": 932 + }, + { + "epoch": 2.6505681818181817, + "grad_norm": 0.3201491067518106, + "learning_rate": 1.825893694269723e-07, + "loss": 0.2704, + "step": 933 + }, + { + "epoch": 2.653409090909091, + "grad_norm": 0.3806372642940993, + "learning_rate": 1.7969191167505811e-07, + "loss": 0.2891, + "step": 934 + }, + { + "epoch": 2.65625, + "grad_norm": 0.3315048294973883, + "learning_rate": 1.7681677042711732e-07, + "loss": 0.2469, + "step": 935 + }, + { + "epoch": 2.659090909090909, + "grad_norm": 0.3429832481491404, + "learning_rate": 1.7396397333598657e-07, + "loss": 0.2344, + "step": 936 + }, + { + "epoch": 2.6619318181818183, + "grad_norm": 0.31805225672924486, + "learning_rate": 1.711335478395984e-07, + "loss": 0.2301, + "step": 937 + }, + { + "epoch": 2.6647727272727275, + "grad_norm": 0.347431193735004, + "learning_rate": 1.6832552116071905e-07, + "loss": 0.274, + "step": 938 + }, + { + "epoch": 2.6676136363636362, + "grad_norm": 0.3276581659477082, + "learning_rate": 1.6553992030668293e-07, + "loss": 0.2569, + "step": 939 + }, + { + "epoch": 2.6704545454545454, + "grad_norm": 0.4181936566989231, + "learning_rate": 1.6277677206913588e-07, + "loss": 0.2737, + "step": 940 + }, + { + "epoch": 2.6732954545454546, + "grad_norm": 0.37610721012897674, + "learning_rate": 1.6003610302377708e-07, + "loss": 0.2999, + "step": 941 + }, + { + "epoch": 2.6761363636363638, + "grad_norm": 0.33046264353939814, + "learning_rate": 1.5731793953010193e-07, + "loss": 0.2427, + "step": 942 + }, + { + "epoch": 2.6789772727272725, + "grad_norm": 0.3494974820800891, + "learning_rate": 1.5462230773115066e-07, + "loss": 0.264, + "step": 943 + }, + { + "epoch": 2.6818181818181817, + "grad_norm": 0.3468159326122336, + "learning_rate": 1.5194923355325464e-07, + "loss": 0.3076, + "step": 944 + }, + { + "epoch": 2.684659090909091, + "grad_norm": 0.40045232274054987, + "learning_rate": 1.492987427057893e-07, + "loss": 0.3051, + "step": 945 + }, + { + "epoch": 2.6875, + "grad_norm": 0.4030575958079979, + "learning_rate": 1.4667086068092446e-07, + "loss": 0.2437, + "step": 946 + }, + { + "epoch": 2.690340909090909, + "grad_norm": 0.34082328928674294, + "learning_rate": 1.440656127533821e-07, + "loss": 0.2501, + "step": 947 + }, + { + "epoch": 2.6931818181818183, + "grad_norm": 0.34010796962843276, + "learning_rate": 1.414830239801898e-07, + "loss": 0.27, + "step": 948 + }, + { + "epoch": 2.6960227272727275, + "grad_norm": 0.4274695728838406, + "learning_rate": 1.3892311920044282e-07, + "loss": 0.2964, + "step": 949 + }, + { + "epoch": 2.6988636363636362, + "grad_norm": 0.35443571450269734, + "learning_rate": 1.3638592303506364e-07, + "loss": 0.252, + "step": 950 + }, + { + "epoch": 2.7017045454545454, + "grad_norm": 0.40737204314859, + "learning_rate": 1.3387145988656537e-07, + "loss": 0.2891, + "step": 951 + }, + { + "epoch": 2.7045454545454546, + "grad_norm": 0.352138799387513, + "learning_rate": 1.313797539388159e-07, + "loss": 0.2439, + "step": 952 + }, + { + "epoch": 2.7073863636363638, + "grad_norm": 0.33845536331763004, + "learning_rate": 1.2891082915680864e-07, + "loss": 0.2802, + "step": 953 + }, + { + "epoch": 2.7102272727272725, + "grad_norm": 0.35504925601892684, + "learning_rate": 1.264647092864288e-07, + "loss": 0.2514, + "step": 954 + }, + { + "epoch": 2.7130681818181817, + "grad_norm": 0.3609121713893806, + "learning_rate": 1.2404141785422568e-07, + "loss": 0.25, + "step": 955 + }, + { + "epoch": 2.715909090909091, + "grad_norm": 0.3936221787085924, + "learning_rate": 1.2164097816718818e-07, + "loss": 0.2312, + "step": 956 + }, + { + "epoch": 2.71875, + "grad_norm": 0.38365034429115125, + "learning_rate": 1.1926341331251756e-07, + "loss": 0.2682, + "step": 957 + }, + { + "epoch": 2.721590909090909, + "grad_norm": 0.31959559051327435, + "learning_rate": 1.169087461574081e-07, + "loss": 0.2457, + "step": 958 + }, + { + "epoch": 2.7244318181818183, + "grad_norm": 0.3799557870602865, + "learning_rate": 1.1457699934882715e-07, + "loss": 0.2968, + "step": 959 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.27723093935677195, + "learning_rate": 1.1226819531329342e-07, + "loss": 0.2219, + "step": 960 + }, + { + "epoch": 2.7301136363636362, + "grad_norm": 0.3534828660456155, + "learning_rate": 1.0998235625666708e-07, + "loss": 0.2433, + "step": 961 + }, + { + "epoch": 2.7329545454545454, + "grad_norm": 0.35791787748576426, + "learning_rate": 1.0771950416393228e-07, + "loss": 0.2597, + "step": 962 + }, + { + "epoch": 2.7357954545454546, + "grad_norm": 0.4475649717820448, + "learning_rate": 1.0547966079898637e-07, + "loss": 0.2636, + "step": 963 + }, + { + "epoch": 2.7386363636363638, + "grad_norm": 0.39027504830647813, + "learning_rate": 1.0326284770443063e-07, + "loss": 0.2728, + "step": 964 + }, + { + "epoch": 2.7414772727272725, + "grad_norm": 0.4315977251477179, + "learning_rate": 1.0106908620136525e-07, + "loss": 0.2588, + "step": 965 + }, + { + "epoch": 2.7443181818181817, + "grad_norm": 0.32246952155074843, + "learning_rate": 9.889839738918022e-08, + "loss": 0.2369, + "step": 966 + }, + { + "epoch": 2.747159090909091, + "grad_norm": 0.3333508436923039, + "learning_rate": 9.675080214535559e-08, + "loss": 0.2574, + "step": 967 + }, + { + "epoch": 2.75, + "grad_norm": 0.3654840156563527, + "learning_rate": 9.46263211252596e-08, + "loss": 0.3222, + "step": 968 + }, + { + "epoch": 2.752840909090909, + "grad_norm": 0.3366414190912868, + "learning_rate": 9.252497476194972e-08, + "loss": 0.2926, + "step": 969 + }, + { + "epoch": 2.7556818181818183, + "grad_norm": 0.3243823618475195, + "learning_rate": 9.044678326597722e-08, + "loss": 0.2484, + "step": 970 + }, + { + "epoch": 2.7585227272727275, + "grad_norm": 0.34777278160161157, + "learning_rate": 8.839176662519155e-08, + "loss": 0.2349, + "step": 971 + }, + { + "epoch": 2.7613636363636362, + "grad_norm": 0.34671371366502046, + "learning_rate": 8.635994460454766e-08, + "loss": 0.2574, + "step": 972 + }, + { + "epoch": 2.7642045454545454, + "grad_norm": 0.38617683116302787, + "learning_rate": 8.435133674591922e-08, + "loss": 0.3007, + "step": 973 + }, + { + "epoch": 2.7670454545454546, + "grad_norm": 0.4218961579649425, + "learning_rate": 8.2365962367906e-08, + "loss": 0.2916, + "step": 974 + }, + { + "epoch": 2.7698863636363638, + "grad_norm": 0.3971792338298757, + "learning_rate": 8.040384056565098e-08, + "loss": 0.2563, + "step": 975 + }, + { + "epoch": 2.7727272727272725, + "grad_norm": 0.3226524769417545, + "learning_rate": 7.846499021065684e-08, + "loss": 0.266, + "step": 976 + }, + { + "epoch": 2.7755681818181817, + "grad_norm": 0.3540519465775941, + "learning_rate": 7.654942995060283e-08, + "loss": 0.2616, + "step": 977 + }, + { + "epoch": 2.778409090909091, + "grad_norm": 0.3781537081979966, + "learning_rate": 7.465717820916624e-08, + "loss": 0.2698, + "step": 978 + }, + { + "epoch": 2.78125, + "grad_norm": 0.3564755050368105, + "learning_rate": 7.278825318584647e-08, + "loss": 0.27, + "step": 979 + }, + { + "epoch": 2.784090909090909, + "grad_norm": 0.3510249393237661, + "learning_rate": 7.094267285578688e-08, + "loss": 0.2666, + "step": 980 + }, + { + "epoch": 2.7869318181818183, + "grad_norm": 0.3998246424539849, + "learning_rate": 6.912045496960507e-08, + "loss": 0.2851, + "step": 981 + }, + { + "epoch": 2.7897727272727275, + "grad_norm": 0.37123966300816885, + "learning_rate": 6.732161705322093e-08, + "loss": 0.2528, + "step": 982 + }, + { + "epoch": 2.7926136363636362, + "grad_norm": 0.32607742324666744, + "learning_rate": 6.554617640768674e-08, + "loss": 0.2682, + "step": 983 + }, + { + "epoch": 2.7954545454545454, + "grad_norm": 0.3091883263291907, + "learning_rate": 6.379415010902362e-08, + "loss": 0.2431, + "step": 984 + }, + { + "epoch": 2.7982954545454546, + "grad_norm": 0.3896435979654701, + "learning_rate": 6.206555500805455e-08, + "loss": 0.2662, + "step": 985 + }, + { + "epoch": 2.8011363636363638, + "grad_norm": 0.36244662485716045, + "learning_rate": 6.036040773024387e-08, + "loss": 0.2708, + "step": 986 + }, + { + "epoch": 2.8039772727272725, + "grad_norm": 0.3558651773572941, + "learning_rate": 5.867872467553715e-08, + "loss": 0.3004, + "step": 987 + }, + { + "epoch": 2.8068181818181817, + "grad_norm": 0.37311773304851065, + "learning_rate": 5.702052201820352e-08, + "loss": 0.3088, + "step": 988 + }, + { + "epoch": 2.809659090909091, + "grad_norm": 0.411421481665237, + "learning_rate": 5.5385815706678894e-08, + "loss": 0.2923, + "step": 989 + }, + { + "epoch": 2.8125, + "grad_norm": 0.3759229007631887, + "learning_rate": 5.377462146341439e-08, + "loss": 0.2945, + "step": 990 + }, + { + "epoch": 2.815340909090909, + "grad_norm": 0.3014861546323833, + "learning_rate": 5.218695478472397e-08, + "loss": 0.2119, + "step": 991 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 0.4021583403485505, + "learning_rate": 5.062283094063536e-08, + "loss": 0.2878, + "step": 992 + }, + { + "epoch": 2.8210227272727275, + "grad_norm": 0.3293364475828707, + "learning_rate": 4.9082264974744665e-08, + "loss": 0.266, + "step": 993 + }, + { + "epoch": 2.8238636363636362, + "grad_norm": 0.30933470398564117, + "learning_rate": 4.756527170406922e-08, + "loss": 0.2314, + "step": 994 + }, + { + "epoch": 2.8267045454545454, + "grad_norm": 0.37909174739130147, + "learning_rate": 4.607186571890715e-08, + "loss": 0.2667, + "step": 995 + }, + { + "epoch": 2.8295454545454546, + "grad_norm": 0.37878603560502083, + "learning_rate": 4.46020613826964e-08, + "loss": 0.2937, + "step": 996 + }, + { + "epoch": 2.8323863636363638, + "grad_norm": 0.408496513297682, + "learning_rate": 4.3155872831875946e-08, + "loss": 0.2757, + "step": 997 + }, + { + "epoch": 2.8352272727272725, + "grad_norm": 0.3566593848752578, + "learning_rate": 4.1733313975750586e-08, + "loss": 0.2584, + "step": 998 + }, + { + "epoch": 2.8380681818181817, + "grad_norm": 0.352150696238673, + "learning_rate": 4.033439849635695e-08, + "loss": 0.2115, + "step": 999 + }, + { + "epoch": 2.840909090909091, + "grad_norm": 0.39392089147895293, + "learning_rate": 3.895913984833216e-08, + "loss": 0.2816, + "step": 1000 + }, + { + "epoch": 2.84375, + "grad_norm": 0.3412262767323334, + "learning_rate": 3.760755125878368e-08, + "loss": 0.2431, + "step": 1001 + }, + { + "epoch": 2.846590909090909, + "grad_norm": 0.3325324503502811, + "learning_rate": 3.627964572716331e-08, + "loss": 0.264, + "step": 1002 + }, + { + "epoch": 2.8494318181818183, + "grad_norm": 0.35296040111990046, + "learning_rate": 3.497543602514059e-08, + "loss": 0.2614, + "step": 1003 + }, + { + "epoch": 2.8522727272727275, + "grad_norm": 0.2837474483774213, + "learning_rate": 3.3694934696481275e-08, + "loss": 0.2123, + "step": 1004 + }, + { + "epoch": 2.8551136363636362, + "grad_norm": 0.34272040018575495, + "learning_rate": 3.24381540569263e-08, + "loss": 0.2808, + "step": 1005 + }, + { + "epoch": 2.8579545454545454, + "grad_norm": 0.3498353760521046, + "learning_rate": 3.120510619407324e-08, + "loss": 0.251, + "step": 1006 + }, + { + "epoch": 2.8607954545454546, + "grad_norm": 0.4069913912888687, + "learning_rate": 2.9995802967259516e-08, + "loss": 0.316, + "step": 1007 + }, + { + "epoch": 2.8636363636363638, + "grad_norm": 0.3361233831001831, + "learning_rate": 2.8810256007449632e-08, + "loss": 0.2293, + "step": 1008 + }, + { + "epoch": 2.8664772727272725, + "grad_norm": 0.4519558529396144, + "learning_rate": 2.7648476717122287e-08, + "loss": 0.2792, + "step": 1009 + }, + { + "epoch": 2.8693181818181817, + "grad_norm": 0.409732040720535, + "learning_rate": 2.651047627016068e-08, + "loss": 0.2904, + "step": 1010 + }, + { + "epoch": 2.872159090909091, + "grad_norm": 0.3250171306579268, + "learning_rate": 2.5396265611745687e-08, + "loss": 0.2463, + "step": 1011 + }, + { + "epoch": 2.875, + "grad_norm": 0.3856346320602474, + "learning_rate": 2.4305855458250373e-08, + "loss": 0.2356, + "step": 1012 + }, + { + "epoch": 2.877840909090909, + "grad_norm": 0.3526716263439721, + "learning_rate": 2.3239256297136193e-08, + "loss": 0.258, + "step": 1013 + }, + { + "epoch": 2.8806818181818183, + "grad_norm": 0.41510762650616695, + "learning_rate": 2.2196478386853624e-08, + "loss": 0.3018, + "step": 1014 + }, + { + "epoch": 2.8835227272727275, + "grad_norm": 0.2827340090469283, + "learning_rate": 2.117753175674142e-08, + "loss": 0.1949, + "step": 1015 + }, + { + "epoch": 2.8863636363636362, + "grad_norm": 0.42491879871002564, + "learning_rate": 2.0182426206932503e-08, + "loss": 0.2607, + "step": 1016 + }, + { + "epoch": 2.8892045454545454, + "grad_norm": 0.3281820518654654, + "learning_rate": 1.921117130825767e-08, + "loss": 0.266, + "step": 1017 + }, + { + "epoch": 2.8920454545454546, + "grad_norm": 0.5241869397210815, + "learning_rate": 1.82637764021551e-08, + "loss": 0.2566, + "step": 1018 + }, + { + "epoch": 2.8948863636363638, + "grad_norm": 0.36254656882284764, + "learning_rate": 1.7340250600579588e-08, + "loss": 0.2683, + "step": 1019 + }, + { + "epoch": 2.8977272727272725, + "grad_norm": 0.32113348760758087, + "learning_rate": 1.6440602785914584e-08, + "loss": 0.2495, + "step": 1020 + }, + { + "epoch": 2.9005681818181817, + "grad_norm": 0.39293475539987827, + "learning_rate": 1.556484161088806e-08, + "loss": 0.2673, + "step": 1021 + }, + { + "epoch": 2.903409090909091, + "grad_norm": 0.3692023050105476, + "learning_rate": 1.4712975498488158e-08, + "loss": 0.2676, + "step": 1022 + }, + { + "epoch": 2.90625, + "grad_norm": 0.3301143389304983, + "learning_rate": 1.3885012641882967e-08, + "loss": 0.2549, + "step": 1023 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.35000768054036296, + "learning_rate": 1.3080961004340308e-08, + "loss": 0.2769, + "step": 1024 + }, + { + "epoch": 2.9119318181818183, + "grad_norm": 0.3257952826732702, + "learning_rate": 1.2300828319153635e-08, + "loss": 0.2455, + "step": 1025 + }, + { + "epoch": 2.9147727272727275, + "grad_norm": 0.40990948777245817, + "learning_rate": 1.1544622089565139e-08, + "loss": 0.2999, + "step": 1026 + }, + { + "epoch": 2.9176136363636362, + "grad_norm": 0.5767979073587421, + "learning_rate": 1.0812349588694426e-08, + "loss": 0.2985, + "step": 1027 + }, + { + "epoch": 2.9204545454545454, + "grad_norm": 0.39277960762361686, + "learning_rate": 1.010401785947024e-08, + "loss": 0.3085, + "step": 1028 + }, + { + "epoch": 2.9232954545454546, + "grad_norm": 0.3487911223998262, + "learning_rate": 9.419633714559118e-09, + "loss": 0.2771, + "step": 1029 + }, + { + "epoch": 2.9261363636363638, + "grad_norm": 0.35424561590037207, + "learning_rate": 8.759203736304067e-09, + "loss": 0.2753, + "step": 1030 + }, + { + "epoch": 2.9289772727272725, + "grad_norm": 0.29485879447799396, + "learning_rate": 8.122734276657384e-09, + "loss": 0.2089, + "step": 1031 + }, + { + "epoch": 2.9318181818181817, + "grad_norm": 0.6294311065483419, + "learning_rate": 7.51023145712293e-09, + "loss": 0.3052, + "step": 1032 + }, + { + "epoch": 2.934659090909091, + "grad_norm": 0.354680706106559, + "learning_rate": 6.921701168694228e-09, + "loss": 0.2638, + "step": 1033 + }, + { + "epoch": 2.9375, + "grad_norm": 0.31404010890218703, + "learning_rate": 6.357149071800628e-09, + "loss": 0.2396, + "step": 1034 + }, + { + "epoch": 2.940340909090909, + "grad_norm": 0.3191510959590836, + "learning_rate": 5.816580596250676e-09, + "loss": 0.2652, + "step": 1035 + }, + { + "epoch": 2.9431818181818183, + "grad_norm": 0.33936071059872674, + "learning_rate": 5.300000941180494e-09, + "loss": 0.2761, + "step": 1036 + }, + { + "epoch": 2.9460227272727275, + "grad_norm": 0.6694940206582203, + "learning_rate": 4.807415075005206e-09, + "loss": 0.2716, + "step": 1037 + }, + { + "epoch": 2.9488636363636362, + "grad_norm": 0.3022654996639677, + "learning_rate": 4.338827735368423e-09, + "loss": 0.267, + "step": 1038 + }, + { + "epoch": 2.9517045454545454, + "grad_norm": 0.31223716729746726, + "learning_rate": 3.894243429098943e-09, + "loss": 0.2556, + "step": 1039 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.33999761359381697, + "learning_rate": 3.4736664321671777e-09, + "loss": 0.2234, + "step": 1040 + }, + { + "epoch": 2.9573863636363638, + "grad_norm": 0.38818142260184346, + "learning_rate": 3.0771007896424066e-09, + "loss": 0.2822, + "step": 1041 + }, + { + "epoch": 2.9602272727272725, + "grad_norm": 0.3915644733747401, + "learning_rate": 2.7045503156555853e-09, + "loss": 0.3089, + "step": 1042 + }, + { + "epoch": 2.9630681818181817, + "grad_norm": 0.35070734375473045, + "learning_rate": 2.3560185933621526e-09, + "loss": 0.2485, + "step": 1043 + }, + { + "epoch": 2.965909090909091, + "grad_norm": 0.32676962221864597, + "learning_rate": 2.031508974907337e-09, + "loss": 0.2564, + "step": 1044 + }, + { + "epoch": 2.96875, + "grad_norm": 0.37376434665996433, + "learning_rate": 1.7310245813939586e-09, + "loss": 0.2843, + "step": 1045 + }, + { + "epoch": 2.971590909090909, + "grad_norm": 0.3812123549505928, + "learning_rate": 1.4545683028521772e-09, + "loss": 0.2642, + "step": 1046 + }, + { + "epoch": 2.9744318181818183, + "grad_norm": 0.40366173461812144, + "learning_rate": 1.2021427982128463e-09, + "loss": 0.2714, + "step": 1047 + }, + { + "epoch": 2.9772727272727275, + "grad_norm": 0.38234650853272395, + "learning_rate": 9.737504952803124e-10, + "loss": 0.2483, + "step": 1048 + }, + { + "epoch": 2.9801136363636362, + "grad_norm": 0.3581632163317752, + "learning_rate": 7.693935907102102e-10, + "loss": 0.2448, + "step": 1049 + }, + { + "epoch": 2.9829545454545454, + "grad_norm": 0.44654505449503146, + "learning_rate": 5.890740499878145e-10, + "loss": 0.295, + "step": 1050 + }, + { + "epoch": 2.9857954545454546, + "grad_norm": 0.33560840489821, + "learning_rate": 4.3279360740972053e-10, + "loss": 0.2217, + "step": 1051 + }, + { + "epoch": 2.9886363636363638, + "grad_norm": 0.3283855292339783, + "learning_rate": 3.005537660663582e-10, + "loss": 0.219, + "step": 1052 + }, + { + "epoch": 2.9914772727272725, + "grad_norm": 0.35996516047736465, + "learning_rate": 1.923557978281143e-10, + "loss": 0.2571, + "step": 1053 + }, + { + "epoch": 2.9943181818181817, + "grad_norm": 0.30525860331677324, + "learning_rate": 1.0820074333256492e-10, + "loss": 0.2571, + "step": 1054 + }, + { + "epoch": 2.997159090909091, + "grad_norm": 0.37025001234738963, + "learning_rate": 4.808941197531614e-11, + "loss": 0.269, + "step": 1055 + }, + { + "epoch": 3.0, + "grad_norm": 0.3619551402376093, + "learning_rate": 1.2022381901399815e-11, + "loss": 0.24, + "step": 1056 + }, + { + "epoch": 3.0, + "step": 1056, + "total_flos": 1454552492015616.0, + "train_loss": 0.28625056774101476, + "train_runtime": 131042.0993, + "train_samples_per_second": 0.258, + "train_steps_per_second": 0.008 + } + ], + "logging_steps": 1.0, + "max_steps": 1056, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1454552492015616.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}