{ "best_metric": 0.244761124253273, "best_model_checkpoint": "Classifier-Intent-snowflake/checkpoint-803", "epoch": 1.0, "eval_steps": 500, "global_step": 803, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012453300124533001, "grad_norm": 14.392992973327637, "learning_rate": 3.1133250311332504e-08, "loss": 1.3872, "step": 1 }, { "epoch": 0.0024906600249066002, "grad_norm": 16.613218307495117, "learning_rate": 6.226650062266501e-08, "loss": 1.4209, "step": 2 }, { "epoch": 0.0037359900373599006, "grad_norm": 14.957581520080566, "learning_rate": 9.339975093399752e-08, "loss": 1.5269, "step": 3 }, { "epoch": 0.0049813200498132005, "grad_norm": 14.315893173217773, "learning_rate": 1.2453300124533001e-07, "loss": 1.3745, "step": 4 }, { "epoch": 0.0062266500622665, "grad_norm": 17.72991371154785, "learning_rate": 1.556662515566625e-07, "loss": 1.2588, "step": 5 }, { "epoch": 0.007471980074719801, "grad_norm": 15.170116424560547, "learning_rate": 1.8679950186799505e-07, "loss": 1.4722, "step": 6 }, { "epoch": 0.008717310087173101, "grad_norm": 14.7129487991333, "learning_rate": 2.1793275217932754e-07, "loss": 1.4404, "step": 7 }, { "epoch": 0.009962640099626401, "grad_norm": 19.042442321777344, "learning_rate": 2.4906600249066003e-07, "loss": 1.5845, "step": 8 }, { "epoch": 0.0112079701120797, "grad_norm": 14.830946922302246, "learning_rate": 2.801992528019925e-07, "loss": 1.3213, "step": 9 }, { "epoch": 0.012453300124533, "grad_norm": 15.1524076461792, "learning_rate": 3.11332503113325e-07, "loss": 1.2402, "step": 10 }, { "epoch": 0.0136986301369863, "grad_norm": 15.068155288696289, "learning_rate": 3.4246575342465755e-07, "loss": 1.3062, "step": 11 }, { "epoch": 0.014943960149439602, "grad_norm": 17.31379508972168, "learning_rate": 3.735990037359901e-07, "loss": 1.6055, "step": 12 }, { "epoch": 0.0161892901618929, "grad_norm": 15.690240859985352, "learning_rate": 4.0473225404732254e-07, 
"loss": 1.4761, "step": 13 }, { "epoch": 0.017434620174346202, "grad_norm": 14.473444938659668, "learning_rate": 4.358655043586551e-07, "loss": 1.4365, "step": 14 }, { "epoch": 0.0186799501867995, "grad_norm": 15.4556884765625, "learning_rate": 4.669987546699875e-07, "loss": 1.5645, "step": 15 }, { "epoch": 0.019925280199252802, "grad_norm": 16.610450744628906, "learning_rate": 4.981320049813201e-07, "loss": 1.3652, "step": 16 }, { "epoch": 0.021170610211706103, "grad_norm": null, "learning_rate": 4.981320049813201e-07, "loss": 1.5137, "step": 17 }, { "epoch": 0.0224159402241594, "grad_norm": 16.464548110961914, "learning_rate": 5.292652552926527e-07, "loss": 1.2983, "step": 18 }, { "epoch": 0.023661270236612703, "grad_norm": 13.879263877868652, "learning_rate": 5.60398505603985e-07, "loss": 1.3018, "step": 19 }, { "epoch": 0.024906600249066, "grad_norm": 18.191198348999023, "learning_rate": 5.915317559153176e-07, "loss": 1.5151, "step": 20 }, { "epoch": 0.026151930261519303, "grad_norm": 14.711188316345215, "learning_rate": 6.2266500622665e-07, "loss": 1.4517, "step": 21 }, { "epoch": 0.0273972602739726, "grad_norm": 20.39883804321289, "learning_rate": 6.537982565379826e-07, "loss": 1.5142, "step": 22 }, { "epoch": 0.028642590286425903, "grad_norm": 17.874603271484375, "learning_rate": 6.849315068493151e-07, "loss": 1.4731, "step": 23 }, { "epoch": 0.029887920298879204, "grad_norm": 15.248433113098145, "learning_rate": 7.160647571606476e-07, "loss": 1.4927, "step": 24 }, { "epoch": 0.031133250311332503, "grad_norm": 14.43382453918457, "learning_rate": 7.471980074719802e-07, "loss": 1.2744, "step": 25 }, { "epoch": 0.0323785803237858, "grad_norm": 20.193641662597656, "learning_rate": 7.783312577833126e-07, "loss": 1.5669, "step": 26 }, { "epoch": 0.033623910336239106, "grad_norm": 16.741762161254883, "learning_rate": 8.094645080946451e-07, "loss": 1.5303, "step": 27 }, { "epoch": 0.034869240348692404, "grad_norm": 15.6235933303833, "learning_rate": 
8.405977584059777e-07, "loss": 1.3936, "step": 28 }, { "epoch": 0.0361145703611457, "grad_norm": 14.727874755859375, "learning_rate": 8.717310087173102e-07, "loss": 1.4126, "step": 29 }, { "epoch": 0.037359900373599, "grad_norm": 16.20413589477539, "learning_rate": 9.028642590286426e-07, "loss": 1.4624, "step": 30 }, { "epoch": 0.038605230386052306, "grad_norm": 20.796939849853516, "learning_rate": 9.33997509339975e-07, "loss": 1.3433, "step": 31 }, { "epoch": 0.039850560398505604, "grad_norm": 16.971792221069336, "learning_rate": 9.651307596513077e-07, "loss": 1.3628, "step": 32 }, { "epoch": 0.0410958904109589, "grad_norm": 14.428796768188477, "learning_rate": 9.962640099626401e-07, "loss": 1.2837, "step": 33 }, { "epoch": 0.04234122042341221, "grad_norm": 15.790252685546875, "learning_rate": 1.0273972602739725e-06, "loss": 1.4268, "step": 34 }, { "epoch": 0.043586550435865505, "grad_norm": 16.02347183227539, "learning_rate": 1.0585305105853053e-06, "loss": 1.4766, "step": 35 }, { "epoch": 0.0448318804483188, "grad_norm": 15.317863464355469, "learning_rate": 1.0896637608966377e-06, "loss": 1.3018, "step": 36 }, { "epoch": 0.0460772104607721, "grad_norm": 22.28313636779785, "learning_rate": 1.12079701120797e-06, "loss": 1.4688, "step": 37 }, { "epoch": 0.047322540473225407, "grad_norm": 15.996356964111328, "learning_rate": 1.1519302615193027e-06, "loss": 1.1543, "step": 38 }, { "epoch": 0.048567870485678705, "grad_norm": 15.208770751953125, "learning_rate": 1.1830635118306353e-06, "loss": 1.4375, "step": 39 }, { "epoch": 0.049813200498132, "grad_norm": 15.227863311767578, "learning_rate": 1.2141967621419677e-06, "loss": 1.4365, "step": 40 }, { "epoch": 0.05105853051058531, "grad_norm": 14.673625946044922, "learning_rate": 1.2453300124533e-06, "loss": 1.2534, "step": 41 }, { "epoch": 0.052303860523038606, "grad_norm": 17.28438949584961, "learning_rate": 1.2764632627646329e-06, "loss": 1.5381, "step": 42 }, { "epoch": 0.053549190535491904, "grad_norm": 
16.5577449798584, "learning_rate": 1.3075965130759652e-06, "loss": 1.5459, "step": 43 }, { "epoch": 0.0547945205479452, "grad_norm": 18.29193687438965, "learning_rate": 1.3387297633872976e-06, "loss": 1.1919, "step": 44 }, { "epoch": 0.05603985056039851, "grad_norm": 15.694727897644043, "learning_rate": 1.3698630136986302e-06, "loss": 1.4409, "step": 45 }, { "epoch": 0.057285180572851806, "grad_norm": 14.10815715789795, "learning_rate": 1.4009962640099628e-06, "loss": 1.2461, "step": 46 }, { "epoch": 0.058530510585305104, "grad_norm": 14.045819282531738, "learning_rate": 1.4321295143212952e-06, "loss": 1.4111, "step": 47 }, { "epoch": 0.05977584059775841, "grad_norm": 19.675201416015625, "learning_rate": 1.4632627646326276e-06, "loss": 1.4072, "step": 48 }, { "epoch": 0.06102117061021171, "grad_norm": 14.410515785217285, "learning_rate": 1.4943960149439604e-06, "loss": 1.23, "step": 49 }, { "epoch": 0.062266500622665005, "grad_norm": 16.496902465820312, "learning_rate": 1.5255292652552928e-06, "loss": 1.3691, "step": 50 }, { "epoch": 0.06351183063511831, "grad_norm": 14.99001407623291, "learning_rate": 1.5566625155666252e-06, "loss": 1.2393, "step": 51 }, { "epoch": 0.0647571606475716, "grad_norm": 14.407447814941406, "learning_rate": 1.5877957658779578e-06, "loss": 1.3501, "step": 52 }, { "epoch": 0.0660024906600249, "grad_norm": 15.634856224060059, "learning_rate": 1.6189290161892901e-06, "loss": 1.5059, "step": 53 }, { "epoch": 0.06724782067247821, "grad_norm": 13.683075904846191, "learning_rate": 1.6500622665006227e-06, "loss": 1.2251, "step": 54 }, { "epoch": 0.0684931506849315, "grad_norm": 15.530966758728027, "learning_rate": 1.6811955168119553e-06, "loss": 1.229, "step": 55 }, { "epoch": 0.06973848069738481, "grad_norm": 14.17822265625, "learning_rate": 1.7123287671232877e-06, "loss": 1.2646, "step": 56 }, { "epoch": 0.07098381070983811, "grad_norm": 14.06949234008789, "learning_rate": 1.7434620174346203e-06, "loss": 1.1851, "step": 57 }, { "epoch": 
0.0722291407222914, "grad_norm": 13.386149406433105, "learning_rate": 1.774595267745953e-06, "loss": 1.1406, "step": 58 }, { "epoch": 0.07347447073474471, "grad_norm": 15.319520950317383, "learning_rate": 1.8057285180572853e-06, "loss": 1.2173, "step": 59 }, { "epoch": 0.074719800747198, "grad_norm": 14.985965728759766, "learning_rate": 1.8368617683686179e-06, "loss": 1.3159, "step": 60 }, { "epoch": 0.0759651307596513, "grad_norm": 17.426523208618164, "learning_rate": 1.86799501867995e-06, "loss": 1.0112, "step": 61 }, { "epoch": 0.07721046077210461, "grad_norm": 15.114604949951172, "learning_rate": 1.8991282689912827e-06, "loss": 1.3403, "step": 62 }, { "epoch": 0.0784557907845579, "grad_norm": 16.03323745727539, "learning_rate": 1.9302615193026155e-06, "loss": 1.2666, "step": 63 }, { "epoch": 0.07970112079701121, "grad_norm": 13.463469505310059, "learning_rate": 1.9613947696139476e-06, "loss": 1.106, "step": 64 }, { "epoch": 0.08094645080946451, "grad_norm": 15.67467212677002, "learning_rate": 1.9925280199252802e-06, "loss": 1.3604, "step": 65 }, { "epoch": 0.0821917808219178, "grad_norm": 16.3656063079834, "learning_rate": 2.023661270236613e-06, "loss": 1.0149, "step": 66 }, { "epoch": 0.08343711083437111, "grad_norm": 18.009429931640625, "learning_rate": 2.054794520547945e-06, "loss": 1.2056, "step": 67 }, { "epoch": 0.08468244084682441, "grad_norm": 17.479284286499023, "learning_rate": 2.085927770859278e-06, "loss": 0.96, "step": 68 }, { "epoch": 0.0859277708592777, "grad_norm": 18.173294067382812, "learning_rate": 2.1170610211706106e-06, "loss": 1.0894, "step": 69 }, { "epoch": 0.08717310087173101, "grad_norm": 13.998863220214844, "learning_rate": 2.148194271481943e-06, "loss": 1.1992, "step": 70 }, { "epoch": 0.08841843088418432, "grad_norm": 20.954397201538086, "learning_rate": 2.1793275217932754e-06, "loss": 1.2236, "step": 71 }, { "epoch": 0.0896637608966376, "grad_norm": 15.964156150817871, "learning_rate": 2.210460772104608e-06, "loss": 1.4097, "step": 
72 }, { "epoch": 0.09090909090909091, "grad_norm": 15.810689926147461, "learning_rate": 2.24159402241594e-06, "loss": 0.8547, "step": 73 }, { "epoch": 0.0921544209215442, "grad_norm": 17.040708541870117, "learning_rate": 2.2727272727272728e-06, "loss": 1.4102, "step": 74 }, { "epoch": 0.09339975093399751, "grad_norm": 14.936725616455078, "learning_rate": 2.3038605230386054e-06, "loss": 1.249, "step": 75 }, { "epoch": 0.09464508094645081, "grad_norm": 15.473489761352539, "learning_rate": 2.334993773349938e-06, "loss": 0.833, "step": 76 }, { "epoch": 0.0958904109589041, "grad_norm": 20.1041259765625, "learning_rate": 2.3661270236612705e-06, "loss": 1.4458, "step": 77 }, { "epoch": 0.09713574097135741, "grad_norm": 13.799981117248535, "learning_rate": 2.3972602739726027e-06, "loss": 1.0784, "step": 78 }, { "epoch": 0.09838107098381071, "grad_norm": 17.304981231689453, "learning_rate": 2.4283935242839353e-06, "loss": 1.5112, "step": 79 }, { "epoch": 0.099626400996264, "grad_norm": 13.382006645202637, "learning_rate": 2.459526774595268e-06, "loss": 1.063, "step": 80 }, { "epoch": 0.10087173100871731, "grad_norm": 14.760406494140625, "learning_rate": 2.4906600249066e-06, "loss": 1.1277, "step": 81 }, { "epoch": 0.10211706102117062, "grad_norm": 13.276914596557617, "learning_rate": 2.5217932752179327e-06, "loss": 0.9333, "step": 82 }, { "epoch": 0.10336239103362391, "grad_norm": 21.620939254760742, "learning_rate": 2.5529265255292657e-06, "loss": 1.7554, "step": 83 }, { "epoch": 0.10460772104607721, "grad_norm": 18.264217376708984, "learning_rate": 2.584059775840598e-06, "loss": 1.2744, "step": 84 }, { "epoch": 0.10585305105853052, "grad_norm": 15.217682838439941, "learning_rate": 2.6151930261519305e-06, "loss": 1.2827, "step": 85 }, { "epoch": 0.10709838107098381, "grad_norm": 18.51647186279297, "learning_rate": 2.646326276463263e-06, "loss": 1.5586, "step": 86 }, { "epoch": 0.10834371108343711, "grad_norm": 15.398965835571289, "learning_rate": 2.6774595267745952e-06, 
"loss": 0.916, "step": 87 }, { "epoch": 0.1095890410958904, "grad_norm": 14.449968338012695, "learning_rate": 2.708592777085928e-06, "loss": 0.6475, "step": 88 }, { "epoch": 0.11083437110834371, "grad_norm": 15.214373588562012, "learning_rate": 2.7397260273972604e-06, "loss": 1.1885, "step": 89 }, { "epoch": 0.11207970112079702, "grad_norm": 21.287311553955078, "learning_rate": 2.770859277708593e-06, "loss": 1.3501, "step": 90 }, { "epoch": 0.1133250311332503, "grad_norm": 14.835405349731445, "learning_rate": 2.8019925280199256e-06, "loss": 0.9062, "step": 91 }, { "epoch": 0.11457036114570361, "grad_norm": 16.75213050842285, "learning_rate": 2.833125778331258e-06, "loss": 1.1338, "step": 92 }, { "epoch": 0.11581569115815692, "grad_norm": 14.93796157836914, "learning_rate": 2.8642590286425904e-06, "loss": 0.9265, "step": 93 }, { "epoch": 0.11706102117061021, "grad_norm": 15.707828521728516, "learning_rate": 2.895392278953923e-06, "loss": 1.0312, "step": 94 }, { "epoch": 0.11830635118306351, "grad_norm": 15.904691696166992, "learning_rate": 2.926525529265255e-06, "loss": 0.9608, "step": 95 }, { "epoch": 0.11955168119551682, "grad_norm": 13.52252197265625, "learning_rate": 2.9576587795765878e-06, "loss": 0.6462, "step": 96 }, { "epoch": 0.12079701120797011, "grad_norm": 15.788945198059082, "learning_rate": 2.9887920298879208e-06, "loss": 1.2263, "step": 97 }, { "epoch": 0.12204234122042341, "grad_norm": 15.971314430236816, "learning_rate": 3.019925280199253e-06, "loss": 0.6865, "step": 98 }, { "epoch": 0.1232876712328767, "grad_norm": 16.350345611572266, "learning_rate": 3.0510585305105856e-06, "loss": 0.9343, "step": 99 }, { "epoch": 0.12453300124533001, "grad_norm": 23.604875564575195, "learning_rate": 3.0821917808219177e-06, "loss": 1.2271, "step": 100 }, { "epoch": 0.12577833125778332, "grad_norm": 16.765127182006836, "learning_rate": 3.1133250311332503e-06, "loss": 0.9685, "step": 101 }, { "epoch": 0.12702366127023662, "grad_norm": 19.068199157714844, 
"learning_rate": 3.144458281444583e-06, "loss": 1.4028, "step": 102 }, { "epoch": 0.12826899128268993, "grad_norm": 23.64339828491211, "learning_rate": 3.1755915317559155e-06, "loss": 1.1528, "step": 103 }, { "epoch": 0.1295143212951432, "grad_norm": 17.963857650756836, "learning_rate": 3.206724782067248e-06, "loss": 1.2183, "step": 104 }, { "epoch": 0.1307596513075965, "grad_norm": 24.50640106201172, "learning_rate": 3.2378580323785803e-06, "loss": 1.1194, "step": 105 }, { "epoch": 0.1320049813200498, "grad_norm": 13.496341705322266, "learning_rate": 3.268991282689913e-06, "loss": 0.8138, "step": 106 }, { "epoch": 0.13325031133250312, "grad_norm": 13.470151901245117, "learning_rate": 3.3001245330012455e-06, "loss": 0.4418, "step": 107 }, { "epoch": 0.13449564134495642, "grad_norm": 15.696036338806152, "learning_rate": 3.331257783312578e-06, "loss": 0.9106, "step": 108 }, { "epoch": 0.1357409713574097, "grad_norm": 15.40795612335205, "learning_rate": 3.3623910336239107e-06, "loss": 0.8492, "step": 109 }, { "epoch": 0.136986301369863, "grad_norm": 14.989590644836426, "learning_rate": 3.393524283935243e-06, "loss": 0.6815, "step": 110 }, { "epoch": 0.1382316313823163, "grad_norm": 11.08140754699707, "learning_rate": 3.4246575342465754e-06, "loss": 0.3635, "step": 111 }, { "epoch": 0.13947696139476962, "grad_norm": 13.492122650146484, "learning_rate": 3.455790784557908e-06, "loss": 0.4391, "step": 112 }, { "epoch": 0.14072229140722292, "grad_norm": 24.947566986083984, "learning_rate": 3.4869240348692406e-06, "loss": 1.6245, "step": 113 }, { "epoch": 0.14196762141967623, "grad_norm": 21.374814987182617, "learning_rate": 3.5180572851805732e-06, "loss": 1.0759, "step": 114 }, { "epoch": 0.1432129514321295, "grad_norm": 12.628018379211426, "learning_rate": 3.549190535491906e-06, "loss": 0.3741, "step": 115 }, { "epoch": 0.1444582814445828, "grad_norm": 28.174150466918945, "learning_rate": 3.5803237858032376e-06, "loss": 1.8252, "step": 116 }, { "epoch": 
0.14570361145703611, "grad_norm": 29.708969116210938, "learning_rate": 3.6114570361145706e-06, "loss": 1.6035, "step": 117 }, { "epoch": 0.14694894146948942, "grad_norm": 14.904471397399902, "learning_rate": 3.642590286425903e-06, "loss": 0.693, "step": 118 }, { "epoch": 0.14819427148194272, "grad_norm": 19.106191635131836, "learning_rate": 3.6737235367372358e-06, "loss": 0.7761, "step": 119 }, { "epoch": 0.149439601494396, "grad_norm": 21.0386905670166, "learning_rate": 3.7048567870485684e-06, "loss": 1.1099, "step": 120 }, { "epoch": 0.1506849315068493, "grad_norm": 11.261611938476562, "learning_rate": 3.7359900373599e-06, "loss": 0.3363, "step": 121 }, { "epoch": 0.1519302615193026, "grad_norm": 21.45566749572754, "learning_rate": 3.7671232876712327e-06, "loss": 1.1392, "step": 122 }, { "epoch": 0.15317559153175592, "grad_norm": 23.72317123413086, "learning_rate": 3.7982565379825653e-06, "loss": 1.2175, "step": 123 }, { "epoch": 0.15442092154420922, "grad_norm": 9.110578536987305, "learning_rate": 3.829389788293898e-06, "loss": 0.2401, "step": 124 }, { "epoch": 0.15566625155666253, "grad_norm": 10.689005851745605, "learning_rate": 3.860523038605231e-06, "loss": 0.2262, "step": 125 }, { "epoch": 0.1569115815691158, "grad_norm": 18.003347396850586, "learning_rate": 3.8916562889165635e-06, "loss": 0.8304, "step": 126 }, { "epoch": 0.1581569115815691, "grad_norm": 16.37116241455078, "learning_rate": 3.922789539227895e-06, "loss": 0.6732, "step": 127 }, { "epoch": 0.15940224159402241, "grad_norm": 20.549619674682617, "learning_rate": 3.953922789539228e-06, "loss": 0.7898, "step": 128 }, { "epoch": 0.16064757160647572, "grad_norm": 27.759565353393555, "learning_rate": 3.9850560398505605e-06, "loss": 1.6685, "step": 129 }, { "epoch": 0.16189290161892902, "grad_norm": 10.014034271240234, "learning_rate": 4.016189290161893e-06, "loss": 0.2059, "step": 130 }, { "epoch": 0.16313823163138233, "grad_norm": 18.375551223754883, "learning_rate": 4.047322540473226e-06, "loss": 
0.5604, "step": 131 }, { "epoch": 0.1643835616438356, "grad_norm": 23.120948791503906, "learning_rate": 4.078455790784558e-06, "loss": 1.2139, "step": 132 }, { "epoch": 0.1656288916562889, "grad_norm": 20.939762115478516, "learning_rate": 4.10958904109589e-06, "loss": 0.8262, "step": 133 }, { "epoch": 0.16687422166874222, "grad_norm": 39.98530578613281, "learning_rate": 4.140722291407223e-06, "loss": 1.2119, "step": 134 }, { "epoch": 0.16811955168119552, "grad_norm": 16.684823989868164, "learning_rate": 4.171855541718556e-06, "loss": 0.7434, "step": 135 }, { "epoch": 0.16936488169364883, "grad_norm": 8.765166282653809, "learning_rate": 4.202988792029889e-06, "loss": 0.1506, "step": 136 }, { "epoch": 0.1706102117061021, "grad_norm": 20.599409103393555, "learning_rate": 4.234122042341221e-06, "loss": 0.8276, "step": 137 }, { "epoch": 0.1718555417185554, "grad_norm": 27.572763442993164, "learning_rate": 4.265255292652553e-06, "loss": 1.0833, "step": 138 }, { "epoch": 0.17310087173100872, "grad_norm": 18.92407989501953, "learning_rate": 4.296388542963886e-06, "loss": 0.4558, "step": 139 }, { "epoch": 0.17434620174346202, "grad_norm": 17.19509506225586, "learning_rate": 4.327521793275218e-06, "loss": 0.2935, "step": 140 }, { "epoch": 0.17559153175591533, "grad_norm": 24.49059295654297, "learning_rate": 4.358655043586551e-06, "loss": 0.7617, "step": 141 }, { "epoch": 0.17683686176836863, "grad_norm": 10.664165496826172, "learning_rate": 4.389788293897883e-06, "loss": 0.2395, "step": 142 }, { "epoch": 0.1780821917808219, "grad_norm": 25.44748878479004, "learning_rate": 4.420921544209216e-06, "loss": 0.9827, "step": 143 }, { "epoch": 0.1793275217932752, "grad_norm": 15.069397926330566, "learning_rate": 4.452054794520548e-06, "loss": 0.631, "step": 144 }, { "epoch": 0.18057285180572852, "grad_norm": 18.701967239379883, "learning_rate": 4.48318804483188e-06, "loss": 0.8523, "step": 145 }, { "epoch": 0.18181818181818182, "grad_norm": 29.00722885131836, "learning_rate": 
4.514321295143213e-06, "loss": 1.2954, "step": 146 }, { "epoch": 0.18306351183063513, "grad_norm": 9.37511157989502, "learning_rate": 4.5454545454545455e-06, "loss": 0.2489, "step": 147 }, { "epoch": 0.1843088418430884, "grad_norm": 6.786942005157471, "learning_rate": 4.576587795765878e-06, "loss": 0.1326, "step": 148 }, { "epoch": 0.1855541718555417, "grad_norm": 28.655126571655273, "learning_rate": 4.607721046077211e-06, "loss": 0.9426, "step": 149 }, { "epoch": 0.18679950186799502, "grad_norm": 6.270091533660889, "learning_rate": 4.638854296388543e-06, "loss": 0.203, "step": 150 }, { "epoch": 0.18804483188044832, "grad_norm": 24.001052856445312, "learning_rate": 4.669987546699876e-06, "loss": 0.6611, "step": 151 }, { "epoch": 0.18929016189290163, "grad_norm": 5.734297275543213, "learning_rate": 4.7011207970112085e-06, "loss": 0.1378, "step": 152 }, { "epoch": 0.19053549190535493, "grad_norm": 10.421098709106445, "learning_rate": 4.732254047322541e-06, "loss": 0.1292, "step": 153 }, { "epoch": 0.1917808219178082, "grad_norm": 6.499827861785889, "learning_rate": 4.763387297633874e-06, "loss": 0.1825, "step": 154 }, { "epoch": 0.1930261519302615, "grad_norm": 7.8410563468933105, "learning_rate": 4.7945205479452054e-06, "loss": 0.2148, "step": 155 }, { "epoch": 0.19427148194271482, "grad_norm": 21.975595474243164, "learning_rate": 4.825653798256538e-06, "loss": 0.3541, "step": 156 }, { "epoch": 0.19551681195516812, "grad_norm": null, "learning_rate": 4.825653798256538e-06, "loss": 0.611, "step": 157 }, { "epoch": 0.19676214196762143, "grad_norm": 41.450469970703125, "learning_rate": 4.856787048567871e-06, "loss": 0.7124, "step": 158 }, { "epoch": 0.1980074719800747, "grad_norm": 11.570192337036133, "learning_rate": 4.887920298879203e-06, "loss": 0.2204, "step": 159 }, { "epoch": 0.199252801992528, "grad_norm": 9.37869930267334, "learning_rate": 4.919053549190536e-06, "loss": 0.2504, "step": 160 }, { "epoch": 0.20049813200498132, "grad_norm": 10.956586837768555, 
"learning_rate": 4.950186799501868e-06, "loss": 0.2246, "step": 161 }, { "epoch": 0.20174346201743462, "grad_norm": 6.231212139129639, "learning_rate": 4.9813200498132e-06, "loss": 0.1144, "step": 162 }, { "epoch": 0.20298879202988793, "grad_norm": 7.454379558563232, "learning_rate": 5.012453300124533e-06, "loss": 0.1583, "step": 163 }, { "epoch": 0.20423412204234123, "grad_norm": 4.702846050262451, "learning_rate": 5.043586550435865e-06, "loss": 0.0929, "step": 164 }, { "epoch": 0.2054794520547945, "grad_norm": 35.559165954589844, "learning_rate": 5.074719800747199e-06, "loss": 0.4275, "step": 165 }, { "epoch": 0.20672478206724781, "grad_norm": 2.42557430267334, "learning_rate": 5.105853051058531e-06, "loss": 0.0526, "step": 166 }, { "epoch": 0.20797011207970112, "grad_norm": 1.8609647750854492, "learning_rate": 5.136986301369863e-06, "loss": 0.0334, "step": 167 }, { "epoch": 0.20921544209215442, "grad_norm": 4.347940921783447, "learning_rate": 5.168119551681196e-06, "loss": 0.095, "step": 168 }, { "epoch": 0.21046077210460773, "grad_norm": 7.721733093261719, "learning_rate": 5.199252801992528e-06, "loss": 0.1641, "step": 169 }, { "epoch": 0.21170610211706103, "grad_norm": 42.037933349609375, "learning_rate": 5.230386052303861e-06, "loss": 0.4911, "step": 170 }, { "epoch": 0.2129514321295143, "grad_norm": 15.133713722229004, "learning_rate": 5.2615193026151935e-06, "loss": 0.1069, "step": 171 }, { "epoch": 0.21419676214196762, "grad_norm": 3.205000638961792, "learning_rate": 5.292652552926526e-06, "loss": 0.0497, "step": 172 }, { "epoch": 0.21544209215442092, "grad_norm": 1.0115067958831787, "learning_rate": 5.323785803237858e-06, "loss": 0.0211, "step": 173 }, { "epoch": 0.21668742216687423, "grad_norm": null, "learning_rate": 5.323785803237858e-06, "loss": 0.6843, "step": 174 }, { "epoch": 0.21793275217932753, "grad_norm": 2.7913990020751953, "learning_rate": 5.3549190535491905e-06, "loss": 0.03, "step": 175 }, { "epoch": 0.2191780821917808, "grad_norm": 
14.680956840515137, "learning_rate": 5.386052303860523e-06, "loss": 0.0976, "step": 176 }, { "epoch": 0.22042341220423411, "grad_norm": 5.276736736297607, "learning_rate": 5.417185554171856e-06, "loss": 0.0715, "step": 177 }, { "epoch": 0.22166874221668742, "grad_norm": 2.4684441089630127, "learning_rate": 5.448318804483188e-06, "loss": 0.0288, "step": 178 }, { "epoch": 0.22291407222914073, "grad_norm": 1.0922425985336304, "learning_rate": 5.479452054794521e-06, "loss": 0.0211, "step": 179 }, { "epoch": 0.22415940224159403, "grad_norm": 9.240842819213867, "learning_rate": 5.5105853051058535e-06, "loss": 0.0652, "step": 180 }, { "epoch": 0.22540473225404734, "grad_norm": 38.4419059753418, "learning_rate": 5.541718555417186e-06, "loss": 0.685, "step": 181 }, { "epoch": 0.2266500622665006, "grad_norm": 15.644163131713867, "learning_rate": 5.572851805728519e-06, "loss": 0.4103, "step": 182 }, { "epoch": 0.22789539227895392, "grad_norm": 2.4954333305358887, "learning_rate": 5.603985056039851e-06, "loss": 0.0449, "step": 183 }, { "epoch": 0.22914072229140722, "grad_norm": 18.7884521484375, "learning_rate": 5.635118306351184e-06, "loss": 0.3378, "step": 184 }, { "epoch": 0.23038605230386053, "grad_norm": null, "learning_rate": 5.635118306351184e-06, "loss": 0.8211, "step": 185 }, { "epoch": 0.23163138231631383, "grad_norm": 2.243523359298706, "learning_rate": 5.666251556662516e-06, "loss": 0.0479, "step": 186 }, { "epoch": 0.2328767123287671, "grad_norm": 3.3581135272979736, "learning_rate": 5.697384806973848e-06, "loss": 0.0505, "step": 187 }, { "epoch": 0.23412204234122042, "grad_norm": 1.6243762969970703, "learning_rate": 5.728518057285181e-06, "loss": 0.0287, "step": 188 }, { "epoch": 0.23536737235367372, "grad_norm": 55.31060791015625, "learning_rate": 5.759651307596513e-06, "loss": 0.2187, "step": 189 }, { "epoch": 0.23661270236612703, "grad_norm": 0.3759680986404419, "learning_rate": 5.790784557907846e-06, "loss": 0.0085, "step": 190 }, { "epoch": 
0.23785803237858033, "grad_norm": 10.535552978515625, "learning_rate": 5.821917808219178e-06, "loss": 0.1855, "step": 191 }, { "epoch": 0.23910336239103364, "grad_norm": 11.76515007019043, "learning_rate": 5.85305105853051e-06, "loss": 0.0808, "step": 192 }, { "epoch": 0.2403486924034869, "grad_norm": 16.85251808166504, "learning_rate": 5.884184308841843e-06, "loss": 0.2412, "step": 193 }, { "epoch": 0.24159402241594022, "grad_norm": 0.46440303325653076, "learning_rate": 5.9153175591531755e-06, "loss": 0.008, "step": 194 }, { "epoch": 0.24283935242839352, "grad_norm": 0.7289634943008423, "learning_rate": 5.946450809464509e-06, "loss": 0.013, "step": 195 }, { "epoch": 0.24408468244084683, "grad_norm": 11.138826370239258, "learning_rate": 5.9775840597758416e-06, "loss": 0.1779, "step": 196 }, { "epoch": 0.24533001245330013, "grad_norm": 1.223634123802185, "learning_rate": 6.008717310087173e-06, "loss": 0.0177, "step": 197 }, { "epoch": 0.2465753424657534, "grad_norm": 3.939805507659912, "learning_rate": 6.039850560398506e-06, "loss": 0.0818, "step": 198 }, { "epoch": 0.24782067247820672, "grad_norm": 137.29930114746094, "learning_rate": 6.0709838107098385e-06, "loss": 3.1221, "step": 199 }, { "epoch": 0.24906600249066002, "grad_norm": 3.8515782356262207, "learning_rate": 6.102117061021171e-06, "loss": 0.0835, "step": 200 }, { "epoch": 0.2503113325031133, "grad_norm": 1.5677456855773926, "learning_rate": 6.133250311332504e-06, "loss": 0.0312, "step": 201 }, { "epoch": 0.25155666251556663, "grad_norm": 1.6086269617080688, "learning_rate": 6.1643835616438354e-06, "loss": 0.0299, "step": 202 }, { "epoch": 0.25280199252801994, "grad_norm": 0.9720219969749451, "learning_rate": 6.195516811955168e-06, "loss": 0.0152, "step": 203 }, { "epoch": 0.25404732254047324, "grad_norm": 29.63043212890625, "learning_rate": 6.226650062266501e-06, "loss": 0.1063, "step": 204 }, { "epoch": 0.25529265255292655, "grad_norm": 0.7106034159660339, "learning_rate": 6.257783312577833e-06, "loss": 
0.0128, "step": 205 }, { "epoch": 0.25653798256537985, "grad_norm": 0.7417896389961243, "learning_rate": 6.288916562889166e-06, "loss": 0.0138, "step": 206 }, { "epoch": 0.2577833125778331, "grad_norm": 2.157313823699951, "learning_rate": 6.3200498132004984e-06, "loss": 0.0267, "step": 207 }, { "epoch": 0.2590286425902864, "grad_norm": 0.8388156294822693, "learning_rate": 6.351183063511831e-06, "loss": 0.0125, "step": 208 }, { "epoch": 0.2602739726027397, "grad_norm": 0.33427631855010986, "learning_rate": 6.382316313823164e-06, "loss": 0.0067, "step": 209 }, { "epoch": 0.261519302615193, "grad_norm": 0.7715888023376465, "learning_rate": 6.413449564134496e-06, "loss": 0.0112, "step": 210 }, { "epoch": 0.2627646326276463, "grad_norm": 0.23136259615421295, "learning_rate": 6.444582814445828e-06, "loss": 0.0052, "step": 211 }, { "epoch": 0.2640099626400996, "grad_norm": 149.45394897460938, "learning_rate": 6.4757160647571606e-06, "loss": 0.3285, "step": 212 }, { "epoch": 0.26525529265255293, "grad_norm": 2.4453482627868652, "learning_rate": 6.506849315068493e-06, "loss": 0.0472, "step": 213 }, { "epoch": 0.26650062266500624, "grad_norm": 2.4057695865631104, "learning_rate": 6.537982565379826e-06, "loss": 0.033, "step": 214 }, { "epoch": 0.26774595267745954, "grad_norm": 0.2910887598991394, "learning_rate": 6.569115815691158e-06, "loss": 0.0054, "step": 215 }, { "epoch": 0.26899128268991285, "grad_norm": 0.9707146286964417, "learning_rate": 6.600249066002491e-06, "loss": 0.0173, "step": 216 }, { "epoch": 0.27023661270236615, "grad_norm": 0.2008867859840393, "learning_rate": 6.6313823163138235e-06, "loss": 0.0038, "step": 217 }, { "epoch": 0.2714819427148194, "grad_norm": 1.5367100238800049, "learning_rate": 6.662515566625156e-06, "loss": 0.0185, "step": 218 }, { "epoch": 0.2727272727272727, "grad_norm": 0.5055931806564331, "learning_rate": 6.693648816936489e-06, "loss": 0.0066, "step": 219 }, { "epoch": 0.273972602739726, "grad_norm": 0.4430530071258545, 
"learning_rate": 6.724782067247821e-06, "loss": 0.0062, "step": 220 }, { "epoch": 0.2752179327521793, "grad_norm": 2.2975895404815674, "learning_rate": 6.755915317559154e-06, "loss": 0.01, "step": 221 }, { "epoch": 0.2764632627646326, "grad_norm": 0.8265185952186584, "learning_rate": 6.787048567870486e-06, "loss": 0.0136, "step": 222 }, { "epoch": 0.2777085927770859, "grad_norm": 168.16004943847656, "learning_rate": 6.818181818181818e-06, "loss": 2.9077, "step": 223 }, { "epoch": 0.27895392278953923, "grad_norm": 0.7623637318611145, "learning_rate": 6.849315068493151e-06, "loss": 0.0124, "step": 224 }, { "epoch": 0.28019925280199254, "grad_norm": 0.5590365529060364, "learning_rate": 6.8804483188044835e-06, "loss": 0.0115, "step": 225 }, { "epoch": 0.28144458281444584, "grad_norm": 0.36643216013908386, "learning_rate": 6.911581569115816e-06, "loss": 0.005, "step": 226 }, { "epoch": 0.28268991282689915, "grad_norm": 0.33054330945014954, "learning_rate": 6.942714819427149e-06, "loss": 0.0077, "step": 227 }, { "epoch": 0.28393524283935245, "grad_norm": 0.34179171919822693, "learning_rate": 6.973848069738481e-06, "loss": 0.0077, "step": 228 }, { "epoch": 0.2851805728518057, "grad_norm": 7.439018726348877, "learning_rate": 7.004981320049814e-06, "loss": 0.0183, "step": 229 }, { "epoch": 0.286425902864259, "grad_norm": 0.4672091603279114, "learning_rate": 7.0361145703611465e-06, "loss": 0.0088, "step": 230 }, { "epoch": 0.2876712328767123, "grad_norm": 43.73134994506836, "learning_rate": 7.067247820672479e-06, "loss": 0.0645, "step": 231 }, { "epoch": 0.2889165628891656, "grad_norm": 0.5883788466453552, "learning_rate": 7.098381070983812e-06, "loss": 0.0077, "step": 232 }, { "epoch": 0.2901618929016189, "grad_norm": 0.11801683157682419, "learning_rate": 7.1295143212951425e-06, "loss": 0.0025, "step": 233 }, { "epoch": 0.29140722291407223, "grad_norm": 0.4613223671913147, "learning_rate": 7.160647571606475e-06, "loss": 0.0061, "step": 234 }, { "epoch": 0.29265255292652553, 
"grad_norm": 0.46132174134254456, "learning_rate": 7.191780821917809e-06, "loss": 0.0054, "step": 235 }, { "epoch": 0.29389788293897884, "grad_norm": null, "learning_rate": 7.191780821917809e-06, "loss": 0.4395, "step": 236 }, { "epoch": 0.29514321295143214, "grad_norm": 0.17022739350795746, "learning_rate": 7.222914072229141e-06, "loss": 0.0041, "step": 237 }, { "epoch": 0.29638854296388545, "grad_norm": 0.10204841196537018, "learning_rate": 7.254047322540474e-06, "loss": 0.0025, "step": 238 }, { "epoch": 0.29763387297633875, "grad_norm": 0.21153950691223145, "learning_rate": 7.285180572851806e-06, "loss": 0.0037, "step": 239 }, { "epoch": 0.298879202988792, "grad_norm": 0.15493176877498627, "learning_rate": 7.316313823163139e-06, "loss": 0.003, "step": 240 }, { "epoch": 0.3001245330012453, "grad_norm": 0.24285216629505157, "learning_rate": 7.3474470734744716e-06, "loss": 0.0049, "step": 241 }, { "epoch": 0.3013698630136986, "grad_norm": 0.19606204330921173, "learning_rate": 7.378580323785804e-06, "loss": 0.0031, "step": 242 }, { "epoch": 0.3026151930261519, "grad_norm": 2.727463483810425, "learning_rate": 7.409713574097137e-06, "loss": 0.0078, "step": 243 }, { "epoch": 0.3038605230386052, "grad_norm": 0.1808951050043106, "learning_rate": 7.440846824408469e-06, "loss": 0.0039, "step": 244 }, { "epoch": 0.30510585305105853, "grad_norm": 0.24642078578472137, "learning_rate": 7.4719800747198e-06, "loss": 0.0047, "step": 245 }, { "epoch": 0.30635118306351183, "grad_norm": 0.10990118980407715, "learning_rate": 7.503113325031133e-06, "loss": 0.0021, "step": 246 }, { "epoch": 0.30759651307596514, "grad_norm": 0.08530181646347046, "learning_rate": 7.5342465753424655e-06, "loss": 0.0022, "step": 247 }, { "epoch": 0.30884184308841844, "grad_norm": 1.499770998954773, "learning_rate": 7.565379825653798e-06, "loss": 0.0047, "step": 248 }, { "epoch": 0.31008717310087175, "grad_norm": 0.08772747963666916, "learning_rate": 7.596513075965131e-06, "loss": 0.0026, "step": 249 }, 
{ "epoch": 0.31133250311332505, "grad_norm": 0.38723257184028625, "learning_rate": 7.627646326276463e-06, "loss": 0.0045, "step": 250 }, { "epoch": 0.3125778331257783, "grad_norm": 0.09018506854772568, "learning_rate": 7.658779576587797e-06, "loss": 0.002, "step": 251 }, { "epoch": 0.3138231631382316, "grad_norm": 3.251638650894165, "learning_rate": 7.689912826899128e-06, "loss": 0.0073, "step": 252 }, { "epoch": 0.3150684931506849, "grad_norm": 0.17742273211479187, "learning_rate": 7.721046077210462e-06, "loss": 0.0034, "step": 253 }, { "epoch": 0.3163138231631382, "grad_norm": 4.7799201011657715, "learning_rate": 7.752179327521794e-06, "loss": 0.0149, "step": 254 }, { "epoch": 0.3175591531755915, "grad_norm": 0.7822676301002502, "learning_rate": 7.783312577833127e-06, "loss": 0.0043, "step": 255 }, { "epoch": 0.31880448318804483, "grad_norm": 0.07635273039340973, "learning_rate": 7.814445828144457e-06, "loss": 0.0019, "step": 256 }, { "epoch": 0.32004981320049813, "grad_norm": 0.128676638007164, "learning_rate": 7.84557907845579e-06, "loss": 0.0031, "step": 257 }, { "epoch": 0.32129514321295144, "grad_norm": 0.35170984268188477, "learning_rate": 7.876712328767124e-06, "loss": 0.0034, "step": 258 }, { "epoch": 0.32254047322540474, "grad_norm": 0.17562495172023773, "learning_rate": 7.907845579078456e-06, "loss": 0.0036, "step": 259 }, { "epoch": 0.32378580323785805, "grad_norm": 0.4719379246234894, "learning_rate": 7.93897882938979e-06, "loss": 0.0052, "step": 260 }, { "epoch": 0.32503113325031135, "grad_norm": 1.012569546699524, "learning_rate": 7.970112079701121e-06, "loss": 0.0034, "step": 261 }, { "epoch": 0.32627646326276466, "grad_norm": 0.6060551404953003, "learning_rate": 8.001245330012454e-06, "loss": 0.0033, "step": 262 }, { "epoch": 0.3275217932752179, "grad_norm": 0.04582296311855316, "learning_rate": 8.032378580323786e-06, "loss": 0.0012, "step": 263 }, { "epoch": 0.3287671232876712, "grad_norm": 0.109385184943676, "learning_rate": 
8.06351183063512e-06, "loss": 0.0023, "step": 264 }, { "epoch": 0.3300124533001245, "grad_norm": 0.056446850299835205, "learning_rate": 8.094645080946451e-06, "loss": 0.0013, "step": 265 }, { "epoch": 0.3312577833125778, "grad_norm": 0.10354617983102798, "learning_rate": 8.125778331257785e-06, "loss": 0.002, "step": 266 }, { "epoch": 0.33250311332503113, "grad_norm": 0.14216098189353943, "learning_rate": 8.156911581569117e-06, "loss": 0.0029, "step": 267 }, { "epoch": 0.33374844333748444, "grad_norm": 0.07656246423721313, "learning_rate": 8.188044831880448e-06, "loss": 0.0018, "step": 268 }, { "epoch": 0.33499377334993774, "grad_norm": 0.2349928468465805, "learning_rate": 8.21917808219178e-06, "loss": 0.0034, "step": 269 }, { "epoch": 0.33623910336239105, "grad_norm": 0.1743057817220688, "learning_rate": 8.250311332503113e-06, "loss": 0.0041, "step": 270 }, { "epoch": 0.33748443337484435, "grad_norm": 0.05078033730387688, "learning_rate": 8.281444582814445e-06, "loss": 0.0015, "step": 271 }, { "epoch": 0.33872976338729766, "grad_norm": 0.12597429752349854, "learning_rate": 8.312577833125779e-06, "loss": 0.0032, "step": 272 }, { "epoch": 0.33997509339975096, "grad_norm": 0.09458588808774948, "learning_rate": 8.343711083437112e-06, "loss": 0.002, "step": 273 }, { "epoch": 0.3412204234122042, "grad_norm": 0.20183101296424866, "learning_rate": 8.374844333748444e-06, "loss": 0.0043, "step": 274 }, { "epoch": 0.3424657534246575, "grad_norm": 0.16585314273834229, "learning_rate": 8.405977584059777e-06, "loss": 0.0026, "step": 275 }, { "epoch": 0.3437110834371108, "grad_norm": 0.05950070172548294, "learning_rate": 8.437110834371109e-06, "loss": 0.0018, "step": 276 }, { "epoch": 0.3449564134495641, "grad_norm": 0.062412526458501816, "learning_rate": 8.468244084682442e-06, "loss": 0.0017, "step": 277 }, { "epoch": 0.34620174346201743, "grad_norm": 297.8834533691406, "learning_rate": 8.499377334993774e-06, "loss": 2.7641, "step": 278 }, { "epoch": 0.34744707347447074, 
"grad_norm": 0.18788257241249084, "learning_rate": 8.530510585305106e-06, "loss": 0.0031, "step": 279 }, { "epoch": 0.34869240348692404, "grad_norm": 0.05538473278284073, "learning_rate": 8.561643835616438e-06, "loss": 0.0014, "step": 280 }, { "epoch": 0.34993773349937735, "grad_norm": 0.05929434299468994, "learning_rate": 8.592777085927771e-06, "loss": 0.0015, "step": 281 }, { "epoch": 0.35118306351183065, "grad_norm": 0.15558889508247375, "learning_rate": 8.623910336239103e-06, "loss": 0.0032, "step": 282 }, { "epoch": 0.35242839352428396, "grad_norm": 0.0714510902762413, "learning_rate": 8.655043586550436e-06, "loss": 0.002, "step": 283 }, { "epoch": 0.35367372353673726, "grad_norm": 2.3466129302978516, "learning_rate": 8.686176836861768e-06, "loss": 0.0066, "step": 284 }, { "epoch": 0.3549190535491905, "grad_norm": 17.250829696655273, "learning_rate": 8.717310087173102e-06, "loss": 0.0224, "step": 285 }, { "epoch": 0.3561643835616438, "grad_norm": 0.03599457070231438, "learning_rate": 8.748443337484433e-06, "loss": 0.0011, "step": 286 }, { "epoch": 0.3574097135740971, "grad_norm": 0.05941268801689148, "learning_rate": 8.779576587795767e-06, "loss": 0.0019, "step": 287 }, { "epoch": 0.3586550435865504, "grad_norm": 1.2639917135238647, "learning_rate": 8.810709838107099e-06, "loss": 0.0044, "step": 288 }, { "epoch": 0.35990037359900373, "grad_norm": 0.04103681072592735, "learning_rate": 8.841843088418432e-06, "loss": 0.001, "step": 289 }, { "epoch": 0.36114570361145704, "grad_norm": 0.03893645480275154, "learning_rate": 8.872976338729764e-06, "loss": 0.001, "step": 290 }, { "epoch": 0.36239103362391034, "grad_norm": 0.038509551435709, "learning_rate": 8.904109589041095e-06, "loss": 0.0009, "step": 291 }, { "epoch": 0.36363636363636365, "grad_norm": 0.03188912197947502, "learning_rate": 8.935242839352429e-06, "loss": 0.001, "step": 292 }, { "epoch": 0.36488169364881695, "grad_norm": 0.048545584082603455, "learning_rate": 8.96637608966376e-06, "loss": 0.0011, 
"step": 293 }, { "epoch": 0.36612702366127026, "grad_norm": 0.0602889247238636, "learning_rate": 8.997509339975094e-06, "loss": 0.0015, "step": 294 }, { "epoch": 0.36737235367372356, "grad_norm": 0.05375710129737854, "learning_rate": 9.028642590286426e-06, "loss": 0.0016, "step": 295 }, { "epoch": 0.3686176836861768, "grad_norm": 0.043809376657009125, "learning_rate": 9.05977584059776e-06, "loss": 0.0012, "step": 296 }, { "epoch": 0.3698630136986301, "grad_norm": 0.0780409425497055, "learning_rate": 9.090909090909091e-06, "loss": 0.0022, "step": 297 }, { "epoch": 0.3711083437110834, "grad_norm": 0.06276142597198486, "learning_rate": 9.122042341220424e-06, "loss": 0.0017, "step": 298 }, { "epoch": 0.3723536737235367, "grad_norm": 0.060071829706430435, "learning_rate": 9.153175591531756e-06, "loss": 0.0014, "step": 299 }, { "epoch": 0.37359900373599003, "grad_norm": 0.032719388604164124, "learning_rate": 9.18430884184309e-06, "loss": 0.0007, "step": 300 }, { "epoch": 0.37484433374844334, "grad_norm": 0.034909844398498535, "learning_rate": 9.215442092154421e-06, "loss": 0.001, "step": 301 }, { "epoch": 0.37608966376089664, "grad_norm": 0.034523140639066696, "learning_rate": 9.246575342465753e-06, "loss": 0.0011, "step": 302 }, { "epoch": 0.37733499377334995, "grad_norm": 0.05015862360596657, "learning_rate": 9.277708592777087e-06, "loss": 0.0013, "step": 303 }, { "epoch": 0.37858032378580325, "grad_norm": 0.05602340027689934, "learning_rate": 9.308841843088418e-06, "loss": 0.0016, "step": 304 }, { "epoch": 0.37982565379825656, "grad_norm": 0.04742440581321716, "learning_rate": 9.339975093399752e-06, "loss": 0.0014, "step": 305 }, { "epoch": 0.38107098381070986, "grad_norm": 0.03035055100917816, "learning_rate": 9.371108343711084e-06, "loss": 0.0009, "step": 306 }, { "epoch": 0.3823163138231631, "grad_norm": 241.25111389160156, "learning_rate": 9.402241594022417e-06, "loss": 0.1876, "step": 307 }, { "epoch": 0.3835616438356164, "grad_norm": 0.03797473758459091, 
"learning_rate": 9.433374844333749e-06, "loss": 0.001, "step": 308 }, { "epoch": 0.3848069738480697, "grad_norm": 0.03934524580836296, "learning_rate": 9.464508094645082e-06, "loss": 0.001, "step": 309 }, { "epoch": 0.386052303860523, "grad_norm": 0.04892684891819954, "learning_rate": 9.495641344956414e-06, "loss": 0.0013, "step": 310 }, { "epoch": 0.38729763387297633, "grad_norm": 0.06903809309005737, "learning_rate": 9.526774595267747e-06, "loss": 0.0018, "step": 311 }, { "epoch": 0.38854296388542964, "grad_norm": 0.17654924094676971, "learning_rate": 9.557907845579077e-06, "loss": 0.0018, "step": 312 }, { "epoch": 0.38978829389788294, "grad_norm": 0.047983210533857346, "learning_rate": 9.589041095890411e-06, "loss": 0.001, "step": 313 }, { "epoch": 0.39103362391033625, "grad_norm": 0.0729343593120575, "learning_rate": 9.620174346201744e-06, "loss": 0.0018, "step": 314 }, { "epoch": 0.39227895392278955, "grad_norm": 0.025607705116271973, "learning_rate": 9.651307596513076e-06, "loss": 0.0007, "step": 315 }, { "epoch": 0.39352428393524286, "grad_norm": 0.0369686633348465, "learning_rate": 9.68244084682441e-06, "loss": 0.001, "step": 316 }, { "epoch": 0.39476961394769616, "grad_norm": 0.03150925785303116, "learning_rate": 9.713574097135741e-06, "loss": 0.001, "step": 317 }, { "epoch": 0.3960149439601494, "grad_norm": 537.4097900390625, "learning_rate": 9.744707347447075e-06, "loss": 0.9077, "step": 318 }, { "epoch": 0.3972602739726027, "grad_norm": 0.036139559000730515, "learning_rate": 9.775840597758406e-06, "loss": 0.0011, "step": 319 }, { "epoch": 0.398505603985056, "grad_norm": 0.10030055046081543, "learning_rate": 9.80697384806974e-06, "loss": 0.0019, "step": 320 }, { "epoch": 0.39975093399750933, "grad_norm": 0.20713728666305542, "learning_rate": 9.838107098381072e-06, "loss": 0.0013, "step": 321 }, { "epoch": 0.40099626400996263, "grad_norm": 0.21006031334400177, "learning_rate": 9.869240348692405e-06, "loss": 0.0021, "step": 322 }, { "epoch": 
0.40224159402241594, "grad_norm": 409.08544921875, "learning_rate": 9.900373599003735e-06, "loss": 1.8641, "step": 323 }, { "epoch": 0.40348692403486924, "grad_norm": 0.04977629333734512, "learning_rate": 9.931506849315069e-06, "loss": 0.0012, "step": 324 }, { "epoch": 0.40473225404732255, "grad_norm": 0.06899397075176239, "learning_rate": 9.9626400996264e-06, "loss": 0.0011, "step": 325 }, { "epoch": 0.40597758405977585, "grad_norm": 0.3704112470149994, "learning_rate": 9.993773349937734e-06, "loss": 0.0014, "step": 326 }, { "epoch": 0.40722291407222916, "grad_norm": 0.03436332195997238, "learning_rate": 1.0024906600249066e-05, "loss": 0.0011, "step": 327 }, { "epoch": 0.40846824408468246, "grad_norm": 0.03816661238670349, "learning_rate": 1.0056039850560399e-05, "loss": 0.0009, "step": 328 }, { "epoch": 0.40971357409713577, "grad_norm": 0.053675808012485504, "learning_rate": 1.008717310087173e-05, "loss": 0.0014, "step": 329 }, { "epoch": 0.410958904109589, "grad_norm": 0.024651149287819862, "learning_rate": 1.0118306351183064e-05, "loss": 0.0007, "step": 330 }, { "epoch": 0.4122042341220423, "grad_norm": 0.03284426033496857, "learning_rate": 1.0149439601494398e-05, "loss": 0.001, "step": 331 }, { "epoch": 0.41344956413449563, "grad_norm": 0.03643254190683365, "learning_rate": 1.018057285180573e-05, "loss": 0.0011, "step": 332 }, { "epoch": 0.41469489414694893, "grad_norm": 0.02989336848258972, "learning_rate": 1.0211706102117063e-05, "loss": 0.0008, "step": 333 }, { "epoch": 0.41594022415940224, "grad_norm": 0.020424343645572662, "learning_rate": 1.0242839352428395e-05, "loss": 0.0007, "step": 334 }, { "epoch": 0.41718555417185554, "grad_norm": 0.03185396268963814, "learning_rate": 1.0273972602739726e-05, "loss": 0.0009, "step": 335 }, { "epoch": 0.41843088418430885, "grad_norm": 0.022784588858485222, "learning_rate": 1.0305105853051058e-05, "loss": 0.0006, "step": 336 }, { "epoch": 0.41967621419676215, "grad_norm": 0.1662231832742691, "learning_rate": 
1.0336239103362392e-05, "loss": 0.0018, "step": 337 }, { "epoch": 0.42092154420921546, "grad_norm": 0.05111798271536827, "learning_rate": 1.0367372353673723e-05, "loss": 0.0014, "step": 338 }, { "epoch": 0.42216687422166876, "grad_norm": 0.024023687466979027, "learning_rate": 1.0398505603985057e-05, "loss": 0.0007, "step": 339 }, { "epoch": 0.42341220423412207, "grad_norm": 0.07146386057138443, "learning_rate": 1.0429638854296388e-05, "loss": 0.0019, "step": 340 }, { "epoch": 0.4246575342465753, "grad_norm": 0.01847468502819538, "learning_rate": 1.0460772104607722e-05, "loss": 0.0006, "step": 341 }, { "epoch": 0.4259028642590286, "grad_norm": 0.11909367889165878, "learning_rate": 1.0491905354919054e-05, "loss": 0.0009, "step": 342 }, { "epoch": 0.42714819427148193, "grad_norm": 0.07260438799858093, "learning_rate": 1.0523038605230387e-05, "loss": 0.002, "step": 343 }, { "epoch": 0.42839352428393523, "grad_norm": 113.6898193359375, "learning_rate": 1.0554171855541719e-05, "loss": 0.0637, "step": 344 }, { "epoch": 0.42963885429638854, "grad_norm": 0.018576975911855698, "learning_rate": 1.0585305105853052e-05, "loss": 0.0006, "step": 345 }, { "epoch": 0.43088418430884184, "grad_norm": 0.03654215857386589, "learning_rate": 1.0616438356164384e-05, "loss": 0.0007, "step": 346 }, { "epoch": 0.43212951432129515, "grad_norm": 0.025475049391388893, "learning_rate": 1.0647571606475716e-05, "loss": 0.0007, "step": 347 }, { "epoch": 0.43337484433374845, "grad_norm": 0.02617563307285309, "learning_rate": 1.067870485678705e-05, "loss": 0.0008, "step": 348 }, { "epoch": 0.43462017434620176, "grad_norm": 0.07997260987758636, "learning_rate": 1.0709838107098381e-05, "loss": 0.0016, "step": 349 }, { "epoch": 0.43586550435865506, "grad_norm": 0.020727328956127167, "learning_rate": 1.0740971357409714e-05, "loss": 0.0007, "step": 350 }, { "epoch": 0.43711083437110837, "grad_norm": 0.02753385342657566, "learning_rate": 1.0772104607721046e-05, "loss": 0.0007, "step": 351 }, { "epoch": 
0.4383561643835616, "grad_norm": 0.04742880165576935, "learning_rate": 1.080323785803238e-05, "loss": 0.0009, "step": 352 }, { "epoch": 0.4396014943960149, "grad_norm": 0.03920525684952736, "learning_rate": 1.0834371108343711e-05, "loss": 0.0011, "step": 353 }, { "epoch": 0.44084682440846823, "grad_norm": 0.04735913872718811, "learning_rate": 1.0865504358655045e-05, "loss": 0.0012, "step": 354 }, { "epoch": 0.44209215442092153, "grad_norm": 0.028404802083969116, "learning_rate": 1.0896637608966377e-05, "loss": 0.0009, "step": 355 }, { "epoch": 0.44333748443337484, "grad_norm": 0.02533857710659504, "learning_rate": 1.092777085927771e-05, "loss": 0.0006, "step": 356 }, { "epoch": 0.44458281444582815, "grad_norm": 0.04108303785324097, "learning_rate": 1.0958904109589042e-05, "loss": 0.0013, "step": 357 }, { "epoch": 0.44582814445828145, "grad_norm": 0.03464365378022194, "learning_rate": 1.0990037359900373e-05, "loss": 0.0009, "step": 358 }, { "epoch": 0.44707347447073476, "grad_norm": 0.030825745314359665, "learning_rate": 1.1021170610211707e-05, "loss": 0.0008, "step": 359 }, { "epoch": 0.44831880448318806, "grad_norm": 0.04480734467506409, "learning_rate": 1.1052303860523039e-05, "loss": 0.0012, "step": 360 }, { "epoch": 0.44956413449564137, "grad_norm": 0.02541348710656166, "learning_rate": 1.1083437110834372e-05, "loss": 0.0008, "step": 361 }, { "epoch": 0.45080946450809467, "grad_norm": 0.02149001508951187, "learning_rate": 1.1114570361145704e-05, "loss": 0.0006, "step": 362 }, { "epoch": 0.4520547945205479, "grad_norm": 0.05121343955397606, "learning_rate": 1.1145703611457037e-05, "loss": 0.0015, "step": 363 }, { "epoch": 0.4533001245330012, "grad_norm": 0.022881271317601204, "learning_rate": 1.1176836861768369e-05, "loss": 0.0007, "step": 364 }, { "epoch": 0.45454545454545453, "grad_norm": 0.029813582077622414, "learning_rate": 1.1207970112079703e-05, "loss": 0.0007, "step": 365 }, { "epoch": 0.45579078455790784, "grad_norm": 0.0214352048933506, 
"learning_rate": 1.1239103362391034e-05, "loss": 0.0007, "step": 366 }, { "epoch": 0.45703611457036114, "grad_norm": 0.04457417130470276, "learning_rate": 1.1270236612702368e-05, "loss": 0.0008, "step": 367 }, { "epoch": 0.45828144458281445, "grad_norm": 0.019106173887848854, "learning_rate": 1.1301369863013698e-05, "loss": 0.0006, "step": 368 }, { "epoch": 0.45952677459526775, "grad_norm": 0.022846408188343048, "learning_rate": 1.1332503113325031e-05, "loss": 0.0006, "step": 369 }, { "epoch": 0.46077210460772106, "grad_norm": 0.018946994096040726, "learning_rate": 1.1363636363636365e-05, "loss": 0.0006, "step": 370 }, { "epoch": 0.46201743462017436, "grad_norm": 0.021404925733804703, "learning_rate": 1.1394769613947696e-05, "loss": 0.0006, "step": 371 }, { "epoch": 0.46326276463262767, "grad_norm": 0.01195521280169487, "learning_rate": 1.142590286425903e-05, "loss": 0.0004, "step": 372 }, { "epoch": 0.46450809464508097, "grad_norm": 0.03864084184169769, "learning_rate": 1.1457036114570362e-05, "loss": 0.001, "step": 373 }, { "epoch": 0.4657534246575342, "grad_norm": 0.058303095400333405, "learning_rate": 1.1488169364881695e-05, "loss": 0.0012, "step": 374 }, { "epoch": 0.4669987546699875, "grad_norm": 0.013412773609161377, "learning_rate": 1.1519302615193027e-05, "loss": 0.0004, "step": 375 }, { "epoch": 0.46824408468244083, "grad_norm": 0.02416684851050377, "learning_rate": 1.155043586550436e-05, "loss": 0.0007, "step": 376 }, { "epoch": 0.46948941469489414, "grad_norm": 0.016587672755122185, "learning_rate": 1.1581569115815692e-05, "loss": 0.0005, "step": 377 }, { "epoch": 0.47073474470734744, "grad_norm": 0.020129237323999405, "learning_rate": 1.1612702366127025e-05, "loss": 0.0006, "step": 378 }, { "epoch": 0.47198007471980075, "grad_norm": 0.2290887087583542, "learning_rate": 1.1643835616438355e-05, "loss": 0.0013, "step": 379 }, { "epoch": 0.47322540473225405, "grad_norm": 0.0186260174959898, "learning_rate": 1.1674968866749689e-05, "loss": 0.0006, "step": 
380 }, { "epoch": 0.47447073474470736, "grad_norm": 0.03915928676724434, "learning_rate": 1.170610211706102e-05, "loss": 0.0009, "step": 381 }, { "epoch": 0.47571606475716066, "grad_norm": 0.024174867197871208, "learning_rate": 1.1737235367372354e-05, "loss": 0.0006, "step": 382 }, { "epoch": 0.47696139476961397, "grad_norm": 0.06258780509233475, "learning_rate": 1.1768368617683686e-05, "loss": 0.0012, "step": 383 }, { "epoch": 0.47820672478206727, "grad_norm": 0.0187270175665617, "learning_rate": 1.179950186799502e-05, "loss": 0.0006, "step": 384 }, { "epoch": 0.4794520547945205, "grad_norm": 0.036254920065402985, "learning_rate": 1.1830635118306351e-05, "loss": 0.0011, "step": 385 }, { "epoch": 0.4806973848069738, "grad_norm": 0.04100683704018593, "learning_rate": 1.1861768368617684e-05, "loss": 0.0008, "step": 386 }, { "epoch": 0.48194271481942713, "grad_norm": 0.023180831223726273, "learning_rate": 1.1892901618929018e-05, "loss": 0.0007, "step": 387 }, { "epoch": 0.48318804483188044, "grad_norm": 36.136348724365234, "learning_rate": 1.192403486924035e-05, "loss": 4.5358, "step": 388 }, { "epoch": 0.48443337484433374, "grad_norm": 0.06236216425895691, "learning_rate": 1.1955168119551683e-05, "loss": 0.0013, "step": 389 }, { "epoch": 0.48567870485678705, "grad_norm": 0.11113505810499191, "learning_rate": 1.1986301369863013e-05, "loss": 0.0014, "step": 390 }, { "epoch": 0.48692403486924035, "grad_norm": 0.028809353709220886, "learning_rate": 1.2017434620174347e-05, "loss": 0.0006, "step": 391 }, { "epoch": 0.48816936488169366, "grad_norm": 0.04308629035949707, "learning_rate": 1.2048567870485678e-05, "loss": 0.001, "step": 392 }, { "epoch": 0.48941469489414696, "grad_norm": 0.03488301858305931, "learning_rate": 1.2079701120797012e-05, "loss": 0.001, "step": 393 }, { "epoch": 0.49066002490660027, "grad_norm": 0.03795866668224335, "learning_rate": 1.2110834371108344e-05, "loss": 0.0009, "step": 394 }, { "epoch": 0.4919053549190536, "grad_norm": 179.07867431640625, 
"learning_rate": 1.2141967621419677e-05, "loss": 0.306, "step": 395 }, { "epoch": 0.4931506849315068, "grad_norm": 0.07366206496953964, "learning_rate": 1.2173100871731009e-05, "loss": 0.0016, "step": 396 }, { "epoch": 0.4943960149439601, "grad_norm": 0.1270761936903, "learning_rate": 1.2204234122042342e-05, "loss": 0.0023, "step": 397 }, { "epoch": 0.49564134495641343, "grad_norm": 0.1619614213705063, "learning_rate": 1.2235367372353674e-05, "loss": 0.0025, "step": 398 }, { "epoch": 0.49688667496886674, "grad_norm": 0.027039946988224983, "learning_rate": 1.2266500622665007e-05, "loss": 0.0005, "step": 399 }, { "epoch": 0.49813200498132004, "grad_norm": 0.012688295915722847, "learning_rate": 1.2297633872976339e-05, "loss": 0.0003, "step": 400 }, { "epoch": 0.49937733499377335, "grad_norm": 0.04193650931119919, "learning_rate": 1.2328767123287671e-05, "loss": 0.001, "step": 401 }, { "epoch": 0.5006226650062267, "grad_norm": 0.2457994669675827, "learning_rate": 1.2359900373599004e-05, "loss": 0.0033, "step": 402 }, { "epoch": 0.50186799501868, "grad_norm": 0.07151038944721222, "learning_rate": 1.2391033623910336e-05, "loss": 0.0012, "step": 403 }, { "epoch": 0.5031133250311333, "grad_norm": 0.03706686571240425, "learning_rate": 1.242216687422167e-05, "loss": 0.001, "step": 404 }, { "epoch": 0.5043586550435866, "grad_norm": 0.03082493133842945, "learning_rate": 1.2453300124533001e-05, "loss": 0.0008, "step": 405 }, { "epoch": 0.5056039850560399, "grad_norm": 0.02312391996383667, "learning_rate": 1.2484433374844335e-05, "loss": 0.0007, "step": 406 }, { "epoch": 0.5068493150684932, "grad_norm": 43.44374084472656, "learning_rate": 1.2515566625155666e-05, "loss": 4.0239, "step": 407 }, { "epoch": 0.5080946450809465, "grad_norm": 0.04549500346183777, "learning_rate": 1.2546699875467e-05, "loss": 0.0011, "step": 408 }, { "epoch": 0.5093399750933998, "grad_norm": 0.44390103220939636, "learning_rate": 1.2577833125778332e-05, "loss": 0.0017, "step": 409 }, { "epoch": 
0.5105853051058531, "grad_norm": 0.017668342217803, "learning_rate": 1.2608966376089665e-05, "loss": 0.0004, "step": 410 }, { "epoch": 0.5118306351183064, "grad_norm": 0.02797042578458786, "learning_rate": 1.2640099626400997e-05, "loss": 0.0005, "step": 411 }, { "epoch": 0.5130759651307597, "grad_norm": 0.05557764694094658, "learning_rate": 1.267123287671233e-05, "loss": 0.0011, "step": 412 }, { "epoch": 0.5143212951432129, "grad_norm": 0.028871331363916397, "learning_rate": 1.2702366127023662e-05, "loss": 0.0007, "step": 413 }, { "epoch": 0.5155666251556662, "grad_norm": 0.04884202778339386, "learning_rate": 1.2733499377334995e-05, "loss": 0.001, "step": 414 }, { "epoch": 0.5168119551681195, "grad_norm": 0.014481289312243462, "learning_rate": 1.2764632627646327e-05, "loss": 0.0004, "step": 415 }, { "epoch": 0.5180572851805728, "grad_norm": 0.08000053465366364, "learning_rate": 1.279576587795766e-05, "loss": 0.0015, "step": 416 }, { "epoch": 0.5193026151930261, "grad_norm": 0.036073487251996994, "learning_rate": 1.2826899128268992e-05, "loss": 0.0007, "step": 417 }, { "epoch": 0.5205479452054794, "grad_norm": 0.08941499143838882, "learning_rate": 1.2858032378580322e-05, "loss": 0.0015, "step": 418 }, { "epoch": 0.5217932752179327, "grad_norm": 0.06853260844945908, "learning_rate": 1.2889165628891656e-05, "loss": 0.0013, "step": 419 }, { "epoch": 0.523038605230386, "grad_norm": 0.026791630312800407, "learning_rate": 1.2920298879202988e-05, "loss": 0.0007, "step": 420 }, { "epoch": 0.5242839352428393, "grad_norm": 0.3121366500854492, "learning_rate": 1.2951432129514321e-05, "loss": 0.0039, "step": 421 }, { "epoch": 0.5255292652552926, "grad_norm": 0.02174542099237442, "learning_rate": 1.2982565379825653e-05, "loss": 0.0006, "step": 422 }, { "epoch": 0.526774595267746, "grad_norm": 0.053185317665338516, "learning_rate": 1.3013698630136986e-05, "loss": 0.0011, "step": 423 }, { "epoch": 0.5280199252801993, "grad_norm": 0.033572856336832047, "learning_rate": 
1.3044831880448318e-05, "loss": 0.0009, "step": 424 }, { "epoch": 0.5292652552926526, "grad_norm": 0.0287881251424551, "learning_rate": 1.3075965130759652e-05, "loss": 0.0008, "step": 425 }, { "epoch": 0.5305105853051059, "grad_norm": 0.029981469735503197, "learning_rate": 1.3107098381070983e-05, "loss": 0.0006, "step": 426 }, { "epoch": 0.5317559153175592, "grad_norm": 0.028788315132260323, "learning_rate": 1.3138231631382317e-05, "loss": 0.0005, "step": 427 }, { "epoch": 0.5330012453300125, "grad_norm": 0.021008843556046486, "learning_rate": 1.316936488169365e-05, "loss": 0.0005, "step": 428 }, { "epoch": 0.5342465753424658, "grad_norm": 0.04118547961115837, "learning_rate": 1.3200498132004982e-05, "loss": 0.001, "step": 429 }, { "epoch": 0.5354919053549191, "grad_norm": 0.012453455477952957, "learning_rate": 1.3231631382316315e-05, "loss": 0.0003, "step": 430 }, { "epoch": 0.5367372353673724, "grad_norm": 0.06938812136650085, "learning_rate": 1.3262764632627647e-05, "loss": 0.0011, "step": 431 }, { "epoch": 0.5379825653798257, "grad_norm": 0.017569739371538162, "learning_rate": 1.329389788293898e-05, "loss": 0.0005, "step": 432 }, { "epoch": 0.539227895392279, "grad_norm": 0.026109851896762848, "learning_rate": 1.3325031133250312e-05, "loss": 0.0006, "step": 433 }, { "epoch": 0.5404732254047323, "grad_norm": 0.015702908858656883, "learning_rate": 1.3356164383561646e-05, "loss": 0.0004, "step": 434 }, { "epoch": 0.5417185554171855, "grad_norm": 0.025982121005654335, "learning_rate": 1.3387297633872977e-05, "loss": 0.0007, "step": 435 }, { "epoch": 0.5429638854296388, "grad_norm": 0.06682372093200684, "learning_rate": 1.3418430884184311e-05, "loss": 0.0013, "step": 436 }, { "epoch": 0.5442092154420921, "grad_norm": 0.016124481335282326, "learning_rate": 1.3449564134495643e-05, "loss": 0.0005, "step": 437 }, { "epoch": 0.5454545454545454, "grad_norm": 0.018914785236120224, "learning_rate": 1.3480697384806976e-05, "loss": 0.0005, "step": 438 }, { "epoch": 
0.5466998754669987, "grad_norm": 0.01492242980748415, "learning_rate": 1.3511830635118308e-05, "loss": 0.0004, "step": 439 }, { "epoch": 0.547945205479452, "grad_norm": 0.06164323166012764, "learning_rate": 1.3542963885429638e-05, "loss": 0.0011, "step": 440 }, { "epoch": 0.5491905354919053, "grad_norm": 0.07254376262426376, "learning_rate": 1.3574097135740971e-05, "loss": 0.0015, "step": 441 }, { "epoch": 0.5504358655043586, "grad_norm": 0.09924010187387466, "learning_rate": 1.3605230386052303e-05, "loss": 0.0019, "step": 442 }, { "epoch": 0.5516811955168119, "grad_norm": 0.01098677609115839, "learning_rate": 1.3636363636363637e-05, "loss": 0.0003, "step": 443 }, { "epoch": 0.5529265255292652, "grad_norm": 0.030665650963783264, "learning_rate": 1.3667496886674968e-05, "loss": 0.001, "step": 444 }, { "epoch": 0.5541718555417185, "grad_norm": 0.04467572271823883, "learning_rate": 1.3698630136986302e-05, "loss": 0.001, "step": 445 }, { "epoch": 0.5554171855541719, "grad_norm": 0.01499516423791647, "learning_rate": 1.3729763387297633e-05, "loss": 0.0004, "step": 446 }, { "epoch": 0.5566625155666252, "grad_norm": 0.01595112681388855, "learning_rate": 1.3760896637608967e-05, "loss": 0.0005, "step": 447 }, { "epoch": 0.5579078455790785, "grad_norm": 0.02192739024758339, "learning_rate": 1.3792029887920299e-05, "loss": 0.0006, "step": 448 }, { "epoch": 0.5591531755915318, "grad_norm": 0.0317448228597641, "learning_rate": 1.3823163138231632e-05, "loss": 0.0006, "step": 449 }, { "epoch": 0.5603985056039851, "grad_norm": 0.01051297876983881, "learning_rate": 1.3854296388542964e-05, "loss": 0.0003, "step": 450 }, { "epoch": 0.5616438356164384, "grad_norm": 0.014249038882553577, "learning_rate": 1.3885429638854297e-05, "loss": 0.0004, "step": 451 }, { "epoch": 0.5628891656288917, "grad_norm": 0.026663757860660553, "learning_rate": 1.3916562889165629e-05, "loss": 0.0007, "step": 452 }, { "epoch": 0.564134495641345, "grad_norm": 0.018503081053495407, "learning_rate": 
1.3947696139476963e-05, "loss": 0.0005, "step": 453 }, { "epoch": 0.5653798256537983, "grad_norm": 0.013995744287967682, "learning_rate": 1.3978829389788294e-05, "loss": 0.0004, "step": 454 }, { "epoch": 0.5666251556662516, "grad_norm": 0.06841859221458435, "learning_rate": 1.4009962640099628e-05, "loss": 0.0012, "step": 455 }, { "epoch": 0.5678704856787049, "grad_norm": 0.052551478147506714, "learning_rate": 1.404109589041096e-05, "loss": 0.0009, "step": 456 }, { "epoch": 0.5691158156911582, "grad_norm": 0.01047549955546856, "learning_rate": 1.4072229140722293e-05, "loss": 0.0004, "step": 457 }, { "epoch": 0.5703611457036114, "grad_norm": 0.01352018117904663, "learning_rate": 1.4103362391033625e-05, "loss": 0.0004, "step": 458 }, { "epoch": 0.5716064757160647, "grad_norm": 0.023181084543466568, "learning_rate": 1.4134495641344958e-05, "loss": 0.0006, "step": 459 }, { "epoch": 0.572851805728518, "grad_norm": 0.01287688035517931, "learning_rate": 1.4165628891656292e-05, "loss": 0.0004, "step": 460 }, { "epoch": 0.5740971357409713, "grad_norm": 0.013366766273975372, "learning_rate": 1.4196762141967623e-05, "loss": 0.0004, "step": 461 }, { "epoch": 0.5753424657534246, "grad_norm": 0.01742659881711006, "learning_rate": 1.4227895392278957e-05, "loss": 0.0005, "step": 462 }, { "epoch": 0.5765877957658779, "grad_norm": 0.018992751836776733, "learning_rate": 1.4259028642590285e-05, "loss": 0.0004, "step": 463 }, { "epoch": 0.5778331257783312, "grad_norm": 0.013830466195940971, "learning_rate": 1.4290161892901619e-05, "loss": 0.0005, "step": 464 }, { "epoch": 0.5790784557907845, "grad_norm": 0.2647791802883148, "learning_rate": 1.432129514321295e-05, "loss": 0.0015, "step": 465 }, { "epoch": 0.5803237858032378, "grad_norm": 0.05277368426322937, "learning_rate": 1.4352428393524284e-05, "loss": 0.0014, "step": 466 }, { "epoch": 0.5815691158156912, "grad_norm": 0.04205463454127312, "learning_rate": 1.4383561643835617e-05, "loss": 0.0011, "step": 467 }, { "epoch": 
0.5828144458281445, "grad_norm": 0.01518219243735075, "learning_rate": 1.4414694894146949e-05, "loss": 0.0004, "step": 468 }, { "epoch": 0.5840597758405978, "grad_norm": 0.011395282112061977, "learning_rate": 1.4445828144458282e-05, "loss": 0.0004, "step": 469 }, { "epoch": 0.5853051058530511, "grad_norm": 0.014821592718362808, "learning_rate": 1.4476961394769614e-05, "loss": 0.0005, "step": 470 }, { "epoch": 0.5865504358655044, "grad_norm": 0.01130912359803915, "learning_rate": 1.4508094645080948e-05, "loss": 0.0004, "step": 471 }, { "epoch": 0.5877957658779577, "grad_norm": 0.02256758324801922, "learning_rate": 1.453922789539228e-05, "loss": 0.0006, "step": 472 }, { "epoch": 0.589041095890411, "grad_norm": 0.1458512842655182, "learning_rate": 1.4570361145703613e-05, "loss": 0.0014, "step": 473 }, { "epoch": 0.5902864259028643, "grad_norm": 0.07600380480289459, "learning_rate": 1.4601494396014945e-05, "loss": 0.0016, "step": 474 }, { "epoch": 0.5915317559153176, "grad_norm": 0.007826216518878937, "learning_rate": 1.4632627646326278e-05, "loss": 0.0002, "step": 475 }, { "epoch": 0.5927770859277709, "grad_norm": 0.013695678673684597, "learning_rate": 1.466376089663761e-05, "loss": 0.0004, "step": 476 }, { "epoch": 0.5940224159402242, "grad_norm": 0.034744229167699814, "learning_rate": 1.4694894146948943e-05, "loss": 0.0009, "step": 477 }, { "epoch": 0.5952677459526775, "grad_norm": 0.015751633793115616, "learning_rate": 1.4726027397260275e-05, "loss": 0.0005, "step": 478 }, { "epoch": 0.5965130759651308, "grad_norm": 0.01636291854083538, "learning_rate": 1.4757160647571608e-05, "loss": 0.0004, "step": 479 }, { "epoch": 0.597758405977584, "grad_norm": 0.019713019952178, "learning_rate": 1.478829389788294e-05, "loss": 0.0006, "step": 480 }, { "epoch": 0.5990037359900373, "grad_norm": 0.020456036552786827, "learning_rate": 1.4819427148194274e-05, "loss": 0.0005, "step": 481 }, { "epoch": 0.6002490660024906, "grad_norm": 0.027187447994947433, "learning_rate": 
1.4850560398505605e-05, "loss": 0.0006, "step": 482 }, { "epoch": 0.6014943960149439, "grad_norm": 0.024321310222148895, "learning_rate": 1.4881693648816939e-05, "loss": 0.0007, "step": 483 }, { "epoch": 0.6027397260273972, "grad_norm": 0.01486989390105009, "learning_rate": 1.491282689912827e-05, "loss": 0.0004, "step": 484 }, { "epoch": 0.6039850560398505, "grad_norm": 0.022661667317152023, "learning_rate": 1.49439601494396e-05, "loss": 0.0007, "step": 485 }, { "epoch": 0.6052303860523038, "grad_norm": 0.01003281120210886, "learning_rate": 1.4975093399750934e-05, "loss": 0.0003, "step": 486 }, { "epoch": 0.6064757160647571, "grad_norm": 0.01938827708363533, "learning_rate": 1.5006226650062266e-05, "loss": 0.0005, "step": 487 }, { "epoch": 0.6077210460772104, "grad_norm": 0.058401111513376236, "learning_rate": 1.50373599003736e-05, "loss": 0.0006, "step": 488 }, { "epoch": 0.6089663760896638, "grad_norm": 0.008321065455675125, "learning_rate": 1.5068493150684931e-05, "loss": 0.0003, "step": 489 }, { "epoch": 0.6102117061021171, "grad_norm": 0.01695171184837818, "learning_rate": 1.5099626400996264e-05, "loss": 0.0005, "step": 490 }, { "epoch": 0.6114570361145704, "grad_norm": 0.008688063360750675, "learning_rate": 1.5130759651307596e-05, "loss": 0.0003, "step": 491 }, { "epoch": 0.6127023661270237, "grad_norm": 0.009470910765230656, "learning_rate": 1.516189290161893e-05, "loss": 0.0003, "step": 492 }, { "epoch": 0.613947696139477, "grad_norm": 0.010343602858483791, "learning_rate": 1.5193026151930261e-05, "loss": 0.0003, "step": 493 }, { "epoch": 0.6151930261519303, "grad_norm": 0.031660452485084534, "learning_rate": 1.5224159402241595e-05, "loss": 0.0006, "step": 494 }, { "epoch": 0.6164383561643836, "grad_norm": 0.02456934005022049, "learning_rate": 1.5255292652552926e-05, "loss": 0.0005, "step": 495 }, { "epoch": 0.6176836861768369, "grad_norm": 0.022074950858950615, "learning_rate": 1.5286425902864258e-05, "loss": 0.0006, "step": 496 }, { "epoch": 
0.6189290161892902, "grad_norm": 0.013984983786940575, "learning_rate": 1.5317559153175593e-05, "loss": 0.0004, "step": 497 }, { "epoch": 0.6201743462017435, "grad_norm": 0.02767989970743656, "learning_rate": 1.5348692403486925e-05, "loss": 0.0004, "step": 498 }, { "epoch": 0.6214196762141968, "grad_norm": 0.011965448036789894, "learning_rate": 1.5379825653798257e-05, "loss": 0.0003, "step": 499 }, { "epoch": 0.6226650062266501, "grad_norm": 0.018284225836396217, "learning_rate": 1.541095890410959e-05, "loss": 0.0005, "step": 500 }, { "epoch": 0.6239103362391034, "grad_norm": 0.010995174758136272, "learning_rate": 1.5442092154420924e-05, "loss": 0.0002, "step": 501 }, { "epoch": 0.6251556662515566, "grad_norm": 0.008704639971256256, "learning_rate": 1.5473225404732256e-05, "loss": 0.0002, "step": 502 }, { "epoch": 0.6264009962640099, "grad_norm": 0.030416160821914673, "learning_rate": 1.5504358655043587e-05, "loss": 0.0007, "step": 503 }, { "epoch": 0.6276463262764632, "grad_norm": 0.02834182232618332, "learning_rate": 1.5535491905354922e-05, "loss": 0.0007, "step": 504 }, { "epoch": 0.6288916562889165, "grad_norm": 0.008636824786663055, "learning_rate": 1.5566625155666254e-05, "loss": 0.0003, "step": 505 }, { "epoch": 0.6301369863013698, "grad_norm": 0.037112049758434296, "learning_rate": 1.5597758405977586e-05, "loss": 0.0009, "step": 506 }, { "epoch": 0.6313823163138231, "grad_norm": 0.012123404070734978, "learning_rate": 1.5628891656288914e-05, "loss": 0.0003, "step": 507 }, { "epoch": 0.6326276463262764, "grad_norm": 36.184539794921875, "learning_rate": 1.566002490660025e-05, "loss": 0.0304, "step": 508 }, { "epoch": 0.6338729763387297, "grad_norm": 0.03620361536741257, "learning_rate": 1.569115815691158e-05, "loss": 0.0009, "step": 509 }, { "epoch": 0.635118306351183, "grad_norm": 0.01849571242928505, "learning_rate": 1.5722291407222913e-05, "loss": 0.0005, "step": 510 }, { "epoch": 0.6363636363636364, "grad_norm": 0.010837621986865997, "learning_rate": 
1.5753424657534248e-05, "loss": 0.0003, "step": 511 }, { "epoch": 0.6376089663760897, "grad_norm": 0.017697712406516075, "learning_rate": 1.578455790784558e-05, "loss": 0.0004, "step": 512 }, { "epoch": 0.638854296388543, "grad_norm": 0.00896854791790247, "learning_rate": 1.581569115815691e-05, "loss": 0.0003, "step": 513 }, { "epoch": 0.6400996264009963, "grad_norm": 0.009376812726259232, "learning_rate": 1.5846824408468243e-05, "loss": 0.0003, "step": 514 }, { "epoch": 0.6413449564134496, "grad_norm": 0.03261823207139969, "learning_rate": 1.587795765877958e-05, "loss": 0.0006, "step": 515 }, { "epoch": 0.6425902864259029, "grad_norm": 71.34445190429688, "learning_rate": 1.590909090909091e-05, "loss": 4.0159, "step": 516 }, { "epoch": 0.6438356164383562, "grad_norm": 0.02780863456428051, "learning_rate": 1.5940224159402242e-05, "loss": 0.0006, "step": 517 }, { "epoch": 0.6450809464508095, "grad_norm": 0.008818407543003559, "learning_rate": 1.5971357409713574e-05, "loss": 0.0003, "step": 518 }, { "epoch": 0.6463262764632628, "grad_norm": 0.030920347198843956, "learning_rate": 1.600249066002491e-05, "loss": 0.0007, "step": 519 }, { "epoch": 0.6475716064757161, "grad_norm": 0.018262671306729317, "learning_rate": 1.603362391033624e-05, "loss": 0.0005, "step": 520 }, { "epoch": 0.6488169364881694, "grad_norm": 0.011576538905501366, "learning_rate": 1.6064757160647572e-05, "loss": 0.0004, "step": 521 }, { "epoch": 0.6500622665006227, "grad_norm": 0.010801947675645351, "learning_rate": 1.6095890410958904e-05, "loss": 0.0003, "step": 522 }, { "epoch": 0.651307596513076, "grad_norm": 0.013210455887019634, "learning_rate": 1.612702366127024e-05, "loss": 0.0005, "step": 523 }, { "epoch": 0.6525529265255293, "grad_norm": 0.014238444156944752, "learning_rate": 1.615815691158157e-05, "loss": 0.0004, "step": 524 }, { "epoch": 0.6537982565379825, "grad_norm": 0.007543179206550121, "learning_rate": 1.6189290161892903e-05, "loss": 0.0002, "step": 525 }, { "epoch": 
0.6550435865504358, "grad_norm": 0.007191088050603867, "learning_rate": 1.6220423412204234e-05, "loss": 0.0002, "step": 526 }, { "epoch": 0.6562889165628891, "grad_norm": 0.011641144752502441, "learning_rate": 1.625155666251557e-05, "loss": 0.0003, "step": 527 }, { "epoch": 0.6575342465753424, "grad_norm": 0.018345683813095093, "learning_rate": 1.62826899128269e-05, "loss": 0.0005, "step": 528 }, { "epoch": 0.6587795765877957, "grad_norm": 0.3033308684825897, "learning_rate": 1.6313823163138233e-05, "loss": 0.0012, "step": 529 }, { "epoch": 0.660024906600249, "grad_norm": 0.03083566203713417, "learning_rate": 1.6344956413449565e-05, "loss": 0.0007, "step": 530 }, { "epoch": 0.6612702366127023, "grad_norm": 0.011249137111008167, "learning_rate": 1.6376089663760897e-05, "loss": 0.0003, "step": 531 }, { "epoch": 0.6625155666251556, "grad_norm": 0.009096617810428143, "learning_rate": 1.640722291407223e-05, "loss": 0.0003, "step": 532 }, { "epoch": 0.663760896637609, "grad_norm": 0.007661182899028063, "learning_rate": 1.643835616438356e-05, "loss": 0.0002, "step": 533 }, { "epoch": 0.6650062266500623, "grad_norm": 0.03464965149760246, "learning_rate": 1.6469489414694895e-05, "loss": 0.0006, "step": 534 }, { "epoch": 0.6662515566625156, "grad_norm": 0.017583874985575676, "learning_rate": 1.6500622665006227e-05, "loss": 0.0005, "step": 535 }, { "epoch": 0.6674968866749689, "grad_norm": 0.012846691533923149, "learning_rate": 1.653175591531756e-05, "loss": 0.0003, "step": 536 }, { "epoch": 0.6687422166874222, "grad_norm": 0.008167251013219357, "learning_rate": 1.656288916562889e-05, "loss": 0.0002, "step": 537 }, { "epoch": 0.6699875466998755, "grad_norm": 0.09242931753396988, "learning_rate": 1.6594022415940226e-05, "loss": 0.0006, "step": 538 }, { "epoch": 0.6712328767123288, "grad_norm": 0.007621095050126314, "learning_rate": 1.6625155666251557e-05, "loss": 0.0003, "step": 539 }, { "epoch": 0.6724782067247821, "grad_norm": 364.0179138183594, "learning_rate": 
1.665628891656289e-05, "loss": 2.4925, "step": 540 }, { "epoch": 0.6737235367372354, "grad_norm": 0.029700903221964836, "learning_rate": 1.6687422166874224e-05, "loss": 0.0005, "step": 541 }, { "epoch": 0.6749688667496887, "grad_norm": 0.009756062179803848, "learning_rate": 1.6718555417185556e-05, "loss": 0.0003, "step": 542 }, { "epoch": 0.676214196762142, "grad_norm": 0.02434486895799637, "learning_rate": 1.6749688667496888e-05, "loss": 0.0007, "step": 543 }, { "epoch": 0.6774595267745953, "grad_norm": 0.0061378516256809235, "learning_rate": 1.678082191780822e-05, "loss": 0.0002, "step": 544 }, { "epoch": 0.6787048567870486, "grad_norm": 0.007974776439368725, "learning_rate": 1.6811955168119555e-05, "loss": 0.0002, "step": 545 }, { "epoch": 0.6799501867995019, "grad_norm": 0.023721277713775635, "learning_rate": 1.6843088418430886e-05, "loss": 0.0007, "step": 546 }, { "epoch": 0.6811955168119551, "grad_norm": 0.06722849607467651, "learning_rate": 1.6874221668742218e-05, "loss": 0.0014, "step": 547 }, { "epoch": 0.6824408468244084, "grad_norm": 0.021218659356236458, "learning_rate": 1.690535491905355e-05, "loss": 0.0005, "step": 548 }, { "epoch": 0.6836861768368617, "grad_norm": 0.007651370484381914, "learning_rate": 1.6936488169364885e-05, "loss": 0.0003, "step": 549 }, { "epoch": 0.684931506849315, "grad_norm": 0.023434964939951897, "learning_rate": 1.6967621419676217e-05, "loss": 0.0004, "step": 550 }, { "epoch": 0.6861768368617683, "grad_norm": 0.010944285430014133, "learning_rate": 1.699875466998755e-05, "loss": 0.0002, "step": 551 }, { "epoch": 0.6874221668742216, "grad_norm": 0.007479478605091572, "learning_rate": 1.702988792029888e-05, "loss": 0.0002, "step": 552 }, { "epoch": 0.688667496886675, "grad_norm": 0.016678282991051674, "learning_rate": 1.7061021170610212e-05, "loss": 0.0004, "step": 553 }, { "epoch": 0.6899128268991283, "grad_norm": 0.008227194659411907, "learning_rate": 1.7092154420921544e-05, "loss": 0.0002, "step": 554 }, { "epoch": 
0.6911581569115816, "grad_norm": 0.016022512689232826, "learning_rate": 1.7123287671232875e-05, "loss": 0.0004, "step": 555 }, { "epoch": 0.6924034869240349, "grad_norm": 0.01723802089691162, "learning_rate": 1.715442092154421e-05, "loss": 0.0004, "step": 556 }, { "epoch": 0.6936488169364882, "grad_norm": 0.007776948623359203, "learning_rate": 1.7185554171855542e-05, "loss": 0.0002, "step": 557 }, { "epoch": 0.6948941469489415, "grad_norm": 0.061478786170482635, "learning_rate": 1.7216687422166874e-05, "loss": 0.0004, "step": 558 }, { "epoch": 0.6961394769613948, "grad_norm": 0.030175473541021347, "learning_rate": 1.7247820672478206e-05, "loss": 0.0005, "step": 559 }, { "epoch": 0.6973848069738481, "grad_norm": 0.03586643561720848, "learning_rate": 1.727895392278954e-05, "loss": 0.0009, "step": 560 }, { "epoch": 0.6986301369863014, "grad_norm": 0.01669226959347725, "learning_rate": 1.7310087173100873e-05, "loss": 0.0004, "step": 561 }, { "epoch": 0.6998754669987547, "grad_norm": 0.013228816911578178, "learning_rate": 1.7341220423412205e-05, "loss": 0.0003, "step": 562 }, { "epoch": 0.701120797011208, "grad_norm": 0.16547606885433197, "learning_rate": 1.7372353673723536e-05, "loss": 0.0014, "step": 563 }, { "epoch": 0.7023661270236613, "grad_norm": 0.20769615471363068, "learning_rate": 1.740348692403487e-05, "loss": 0.0007, "step": 564 }, { "epoch": 0.7036114570361146, "grad_norm": null, "learning_rate": 1.740348692403487e-05, "loss": 3.7559, "step": 565 }, { "epoch": 0.7048567870485679, "grad_norm": 0.010459132492542267, "learning_rate": 1.7434620174346203e-05, "loss": 0.0003, "step": 566 }, { "epoch": 0.7061021170610212, "grad_norm": 7.497586727142334, "learning_rate": 1.7465753424657535e-05, "loss": 0.008, "step": 567 }, { "epoch": 0.7073474470734745, "grad_norm": 0.011709270067512989, "learning_rate": 1.7496886674968867e-05, "loss": 0.0003, "step": 568 }, { "epoch": 0.7085927770859277, "grad_norm": 0.024786679074168205, "learning_rate": 
1.7528019925280202e-05, "loss": 0.0005, "step": 569 }, { "epoch": 0.709838107098381, "grad_norm": 0.007164615672081709, "learning_rate": 1.7559153175591534e-05, "loss": 0.0003, "step": 570 }, { "epoch": 0.7110834371108343, "grad_norm": 0.006929496768862009, "learning_rate": 1.7590286425902865e-05, "loss": 0.0002, "step": 571 }, { "epoch": 0.7123287671232876, "grad_norm": 0.01036135945469141, "learning_rate": 1.7621419676214197e-05, "loss": 0.0003, "step": 572 }, { "epoch": 0.7135740971357409, "grad_norm": 0.01619466207921505, "learning_rate": 1.7652552926525532e-05, "loss": 0.0004, "step": 573 }, { "epoch": 0.7148194271481942, "grad_norm": 0.007037854287773371, "learning_rate": 1.7683686176836864e-05, "loss": 0.0002, "step": 574 }, { "epoch": 0.7160647571606475, "grad_norm": 0.015169711783528328, "learning_rate": 1.7714819427148192e-05, "loss": 0.0004, "step": 575 }, { "epoch": 0.7173100871731009, "grad_norm": 0.014573472552001476, "learning_rate": 1.7745952677459527e-05, "loss": 0.0003, "step": 576 }, { "epoch": 0.7185554171855542, "grad_norm": 0.012262790463864803, "learning_rate": 1.777708592777086e-05, "loss": 0.0003, "step": 577 }, { "epoch": 0.7198007471980075, "grad_norm": 0.011037294752895832, "learning_rate": 1.780821917808219e-05, "loss": 0.0003, "step": 578 }, { "epoch": 0.7210460772104608, "grad_norm": 0.012611133977770805, "learning_rate": 1.7839352428393523e-05, "loss": 0.0003, "step": 579 }, { "epoch": 0.7222914072229141, "grad_norm": 0.13023485243320465, "learning_rate": 1.7870485678704858e-05, "loss": 0.0009, "step": 580 }, { "epoch": 0.7235367372353674, "grad_norm": 0.006935072597116232, "learning_rate": 1.790161892901619e-05, "loss": 0.0002, "step": 581 }, { "epoch": 0.7247820672478207, "grad_norm": 0.026650428771972656, "learning_rate": 1.793275217932752e-05, "loss": 0.0006, "step": 582 }, { "epoch": 0.726027397260274, "grad_norm": 0.015044482424855232, "learning_rate": 1.7963885429638856e-05, "loss": 0.0004, "step": 583 }, { "epoch": 
0.7272727272727273, "grad_norm": 0.019932331517338753, "learning_rate": 1.7995018679950188e-05, "loss": 0.0005, "step": 584 }, { "epoch": 0.7285180572851806, "grad_norm": 0.01698875240981579, "learning_rate": 1.802615193026152e-05, "loss": 0.0004, "step": 585 }, { "epoch": 0.7297633872976339, "grad_norm": 0.4486841857433319, "learning_rate": 1.805728518057285e-05, "loss": 0.0005, "step": 586 }, { "epoch": 0.7310087173100872, "grad_norm": 0.01894947700202465, "learning_rate": 1.8088418430884187e-05, "loss": 0.0006, "step": 587 }, { "epoch": 0.7322540473225405, "grad_norm": 0.006948466412723064, "learning_rate": 1.811955168119552e-05, "loss": 0.0002, "step": 588 }, { "epoch": 0.7334993773349938, "grad_norm": 15.503718376159668, "learning_rate": 1.815068493150685e-05, "loss": 0.0137, "step": 589 }, { "epoch": 0.7347447073474471, "grad_norm": 0.021334033459424973, "learning_rate": 1.8181818181818182e-05, "loss": 0.0006, "step": 590 }, { "epoch": 0.7359900373599004, "grad_norm": 0.02985548786818981, "learning_rate": 1.8212951432129517e-05, "loss": 0.0005, "step": 591 }, { "epoch": 0.7372353673723536, "grad_norm": 0.007480076979845762, "learning_rate": 1.824408468244085e-05, "loss": 0.0002, "step": 592 }, { "epoch": 0.7384806973848069, "grad_norm": 0.006202853284776211, "learning_rate": 1.827521793275218e-05, "loss": 0.0002, "step": 593 }, { "epoch": 0.7397260273972602, "grad_norm": 0.020105713978409767, "learning_rate": 1.8306351183063512e-05, "loss": 0.0005, "step": 594 }, { "epoch": 0.7409713574097135, "grad_norm": 0.01176950428634882, "learning_rate": 1.8337484433374848e-05, "loss": 0.0003, "step": 595 }, { "epoch": 0.7422166874221668, "grad_norm": 0.02436145208775997, "learning_rate": 1.836861768368618e-05, "loss": 0.0005, "step": 596 }, { "epoch": 0.7434620174346201, "grad_norm": 0.015877658501267433, "learning_rate": 1.839975093399751e-05, "loss": 0.0004, "step": 597 }, { "epoch": 0.7447073474470735, "grad_norm": 0.0258621908724308, "learning_rate": 
1.8430884184308843e-05, "loss": 0.0006, "step": 598 }, { "epoch": 0.7459526774595268, "grad_norm": 0.0054780724458396435, "learning_rate": 1.8462017434620175e-05, "loss": 0.0002, "step": 599 }, { "epoch": 0.7471980074719801, "grad_norm": 0.01809469237923622, "learning_rate": 1.8493150684931506e-05, "loss": 0.0004, "step": 600 }, { "epoch": 0.7484433374844334, "grad_norm": 0.012986347079277039, "learning_rate": 1.8524283935242838e-05, "loss": 0.0003, "step": 601 }, { "epoch": 0.7496886674968867, "grad_norm": 0.004867818206548691, "learning_rate": 1.8555417185554173e-05, "loss": 0.0001, "step": 602 }, { "epoch": 0.75093399750934, "grad_norm": 0.005523454863578081, "learning_rate": 1.8586550435865505e-05, "loss": 0.0002, "step": 603 }, { "epoch": 0.7521793275217933, "grad_norm": 0.009668633341789246, "learning_rate": 1.8617683686176837e-05, "loss": 0.0003, "step": 604 }, { "epoch": 0.7534246575342466, "grad_norm": 0.0070527163334190845, "learning_rate": 1.864881693648817e-05, "loss": 0.0002, "step": 605 }, { "epoch": 0.7546699875466999, "grad_norm": 0.006774348672479391, "learning_rate": 1.8679950186799504e-05, "loss": 0.0002, "step": 606 }, { "epoch": 0.7559153175591532, "grad_norm": 0.007995886728167534, "learning_rate": 1.8711083437110835e-05, "loss": 0.0002, "step": 607 }, { "epoch": 0.7571606475716065, "grad_norm": 30.348756790161133, "learning_rate": 1.8742216687422167e-05, "loss": 4.172, "step": 608 }, { "epoch": 0.7584059775840598, "grad_norm": 0.01787879690527916, "learning_rate": 1.87733499377335e-05, "loss": 0.0004, "step": 609 }, { "epoch": 0.7596513075965131, "grad_norm": 0.06024169921875, "learning_rate": 1.8804483188044834e-05, "loss": 0.0011, "step": 610 }, { "epoch": 0.7608966376089664, "grad_norm": 0.06412393599748611, "learning_rate": 1.8835616438356166e-05, "loss": 0.0014, "step": 611 }, { "epoch": 0.7621419676214197, "grad_norm": 0.01381937600672245, "learning_rate": 1.8866749688667497e-05, "loss": 0.0005, "step": 612 }, { "epoch": 
0.763387297633873, "grad_norm": 0.01991051435470581, "learning_rate": 1.889788293897883e-05, "loss": 0.0003, "step": 613 }, { "epoch": 0.7646326276463262, "grad_norm": 0.14104107022285461, "learning_rate": 1.8929016189290164e-05, "loss": 0.0026, "step": 614 }, { "epoch": 0.7658779576587795, "grad_norm": 0.0066263917833566666, "learning_rate": 1.8960149439601496e-05, "loss": 0.0002, "step": 615 }, { "epoch": 0.7671232876712328, "grad_norm": 0.006442869547754526, "learning_rate": 1.8991282689912828e-05, "loss": 0.0002, "step": 616 }, { "epoch": 0.7683686176836861, "grad_norm": 0.20366807281970978, "learning_rate": 1.9022415940224163e-05, "loss": 0.0028, "step": 617 }, { "epoch": 0.7696139476961394, "grad_norm": 0.16002459824085236, "learning_rate": 1.9053549190535495e-05, "loss": 0.0023, "step": 618 }, { "epoch": 0.7708592777085927, "grad_norm": 0.007126240525394678, "learning_rate": 1.9084682440846827e-05, "loss": 0.0002, "step": 619 }, { "epoch": 0.772104607721046, "grad_norm": 0.22348296642303467, "learning_rate": 1.9115815691158155e-05, "loss": 0.0034, "step": 620 }, { "epoch": 0.7733499377334994, "grad_norm": 0.01117734331637621, "learning_rate": 1.914694894146949e-05, "loss": 0.0003, "step": 621 }, { "epoch": 0.7745952677459527, "grad_norm": 0.017832182347774506, "learning_rate": 1.9178082191780822e-05, "loss": 0.0004, "step": 622 }, { "epoch": 0.775840597758406, "grad_norm": 0.10084803402423859, "learning_rate": 1.9209215442092154e-05, "loss": 0.002, "step": 623 }, { "epoch": 0.7770859277708593, "grad_norm": 0.0404939204454422, "learning_rate": 1.924034869240349e-05, "loss": 0.0009, "step": 624 }, { "epoch": 0.7783312577833126, "grad_norm": 0.006709231995046139, "learning_rate": 1.927148194271482e-05, "loss": 0.0002, "step": 625 }, { "epoch": 0.7795765877957659, "grad_norm": 0.006246612407267094, "learning_rate": 1.9302615193026152e-05, "loss": 0.0002, "step": 626 }, { "epoch": 0.7808219178082192, "grad_norm": 0.007551430258899927, "learning_rate": 
1.9333748443337484e-05, "loss": 0.0002, "step": 627 }, { "epoch": 0.7820672478206725, "grad_norm": 0.010194691829383373, "learning_rate": 1.936488169364882e-05, "loss": 0.0002, "step": 628 }, { "epoch": 0.7833125778331258, "grad_norm": 0.007259845733642578, "learning_rate": 1.939601494396015e-05, "loss": 0.0002, "step": 629 }, { "epoch": 0.7845579078455791, "grad_norm": 0.6343588829040527, "learning_rate": 1.9427148194271483e-05, "loss": 0.0014, "step": 630 }, { "epoch": 0.7858032378580324, "grad_norm": 0.004895548801869154, "learning_rate": 1.9458281444582814e-05, "loss": 0.0001, "step": 631 }, { "epoch": 0.7870485678704857, "grad_norm": 0.023873023688793182, "learning_rate": 1.948941469489415e-05, "loss": 0.0006, "step": 632 }, { "epoch": 0.788293897882939, "grad_norm": 0.06282692402601242, "learning_rate": 1.952054794520548e-05, "loss": 0.0014, "step": 633 }, { "epoch": 0.7895392278953923, "grad_norm": 0.01570272073149681, "learning_rate": 1.9551681195516813e-05, "loss": 0.0005, "step": 634 }, { "epoch": 0.7907845579078456, "grad_norm": 0.004377361387014389, "learning_rate": 1.9582814445828145e-05, "loss": 0.0001, "step": 635 }, { "epoch": 0.7920298879202988, "grad_norm": 0.005370027385652065, "learning_rate": 1.961394769613948e-05, "loss": 0.0001, "step": 636 }, { "epoch": 0.7932752179327521, "grad_norm": 0.016998134553432465, "learning_rate": 1.964508094645081e-05, "loss": 0.0003, "step": 637 }, { "epoch": 0.7945205479452054, "grad_norm": 0.02801138535141945, "learning_rate": 1.9676214196762143e-05, "loss": 0.0007, "step": 638 }, { "epoch": 0.7957658779576587, "grad_norm": 0.007101301569491625, "learning_rate": 1.9707347447073475e-05, "loss": 0.0002, "step": 639 }, { "epoch": 0.797011207970112, "grad_norm": 0.007805291563272476, "learning_rate": 1.973848069738481e-05, "loss": 0.0002, "step": 640 }, { "epoch": 0.7982565379825654, "grad_norm": 0.01866893284022808, "learning_rate": 1.9769613947696142e-05, "loss": 0.0004, "step": 641 }, { "epoch": 
0.7995018679950187, "grad_norm": 0.008472064509987831, "learning_rate": 1.980074719800747e-05, "loss": 0.0002, "step": 642 }, { "epoch": 0.800747198007472, "grad_norm": 0.011058184318244457, "learning_rate": 1.9831880448318805e-05, "loss": 0.0004, "step": 643 }, { "epoch": 0.8019925280199253, "grad_norm": 0.01657005585730076, "learning_rate": 1.9863013698630137e-05, "loss": 0.0004, "step": 644 }, { "epoch": 0.8032378580323786, "grad_norm": 0.007903863675892353, "learning_rate": 1.989414694894147e-05, "loss": 0.0002, "step": 645 }, { "epoch": 0.8044831880448319, "grad_norm": 0.008648911491036415, "learning_rate": 1.99252801992528e-05, "loss": 0.0003, "step": 646 }, { "epoch": 0.8057285180572852, "grad_norm": 0.005954551976174116, "learning_rate": 1.9956413449564136e-05, "loss": 0.0001, "step": 647 }, { "epoch": 0.8069738480697385, "grad_norm": 0.012240339070558548, "learning_rate": 1.9987546699875468e-05, "loss": 0.0004, "step": 648 }, { "epoch": 0.8082191780821918, "grad_norm": 0.012209310196340084, "learning_rate": 2.00186799501868e-05, "loss": 0.0004, "step": 649 }, { "epoch": 0.8094645080946451, "grad_norm": 0.013876602053642273, "learning_rate": 2.004981320049813e-05, "loss": 0.0004, "step": 650 }, { "epoch": 0.8107098381070984, "grad_norm": 0.006682861130684614, "learning_rate": 2.0080946450809466e-05, "loss": 0.0002, "step": 651 }, { "epoch": 0.8119551681195517, "grad_norm": 0.01869480311870575, "learning_rate": 2.0112079701120798e-05, "loss": 0.0004, "step": 652 }, { "epoch": 0.813200498132005, "grad_norm": 0.006386366207152605, "learning_rate": 2.014321295143213e-05, "loss": 0.0002, "step": 653 }, { "epoch": 0.8144458281444583, "grad_norm": 0.031244048848748207, "learning_rate": 2.017434620174346e-05, "loss": 0.0007, "step": 654 }, { "epoch": 0.8156911581569116, "grad_norm": 0.005839107092469931, "learning_rate": 2.0205479452054797e-05, "loss": 0.0002, "step": 655 }, { "epoch": 0.8169364881693649, "grad_norm": 0.012466920539736748, "learning_rate": 
2.023661270236613e-05, "loss": 0.0003, "step": 656 }, { "epoch": 0.8181818181818182, "grad_norm": 0.011677310802042484, "learning_rate": 2.026774595267746e-05, "loss": 0.0003, "step": 657 }, { "epoch": 0.8194271481942715, "grad_norm": 325.08514404296875, "learning_rate": 2.0298879202988795e-05, "loss": 0.185, "step": 658 }, { "epoch": 0.8206724782067247, "grad_norm": 0.00978070218116045, "learning_rate": 2.0330012453300127e-05, "loss": 0.0003, "step": 659 }, { "epoch": 0.821917808219178, "grad_norm": 0.009361130185425282, "learning_rate": 2.036114570361146e-05, "loss": 0.0003, "step": 660 }, { "epoch": 0.8231631382316313, "grad_norm": 0.007570465561002493, "learning_rate": 2.039227895392279e-05, "loss": 0.0003, "step": 661 }, { "epoch": 0.8244084682440846, "grad_norm": 0.00575603824108839, "learning_rate": 2.0423412204234126e-05, "loss": 0.0002, "step": 662 }, { "epoch": 0.825653798256538, "grad_norm": 0.014008327387273312, "learning_rate": 2.0454545454545457e-05, "loss": 0.0004, "step": 663 }, { "epoch": 0.8268991282689913, "grad_norm": 0.00547524681314826, "learning_rate": 2.048567870485679e-05, "loss": 0.0001, "step": 664 }, { "epoch": 0.8281444582814446, "grad_norm": 0.026367267593741417, "learning_rate": 2.051681195516812e-05, "loss": 0.0005, "step": 665 }, { "epoch": 0.8293897882938979, "grad_norm": 0.0041604661382734776, "learning_rate": 2.0547945205479453e-05, "loss": 0.0001, "step": 666 }, { "epoch": 0.8306351183063512, "grad_norm": 0.01260537002235651, "learning_rate": 2.0579078455790784e-05, "loss": 0.0004, "step": 667 }, { "epoch": 0.8318804483188045, "grad_norm": 0.005095213185995817, "learning_rate": 2.0610211706102116e-05, "loss": 0.0002, "step": 668 }, { "epoch": 0.8331257783312578, "grad_norm": 0.004534134641289711, "learning_rate": 2.064134495641345e-05, "loss": 0.0001, "step": 669 }, { "epoch": 0.8343711083437111, "grad_norm": 0.015001599676907063, "learning_rate": 2.0672478206724783e-05, "loss": 0.0004, "step": 670 }, { "epoch": 
0.8356164383561644, "grad_norm": 0.005808024201542139, "learning_rate": 2.0703611457036115e-05, "loss": 0.0002, "step": 671 }, { "epoch": 0.8368617683686177, "grad_norm": 0.008496883325278759, "learning_rate": 2.0734744707347447e-05, "loss": 0.0003, "step": 672 }, { "epoch": 0.838107098381071, "grad_norm": 410.8919677734375, "learning_rate": 2.076587795765878e-05, "loss": 1.7746, "step": 673 }, { "epoch": 0.8393524283935243, "grad_norm": 0.15478110313415527, "learning_rate": 2.0797011207970113e-05, "loss": 0.0008, "step": 674 }, { "epoch": 0.8405977584059776, "grad_norm": 0.017121130600571632, "learning_rate": 2.0828144458281445e-05, "loss": 0.0004, "step": 675 }, { "epoch": 0.8418430884184309, "grad_norm": 0.01048367191106081, "learning_rate": 2.0859277708592777e-05, "loss": 0.0003, "step": 676 }, { "epoch": 0.8430884184308842, "grad_norm": 0.013435076922178268, "learning_rate": 2.0890410958904112e-05, "loss": 0.0004, "step": 677 }, { "epoch": 0.8443337484433375, "grad_norm": 0.0057032410986721516, "learning_rate": 2.0921544209215444e-05, "loss": 0.0002, "step": 678 }, { "epoch": 0.8455790784557908, "grad_norm": 0.05629182606935501, "learning_rate": 2.0952677459526776e-05, "loss": 0.0005, "step": 679 }, { "epoch": 0.8468244084682441, "grad_norm": 0.8133471608161926, "learning_rate": 2.0983810709838107e-05, "loss": 0.0012, "step": 680 }, { "epoch": 0.8480697384806973, "grad_norm": 0.011576468124985695, "learning_rate": 2.1014943960149442e-05, "loss": 0.0003, "step": 681 }, { "epoch": 0.8493150684931506, "grad_norm": 0.079744853079319, "learning_rate": 2.1046077210460774e-05, "loss": 0.0006, "step": 682 }, { "epoch": 0.8505603985056039, "grad_norm": 0.019048074260354042, "learning_rate": 2.1077210460772106e-05, "loss": 0.0004, "step": 683 }, { "epoch": 0.8518057285180572, "grad_norm": 0.004764070268720388, "learning_rate": 2.1108343711083438e-05, "loss": 0.0001, "step": 684 }, { "epoch": 0.8530510585305106, "grad_norm": 0.022517533972859383, "learning_rate": 
2.1139476961394773e-05, "loss": 0.0003, "step": 685 }, { "epoch": 0.8542963885429639, "grad_norm": 0.17990639805793762, "learning_rate": 2.1170610211706105e-05, "loss": 0.0007, "step": 686 }, { "epoch": 0.8555417185554172, "grad_norm": 0.0133855314925313, "learning_rate": 2.1201743462017433e-05, "loss": 0.0004, "step": 687 }, { "epoch": 0.8567870485678705, "grad_norm": 0.01034181471914053, "learning_rate": 2.1232876712328768e-05, "loss": 0.0003, "step": 688 }, { "epoch": 0.8580323785803238, "grad_norm": 0.09839920699596405, "learning_rate": 2.12640099626401e-05, "loss": 0.0007, "step": 689 }, { "epoch": 0.8592777085927771, "grad_norm": 0.28286799788475037, "learning_rate": 2.129514321295143e-05, "loss": 0.0009, "step": 690 }, { "epoch": 0.8605230386052304, "grad_norm": 0.004863832611590624, "learning_rate": 2.1326276463262763e-05, "loss": 0.0001, "step": 691 }, { "epoch": 0.8617683686176837, "grad_norm": 0.007945407181978226, "learning_rate": 2.13574097135741e-05, "loss": 0.0002, "step": 692 }, { "epoch": 0.863013698630137, "grad_norm": 0.17650844156742096, "learning_rate": 2.138854296388543e-05, "loss": 0.0006, "step": 693 }, { "epoch": 0.8642590286425903, "grad_norm": 36.761592864990234, "learning_rate": 2.1419676214196762e-05, "loss": 4.8048, "step": 694 }, { "epoch": 0.8655043586550436, "grad_norm": 43.7182731628418, "learning_rate": 2.1450809464508094e-05, "loss": 4.1331, "step": 695 }, { "epoch": 0.8667496886674969, "grad_norm": 0.031437598168849945, "learning_rate": 2.148194271481943e-05, "loss": 0.0005, "step": 696 }, { "epoch": 0.8679950186799502, "grad_norm": 0.17908449470996857, "learning_rate": 2.151307596513076e-05, "loss": 0.0018, "step": 697 }, { "epoch": 0.8692403486924035, "grad_norm": 43.03351974487305, "learning_rate": 2.1544209215442092e-05, "loss": 0.9142, "step": 698 }, { "epoch": 0.8704856787048568, "grad_norm": 0.07657460123300552, "learning_rate": 2.1575342465753427e-05, "loss": 0.0007, "step": 699 }, { "epoch": 0.8717310087173101, 
"grad_norm": 43.546669006347656, "learning_rate": 2.160647571606476e-05, "loss": 1.2326, "step": 700 }, { "epoch": 0.8729763387297634, "grad_norm": 0.15518978238105774, "learning_rate": 2.163760896637609e-05, "loss": 0.0013, "step": 701 }, { "epoch": 0.8742216687422167, "grad_norm": 20.484352111816406, "learning_rate": 2.1668742216687423e-05, "loss": 0.4034, "step": 702 }, { "epoch": 0.8754669987546699, "grad_norm": 8.134427070617676, "learning_rate": 2.1699875466998758e-05, "loss": 0.1308, "step": 703 }, { "epoch": 0.8767123287671232, "grad_norm": 31.111207962036133, "learning_rate": 2.173100871731009e-05, "loss": 1.3048, "step": 704 }, { "epoch": 0.8779576587795765, "grad_norm": 1.6822067499160767, "learning_rate": 2.176214196762142e-05, "loss": 0.0337, "step": 705 }, { "epoch": 0.8792029887920298, "grad_norm": 0.016219645738601685, "learning_rate": 2.1793275217932753e-05, "loss": 0.0002, "step": 706 }, { "epoch": 0.8804483188044832, "grad_norm": 0.9385362267494202, "learning_rate": 2.1824408468244088e-05, "loss": 0.0118, "step": 707 }, { "epoch": 0.8816936488169365, "grad_norm": 59.062347412109375, "learning_rate": 2.185554171855542e-05, "loss": 1.5594, "step": 708 }, { "epoch": 0.8829389788293898, "grad_norm": 0.8278292417526245, "learning_rate": 2.188667496886675e-05, "loss": 0.0164, "step": 709 }, { "epoch": 0.8841843088418431, "grad_norm": 0.1193016767501831, "learning_rate": 2.1917808219178083e-05, "loss": 0.0026, "step": 710 }, { "epoch": 0.8854296388542964, "grad_norm": 0.06685473769903183, "learning_rate": 2.1948941469489415e-05, "loss": 0.0007, "step": 711 }, { "epoch": 0.8866749688667497, "grad_norm": 0.2482631653547287, "learning_rate": 2.1980074719800747e-05, "loss": 0.0044, "step": 712 }, { "epoch": 0.887920298879203, "grad_norm": 0.09288740158081055, "learning_rate": 2.201120797011208e-05, "loss": 0.001, "step": 713 }, { "epoch": 0.8891656288916563, "grad_norm": 0.07905003428459167, "learning_rate": 2.2042341220423414e-05, "loss": 0.001, "step": 
714 }, { "epoch": 0.8904109589041096, "grad_norm": 0.03586210682988167, "learning_rate": 2.2073474470734746e-05, "loss": 0.0007, "step": 715 }, { "epoch": 0.8916562889165629, "grad_norm": 0.029501890763640404, "learning_rate": 2.2104607721046077e-05, "loss": 0.0005, "step": 716 }, { "epoch": 0.8929016189290162, "grad_norm": 1.9498989582061768, "learning_rate": 2.213574097135741e-05, "loss": 0.0056, "step": 717 }, { "epoch": 0.8941469489414695, "grad_norm": 0.011584372259676456, "learning_rate": 2.2166874221668744e-05, "loss": 0.0002, "step": 718 }, { "epoch": 0.8953922789539228, "grad_norm": 0.052831344306468964, "learning_rate": 2.2198007471980076e-05, "loss": 0.0007, "step": 719 }, { "epoch": 0.8966376089663761, "grad_norm": 152.57171630859375, "learning_rate": 2.2229140722291408e-05, "loss": 0.5103, "step": 720 }, { "epoch": 0.8978829389788294, "grad_norm": 0.03796133026480675, "learning_rate": 2.226027397260274e-05, "loss": 0.0008, "step": 721 }, { "epoch": 0.8991282689912827, "grad_norm": 9.698473930358887, "learning_rate": 2.2291407222914075e-05, "loss": 0.0168, "step": 722 }, { "epoch": 0.900373599003736, "grad_norm": 0.014799389988183975, "learning_rate": 2.2322540473225406e-05, "loss": 0.0003, "step": 723 }, { "epoch": 0.9016189290161893, "grad_norm": 0.015290978364646435, "learning_rate": 2.2353673723536738e-05, "loss": 0.0004, "step": 724 }, { "epoch": 0.9028642590286425, "grad_norm": 0.0121547756716609, "learning_rate": 2.238480697384807e-05, "loss": 0.0004, "step": 725 }, { "epoch": 0.9041095890410958, "grad_norm": 0.043171875178813934, "learning_rate": 2.2415940224159405e-05, "loss": 0.001, "step": 726 }, { "epoch": 0.9053549190535491, "grad_norm": 0.02570340782403946, "learning_rate": 2.2447073474470737e-05, "loss": 0.0004, "step": 727 }, { "epoch": 0.9066002490660025, "grad_norm": 0.4008868634700775, "learning_rate": 2.247820672478207e-05, "loss": 0.0015, "step": 728 }, { "epoch": 0.9078455790784558, "grad_norm": 0.012521167285740376, 
"learning_rate": 2.2509339975093404e-05, "loss": 0.0003, "step": 729 }, { "epoch": 0.9090909090909091, "grad_norm": 0.039595190435647964, "learning_rate": 2.2540473225404735e-05, "loss": 0.0008, "step": 730 }, { "epoch": 0.9103362391033624, "grad_norm": 0.0371573381125927, "learning_rate": 2.2571606475716064e-05, "loss": 0.0007, "step": 731 }, { "epoch": 0.9115815691158157, "grad_norm": 0.0111406734213233, "learning_rate": 2.2602739726027396e-05, "loss": 0.0003, "step": 732 }, { "epoch": 0.912826899128269, "grad_norm": 34.578346252441406, "learning_rate": 2.263387297633873e-05, "loss": 4.4143, "step": 733 }, { "epoch": 0.9140722291407223, "grad_norm": 0.006715845782309771, "learning_rate": 2.2665006226650062e-05, "loss": 0.0002, "step": 734 }, { "epoch": 0.9153175591531756, "grad_norm": 0.014482389204204082, "learning_rate": 2.2696139476961394e-05, "loss": 0.0004, "step": 735 }, { "epoch": 0.9165628891656289, "grad_norm": 0.0057504503056406975, "learning_rate": 2.272727272727273e-05, "loss": 0.0001, "step": 736 }, { "epoch": 0.9178082191780822, "grad_norm": 0.04472869634628296, "learning_rate": 2.275840597758406e-05, "loss": 0.001, "step": 737 }, { "epoch": 0.9190535491905355, "grad_norm": 0.05841754376888275, "learning_rate": 2.2789539227895393e-05, "loss": 0.001, "step": 738 }, { "epoch": 0.9202988792029888, "grad_norm": 0.009739454835653305, "learning_rate": 2.2820672478206725e-05, "loss": 0.0002, "step": 739 }, { "epoch": 0.9215442092154421, "grad_norm": 0.011922300793230534, "learning_rate": 2.285180572851806e-05, "loss": 0.0004, "step": 740 }, { "epoch": 0.9227895392278954, "grad_norm": 0.05216851085424423, "learning_rate": 2.288293897882939e-05, "loss": 0.001, "step": 741 }, { "epoch": 0.9240348692403487, "grad_norm": 0.007307402323931456, "learning_rate": 2.2914072229140723e-05, "loss": 0.0002, "step": 742 }, { "epoch": 0.925280199252802, "grad_norm": 0.04301249235868454, "learning_rate": 2.2945205479452055e-05, "loss": 0.0005, "step": 743 }, { "epoch": 
0.9265255292652553, "grad_norm": 0.013793856836855412, "learning_rate": 2.297633872976339e-05, "loss": 0.0003, "step": 744 }, { "epoch": 0.9277708592777086, "grad_norm": 0.1124817505478859, "learning_rate": 2.3007471980074722e-05, "loss": 0.0022, "step": 745 }, { "epoch": 0.9290161892901619, "grad_norm": 0.005083655938506126, "learning_rate": 2.3038605230386054e-05, "loss": 0.0001, "step": 746 }, { "epoch": 0.9302615193026152, "grad_norm": 0.005723627284169197, "learning_rate": 2.3069738480697385e-05, "loss": 0.0001, "step": 747 }, { "epoch": 0.9315068493150684, "grad_norm": 0.08036380261182785, "learning_rate": 2.310087173100872e-05, "loss": 0.0014, "step": 748 }, { "epoch": 0.9327521793275217, "grad_norm": 0.007362319156527519, "learning_rate": 2.3132004981320052e-05, "loss": 0.0002, "step": 749 }, { "epoch": 0.933997509339975, "grad_norm": 1.5796531438827515, "learning_rate": 2.3163138231631384e-05, "loss": 0.0147, "step": 750 }, { "epoch": 0.9352428393524284, "grad_norm": 0.038087982684373856, "learning_rate": 2.3194271481942716e-05, "loss": 0.0008, "step": 751 }, { "epoch": 0.9364881693648817, "grad_norm": 0.005102880764752626, "learning_rate": 2.322540473225405e-05, "loss": 0.0001, "step": 752 }, { "epoch": 0.937733499377335, "grad_norm": 306.6837158203125, "learning_rate": 2.3256537982565383e-05, "loss": 3.1504, "step": 753 }, { "epoch": 0.9389788293897883, "grad_norm": 0.006043303292244673, "learning_rate": 2.328767123287671e-05, "loss": 0.0001, "step": 754 }, { "epoch": 0.9402241594022416, "grad_norm": 0.027712326496839523, "learning_rate": 2.3318804483188046e-05, "loss": 0.0008, "step": 755 }, { "epoch": 0.9414694894146949, "grad_norm": 0.015633290633559227, "learning_rate": 2.3349937733499378e-05, "loss": 0.0004, "step": 756 }, { "epoch": 0.9427148194271482, "grad_norm": 0.007909745909273624, "learning_rate": 2.338107098381071e-05, "loss": 0.0002, "step": 757 }, { "epoch": 0.9439601494396015, "grad_norm": 0.018452487885951996, "learning_rate": 
2.341220423412204e-05, "loss": 0.0004, "step": 758 }, { "epoch": 0.9452054794520548, "grad_norm": 0.010309605859220028, "learning_rate": 2.3443337484433376e-05, "loss": 0.0002, "step": 759 }, { "epoch": 0.9464508094645081, "grad_norm": 0.005897897761315107, "learning_rate": 2.3474470734744708e-05, "loss": 0.0001, "step": 760 }, { "epoch": 0.9476961394769614, "grad_norm": 0.024718550965189934, "learning_rate": 2.350560398505604e-05, "loss": 0.0007, "step": 761 }, { "epoch": 0.9489414694894147, "grad_norm": 0.014151460491120815, "learning_rate": 2.3536737235367372e-05, "loss": 0.0004, "step": 762 }, { "epoch": 0.950186799501868, "grad_norm": 0.05046864598989487, "learning_rate": 2.3567870485678707e-05, "loss": 0.0005, "step": 763 }, { "epoch": 0.9514321295143213, "grad_norm": 0.05455144867300987, "learning_rate": 2.359900373599004e-05, "loss": 0.0006, "step": 764 }, { "epoch": 0.9526774595267746, "grad_norm": 0.02435392327606678, "learning_rate": 2.363013698630137e-05, "loss": 0.0003, "step": 765 }, { "epoch": 0.9539227895392279, "grad_norm": 0.025639377534389496, "learning_rate": 2.3661270236612702e-05, "loss": 0.0005, "step": 766 }, { "epoch": 0.9551681195516812, "grad_norm": 0.015089256688952446, "learning_rate": 2.3692403486924037e-05, "loss": 0.0004, "step": 767 }, { "epoch": 0.9564134495641345, "grad_norm": 0.032805927097797394, "learning_rate": 2.372353673723537e-05, "loss": 0.0006, "step": 768 }, { "epoch": 0.9576587795765878, "grad_norm": 0.015525261871516705, "learning_rate": 2.37546699875467e-05, "loss": 0.0004, "step": 769 }, { "epoch": 0.958904109589041, "grad_norm": 0.008337048813700676, "learning_rate": 2.3785803237858036e-05, "loss": 0.0002, "step": 770 }, { "epoch": 0.9601494396014943, "grad_norm": 0.037120576947927475, "learning_rate": 2.3816936488169368e-05, "loss": 0.0004, "step": 771 }, { "epoch": 0.9613947696139477, "grad_norm": 0.01175164058804512, "learning_rate": 2.38480697384807e-05, "loss": 0.0003, "step": 772 }, { "epoch": 
0.962640099626401, "grad_norm": 0.010447794571518898, "learning_rate": 2.387920298879203e-05, "loss": 0.0003, "step": 773 }, { "epoch": 0.9638854296388543, "grad_norm": 0.010614910162985325, "learning_rate": 2.3910336239103366e-05, "loss": 0.0001, "step": 774 }, { "epoch": 0.9651307596513076, "grad_norm": 0.07238447666168213, "learning_rate": 2.3941469489414698e-05, "loss": 0.0007, "step": 775 }, { "epoch": 0.9663760896637609, "grad_norm": 0.03060179576277733, "learning_rate": 2.3972602739726026e-05, "loss": 0.0007, "step": 776 }, { "epoch": 0.9676214196762142, "grad_norm": 0.08607795089483261, "learning_rate": 2.400373599003736e-05, "loss": 0.0004, "step": 777 }, { "epoch": 0.9688667496886675, "grad_norm": 0.030211659148335457, "learning_rate": 2.4034869240348693e-05, "loss": 0.0003, "step": 778 }, { "epoch": 0.9701120797011208, "grad_norm": 0.006784611847251654, "learning_rate": 2.4066002490660025e-05, "loss": 0.0002, "step": 779 }, { "epoch": 0.9713574097135741, "grad_norm": 0.011817213147878647, "learning_rate": 2.4097135740971357e-05, "loss": 0.0003, "step": 780 }, { "epoch": 0.9726027397260274, "grad_norm": 0.029583904892206192, "learning_rate": 2.4128268991282692e-05, "loss": 0.0004, "step": 781 }, { "epoch": 0.9738480697384807, "grad_norm": 0.007558898068964481, "learning_rate": 2.4159402241594024e-05, "loss": 0.0003, "step": 782 }, { "epoch": 0.975093399750934, "grad_norm": 481.3611755371094, "learning_rate": 2.4190535491905355e-05, "loss": 2.5255, "step": 783 }, { "epoch": 0.9763387297633873, "grad_norm": 127.75431060791016, "learning_rate": 2.4221668742216687e-05, "loss": 0.0841, "step": 784 }, { "epoch": 0.9775840597758406, "grad_norm": 0.01205628365278244, "learning_rate": 2.4252801992528022e-05, "loss": 0.0004, "step": 785 }, { "epoch": 0.9788293897882939, "grad_norm": 411.4049377441406, "learning_rate": 2.4283935242839354e-05, "loss": 1.7384, "step": 786 }, { "epoch": 0.9800747198007472, "grad_norm": 1.6122777462005615, "learning_rate": 
2.4315068493150686e-05, "loss": 0.0018, "step": 787 }, { "epoch": 0.9813200498132005, "grad_norm": 0.013621006160974503, "learning_rate": 2.4346201743462018e-05, "loss": 0.0004, "step": 788 }, { "epoch": 0.9825653798256538, "grad_norm": 0.0152182187885046, "learning_rate": 2.4377334993773353e-05, "loss": 0.0003, "step": 789 }, { "epoch": 0.9838107098381071, "grad_norm": 241.25070190429688, "learning_rate": 2.4408468244084684e-05, "loss": 0.1739, "step": 790 }, { "epoch": 0.9850560398505604, "grad_norm": 0.009512806311249733, "learning_rate": 2.4439601494396016e-05, "loss": 0.0003, "step": 791 }, { "epoch": 0.9863013698630136, "grad_norm": 12.394267082214355, "learning_rate": 2.4470734744707348e-05, "loss": 0.0218, "step": 792 }, { "epoch": 0.987546699875467, "grad_norm": 0.008201587945222855, "learning_rate": 2.4501867995018683e-05, "loss": 0.0002, "step": 793 }, { "epoch": 0.9887920298879203, "grad_norm": 0.049125440418720245, "learning_rate": 2.4533001245330015e-05, "loss": 0.0006, "step": 794 }, { "epoch": 0.9900373599003736, "grad_norm": 0.0920347198843956, "learning_rate": 2.4564134495641347e-05, "loss": 0.001, "step": 795 }, { "epoch": 0.9912826899128269, "grad_norm": 35.2567253112793, "learning_rate": 2.4595267745952678e-05, "loss": 0.0267, "step": 796 }, { "epoch": 0.9925280199252802, "grad_norm": 0.01363935973495245, "learning_rate": 2.4626400996264013e-05, "loss": 0.0003, "step": 797 }, { "epoch": 0.9937733499377335, "grad_norm": 0.009647058323025703, "learning_rate": 2.4657534246575342e-05, "loss": 0.0003, "step": 798 }, { "epoch": 0.9950186799501868, "grad_norm": 0.005581174045801163, "learning_rate": 2.4688667496886674e-05, "loss": 0.0002, "step": 799 }, { "epoch": 0.9962640099626401, "grad_norm": 0.006403461564332247, "learning_rate": 2.471980074719801e-05, "loss": 0.0002, "step": 800 }, { "epoch": 0.9975093399750934, "grad_norm": 0.018721066415309906, "learning_rate": 2.475093399750934e-05, "loss": 0.0005, "step": 801 }, { "epoch": 
0.9987546699875467, "grad_norm": 0.0068865250796079636, "learning_rate": 2.4782067247820672e-05, "loss": 0.0002, "step": 802 }, { "epoch": 1.0, "grad_norm": 148.17623901367188, "learning_rate": 2.4813200498132004e-05, "loss": 0.2457, "step": 803 }, { "epoch": 1.0, "eval_accuracy": 0.9760765550239234, "eval_f1_macro": 0.9768339768339769, "eval_f1_micro": 0.9760765550239234, "eval_f1_weighted": 0.9760457655194498, "eval_loss": 0.244761124253273, "eval_precision_macro": 0.978448275862069, "eval_precision_micro": 0.9760765550239234, "eval_precision_weighted": 0.978138920970137, "eval_recall_macro": 0.9772727272727273, "eval_recall_micro": 0.9760765550239234, "eval_recall_weighted": 0.9760765550239234, "eval_runtime": 29.9929, "eval_samples_per_second": 6.968, "eval_steps_per_second": 0.467, "step": 803 } ], "logging_steps": 1, "max_steps": 16060, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.394707013520589e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }