{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3035, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016474464579901153, "grad_norm": 5.606867346944261, "learning_rate": 2.6315789473684213e-07, "loss": 0.9609, "step": 1 }, { "epoch": 0.0032948929159802307, "grad_norm": 5.5030408042526275, "learning_rate": 5.263157894736843e-07, "loss": 0.945, "step": 2 }, { "epoch": 0.004942339373970346, "grad_norm": 5.524544274919888, "learning_rate": 7.894736842105263e-07, "loss": 0.9564, "step": 3 }, { "epoch": 0.006589785831960461, "grad_norm": 5.508421175275815, "learning_rate": 1.0526315789473685e-06, "loss": 0.953, "step": 4 }, { "epoch": 0.008237232289950576, "grad_norm": 5.477570713342696, "learning_rate": 1.3157894736842106e-06, "loss": 0.9511, "step": 5 }, { "epoch": 0.009884678747940691, "grad_norm": 5.109337058599501, "learning_rate": 1.5789473684210526e-06, "loss": 0.9377, "step": 6 }, { "epoch": 0.011532125205930808, "grad_norm": 4.223637432908828, "learning_rate": 1.8421052631578948e-06, "loss": 0.918, "step": 7 }, { "epoch": 0.013179571663920923, "grad_norm": 4.09298106079945, "learning_rate": 2.105263157894737e-06, "loss": 0.9226, "step": 8 }, { "epoch": 0.014827018121911038, "grad_norm": 3.807688465163409, "learning_rate": 2.368421052631579e-06, "loss": 0.9044, "step": 9 }, { "epoch": 0.016474464579901153, "grad_norm": 2.2811370351583076, "learning_rate": 2.631578947368421e-06, "loss": 0.8492, "step": 10 }, { "epoch": 0.018121911037891267, "grad_norm": 2.087009259701483, "learning_rate": 2.8947368421052634e-06, "loss": 0.8536, "step": 11 }, { "epoch": 0.019769357495881382, "grad_norm": 2.0441175712514434, "learning_rate": 3.157894736842105e-06, "loss": 0.8532, "step": 12 }, { "epoch": 0.0214168039538715, "grad_norm": 3.452089775076319, "learning_rate": 3.421052631578948e-06, "loss": 0.8483, "step": 13 }, { "epoch": 0.023064250411861616, "grad_norm": 3.7205931385613775, "learning_rate": 3.6842105263157896e-06, "loss": 0.8488, "step": 14 }, { "epoch": 0.02471169686985173, "grad_norm": 3.613035906102933, "learning_rate": 3.947368421052632e-06, "loss": 0.8371, "step": 15 }, { "epoch": 0.026359143327841845, "grad_norm": 3.494344949318209, "learning_rate": 4.210526315789474e-06, "loss": 0.8292, "step": 16 }, { "epoch": 0.02800658978583196, "grad_norm": 2.6541020816296723, "learning_rate": 4.473684210526316e-06, "loss": 0.7935, "step": 17 }, { "epoch": 0.029654036243822075, "grad_norm": 2.3310669904049526, "learning_rate": 4.736842105263158e-06, "loss": 0.7712, "step": 18 }, { "epoch": 0.03130148270181219, "grad_norm": 2.0100627448218016, "learning_rate": 5e-06, "loss": 0.7628, "step": 19 }, { "epoch": 0.032948929159802305, "grad_norm": 1.5860147656550168, "learning_rate": 5.263157894736842e-06, "loss": 0.7683, "step": 20 }, { "epoch": 0.03459637561779242, "grad_norm": 1.2927018041850777, "learning_rate": 5.526315789473685e-06, "loss": 0.7477, "step": 21 }, { "epoch": 0.036243822075782535, "grad_norm": 1.3063331971644814, "learning_rate": 5.789473684210527e-06, "loss": 0.7384, "step": 22 }, { "epoch": 0.03789126853377265, "grad_norm": 1.3688198891184145, "learning_rate": 6.0526315789473685e-06, "loss": 0.7269, "step": 23 }, { "epoch": 0.039538714991762765, "grad_norm": 1.3830905151640158, "learning_rate": 6.31578947368421e-06, "loss": 0.7292, "step": 24 }, { "epoch": 0.04118616144975288, "grad_norm": 1.1335441724401047, "learning_rate": 6.578947368421054e-06, "loss": 0.7141, "step": 25 }, { "epoch": 0.042833607907743, "grad_norm": 0.9665536012724556, "learning_rate": 6.842105263157896e-06, "loss": 0.7159, "step": 26 }, { "epoch": 0.044481054365733116, "grad_norm": 0.9250723581049018, "learning_rate": 7.1052631578947375e-06, "loss": 0.6896, "step": 27 }, { "epoch": 0.04612850082372323, "grad_norm": 0.8809472618224354, "learning_rate": 7.368421052631579e-06, "loss": 0.6964, "step": 28 }, { "epoch": 0.047775947281713346, "grad_norm": 0.8047046501099524, "learning_rate": 7.631578947368423e-06, "loss": 0.6919, "step": 29 }, { "epoch": 0.04942339373970346, "grad_norm": 0.7739921031888567, "learning_rate": 7.894736842105265e-06, "loss": 0.6734, "step": 30 }, { "epoch": 0.051070840197693576, "grad_norm": 0.8570577929723563, "learning_rate": 8.157894736842106e-06, "loss": 0.6809, "step": 31 }, { "epoch": 0.05271828665568369, "grad_norm": 0.6313688361586013, "learning_rate": 8.421052631578948e-06, "loss": 0.6623, "step": 32 }, { "epoch": 0.054365733113673806, "grad_norm": 0.6416916697327776, "learning_rate": 8.68421052631579e-06, "loss": 0.6612, "step": 33 }, { "epoch": 0.05601317957166392, "grad_norm": 0.6665242546602088, "learning_rate": 8.947368421052632e-06, "loss": 0.6659, "step": 34 }, { "epoch": 0.057660626029654036, "grad_norm": 0.5631482419593337, "learning_rate": 9.210526315789474e-06, "loss": 0.6617, "step": 35 }, { "epoch": 0.05930807248764415, "grad_norm": 0.5331745162877777, "learning_rate": 9.473684210526315e-06, "loss": 0.6476, "step": 36 }, { "epoch": 0.060955518945634266, "grad_norm": 0.48612133029619203, "learning_rate": 9.736842105263159e-06, "loss": 0.6394, "step": 37 }, { "epoch": 0.06260296540362438, "grad_norm": 0.47632070933774906, "learning_rate": 1e-05, "loss": 0.6437, "step": 38 }, { "epoch": 0.0642504118616145, "grad_norm": 0.45923880014427315, "learning_rate": 1.0263157894736844e-05, "loss": 0.6394, "step": 39 }, { "epoch": 0.06589785831960461, "grad_norm": 0.4462589516524617, "learning_rate": 1.0526315789473684e-05, "loss": 0.6449, "step": 40 }, { "epoch": 0.06754530477759473, "grad_norm": 0.38305849820960597, "learning_rate": 1.0789473684210528e-05, "loss": 0.6434, "step": 41 }, { "epoch": 0.06919275123558484, "grad_norm": 0.405533566112312, "learning_rate": 1.105263157894737e-05, "loss": 0.6403, "step": 42 }, { "epoch": 0.07084019769357495, "grad_norm": 0.34839285738973025, "learning_rate": 1.1315789473684212e-05, "loss": 0.6293, "step": 43 }, { "epoch": 0.07248764415156507, "grad_norm": 0.4197060050380805, "learning_rate": 1.1578947368421053e-05, "loss": 0.6285, "step": 44 }, { "epoch": 0.07413509060955518, "grad_norm": 0.35879086820924383, "learning_rate": 1.1842105263157895e-05, "loss": 0.6209, "step": 45 }, { "epoch": 0.0757825370675453, "grad_norm": 0.3264974539302913, "learning_rate": 1.2105263157894737e-05, "loss": 0.621, "step": 46 }, { "epoch": 0.07742998352553541, "grad_norm": 0.36657105191934125, "learning_rate": 1.236842105263158e-05, "loss": 0.6332, "step": 47 }, { "epoch": 0.07907742998352553, "grad_norm": 0.3199549464474638, "learning_rate": 1.263157894736842e-05, "loss": 0.615, "step": 48 }, { "epoch": 0.08072487644151564, "grad_norm": 0.3132307626030351, "learning_rate": 1.2894736842105264e-05, "loss": 0.6202, "step": 49 }, { "epoch": 0.08237232289950576, "grad_norm": 0.3288995653866969, "learning_rate": 1.3157894736842108e-05, "loss": 0.6241, "step": 50 }, { "epoch": 0.08401976935749589, "grad_norm": 0.3102139173669284, "learning_rate": 1.3421052631578948e-05, "loss": 0.6145, "step": 51 }, { "epoch": 0.085667215815486, "grad_norm": 0.36158780034937954, "learning_rate": 1.3684210526315791e-05, "loss": 0.6027, "step": 52 }, { "epoch": 0.08731466227347612, "grad_norm": 0.3068211094871939, "learning_rate": 1.3947368421052631e-05, "loss": 0.6146, "step": 53 }, { "epoch": 0.08896210873146623, "grad_norm": 0.29638378848449565, "learning_rate": 1.4210526315789475e-05, "loss": 0.6067, "step": 54 }, { "epoch": 0.09060955518945635, "grad_norm": 0.3658359791450759, "learning_rate": 1.4473684210526317e-05, "loss": 0.6147, "step": 55 }, { "epoch": 0.09225700164744646, "grad_norm": 0.31914407264471356, "learning_rate": 1.4736842105263159e-05, "loss": 0.6029, "step": 56 }, { "epoch": 0.09390444810543658, "grad_norm": 0.353864966322472, "learning_rate": 1.5000000000000002e-05, "loss": 0.6006, "step": 57 }, { "epoch": 0.09555189456342669, "grad_norm": 0.3086005577693372, "learning_rate": 1.5263157894736846e-05, "loss": 0.6026, "step": 58 }, { "epoch": 0.09719934102141681, "grad_norm": 0.3317752234362949, "learning_rate": 1.5526315789473686e-05, "loss": 0.6014, "step": 59 }, { "epoch": 0.09884678747940692, "grad_norm": 0.29220969052913914, "learning_rate": 1.578947368421053e-05, "loss": 0.602, "step": 60 }, { "epoch": 0.10049423393739704, "grad_norm": 0.28767596732818435, "learning_rate": 1.605263157894737e-05, "loss": 0.5829, "step": 61 }, { "epoch": 0.10214168039538715, "grad_norm": 0.32934619017511707, "learning_rate": 1.6315789473684213e-05, "loss": 0.6029, "step": 62 }, { "epoch": 0.10378912685337727, "grad_norm": 0.32730903548601903, "learning_rate": 1.6578947368421053e-05, "loss": 0.5896, "step": 63 }, { "epoch": 0.10543657331136738, "grad_norm": 0.43006744674851627, "learning_rate": 1.6842105263157896e-05, "loss": 0.5879, "step": 64 }, { "epoch": 0.1070840197693575, "grad_norm": 0.40956875563823736, "learning_rate": 1.7105263157894737e-05, "loss": 0.5945, "step": 65 }, { "epoch": 0.10873146622734761, "grad_norm": 0.3731568405902921, "learning_rate": 1.736842105263158e-05, "loss": 0.5938, "step": 66 }, { "epoch": 0.11037891268533773, "grad_norm": 0.49011652547020024, "learning_rate": 1.763157894736842e-05, "loss": 0.5945, "step": 67 }, { "epoch": 0.11202635914332784, "grad_norm": 0.5774743593873469, "learning_rate": 1.7894736842105264e-05, "loss": 0.5881, "step": 68 }, { "epoch": 0.11367380560131796, "grad_norm": 0.49658639012431705, "learning_rate": 1.8157894736842107e-05, "loss": 0.5919, "step": 69 }, { "epoch": 0.11532125205930807, "grad_norm": 0.6928985851874031, "learning_rate": 1.8421052631578947e-05, "loss": 0.5849, "step": 70 }, { "epoch": 0.11696869851729819, "grad_norm": 1.0150791732647075, "learning_rate": 1.868421052631579e-05, "loss": 0.5919, "step": 71 }, { "epoch": 0.1186161449752883, "grad_norm": 1.084028281161954, "learning_rate": 1.894736842105263e-05, "loss": 0.594, "step": 72 }, { "epoch": 0.12026359143327842, "grad_norm": 0.5798235825816589, "learning_rate": 1.9210526315789474e-05, "loss": 0.5831, "step": 73 }, { "epoch": 0.12191103789126853, "grad_norm": 0.9840294604221991, "learning_rate": 1.9473684210526318e-05, "loss": 0.589, "step": 74 }, { "epoch": 0.12355848434925865, "grad_norm": 1.027363372158991, "learning_rate": 1.9736842105263158e-05, "loss": 0.5889, "step": 75 }, { "epoch": 0.12520593080724876, "grad_norm": 0.8289600065347142, "learning_rate": 2e-05, "loss": 0.5953, "step": 76 }, { "epoch": 0.12685337726523888, "grad_norm": 1.2114930428830757, "learning_rate": 2.0263157894736842e-05, "loss": 0.5941, "step": 77 }, { "epoch": 0.128500823723229, "grad_norm": 0.7031783894979402, "learning_rate": 2.052631578947369e-05, "loss": 0.5851, "step": 78 }, { "epoch": 0.1301482701812191, "grad_norm": 0.960726872462596, "learning_rate": 2.078947368421053e-05, "loss": 0.5857, "step": 79 }, { "epoch": 0.13179571663920922, "grad_norm": 0.8276466392439115, "learning_rate": 2.105263157894737e-05, "loss": 0.5754, "step": 80 }, { "epoch": 0.13344316309719934, "grad_norm": 1.006336646779249, "learning_rate": 2.1315789473684216e-05, "loss": 0.5826, "step": 81 }, { "epoch": 0.13509060955518945, "grad_norm": 1.0549561406018424, "learning_rate": 2.1578947368421056e-05, "loss": 0.5688, "step": 82 }, { "epoch": 0.13673805601317957, "grad_norm": 0.647906987486769, "learning_rate": 2.1842105263157896e-05, "loss": 0.5768, "step": 83 }, { "epoch": 0.13838550247116968, "grad_norm": 0.9870276685201461, "learning_rate": 2.210526315789474e-05, "loss": 0.5739, "step": 84 }, { "epoch": 0.1400329489291598, "grad_norm": 0.7762628359008718, "learning_rate": 2.2368421052631583e-05, "loss": 0.5876, "step": 85 }, { "epoch": 0.1416803953871499, "grad_norm": 0.9038917922095141, "learning_rate": 2.2631578947368423e-05, "loss": 0.5709, "step": 86 }, { "epoch": 0.14332784184514002, "grad_norm": 0.9260523789992919, "learning_rate": 2.2894736842105263e-05, "loss": 0.5809, "step": 87 }, { "epoch": 0.14497528830313014, "grad_norm": 0.9685814173292929, "learning_rate": 2.3157894736842107e-05, "loss": 0.5742, "step": 88 }, { "epoch": 0.14662273476112025, "grad_norm": 1.2358283733908286, "learning_rate": 2.342105263157895e-05, "loss": 0.5797, "step": 89 }, { "epoch": 0.14827018121911037, "grad_norm": 0.9405460727811819, "learning_rate": 2.368421052631579e-05, "loss": 0.5647, "step": 90 }, { "epoch": 0.14991762767710048, "grad_norm": 1.0813487708487648, "learning_rate": 2.3947368421052634e-05, "loss": 0.5714, "step": 91 }, { "epoch": 0.1515650741350906, "grad_norm": 0.7307822442517492, "learning_rate": 2.4210526315789474e-05, "loss": 0.5733, "step": 92 }, { "epoch": 0.15321252059308071, "grad_norm": 1.204714365175125, "learning_rate": 2.4473684210526318e-05, "loss": 0.5732, "step": 93 }, { "epoch": 0.15485996705107083, "grad_norm": 0.6335770089850356, "learning_rate": 2.473684210526316e-05, "loss": 0.5755, "step": 94 }, { "epoch": 0.15650741350906094, "grad_norm": 1.073074079441158, "learning_rate": 2.5e-05, "loss": 0.5678, "step": 95 }, { "epoch": 0.15815485996705106, "grad_norm": 0.8664732475671978, "learning_rate": 2.526315789473684e-05, "loss": 0.575, "step": 96 }, { "epoch": 0.15980230642504117, "grad_norm": 1.0991682561343303, "learning_rate": 2.5526315789473688e-05, "loss": 0.5694, "step": 97 }, { "epoch": 0.1614497528830313, "grad_norm": 0.9018754204024617, "learning_rate": 2.578947368421053e-05, "loss": 0.573, "step": 98 }, { "epoch": 0.1630971993410214, "grad_norm": 0.8969427095038454, "learning_rate": 2.605263157894737e-05, "loss": 0.5699, "step": 99 }, { "epoch": 0.16474464579901152, "grad_norm": 0.8586222468870607, "learning_rate": 2.6315789473684215e-05, "loss": 0.5652, "step": 100 }, { "epoch": 0.16639209225700163, "grad_norm": 1.067444385363332, "learning_rate": 2.6578947368421055e-05, "loss": 0.5599, "step": 101 }, { "epoch": 0.16803953871499178, "grad_norm": 1.3678427302282534, "learning_rate": 2.6842105263157896e-05, "loss": 0.566, "step": 102 }, { "epoch": 0.1696869851729819, "grad_norm": 0.7345341275035281, "learning_rate": 2.7105263157894742e-05, "loss": 0.5645, "step": 103 }, { "epoch": 0.171334431630972, "grad_norm": 1.048926919214303, "learning_rate": 2.7368421052631583e-05, "loss": 0.5615, "step": 104 }, { "epoch": 0.17298187808896212, "grad_norm": 1.3882487285597853, "learning_rate": 2.7631578947368423e-05, "loss": 0.5656, "step": 105 }, { "epoch": 0.17462932454695224, "grad_norm": 0.719275589578617, "learning_rate": 2.7894736842105263e-05, "loss": 0.5639, "step": 106 }, { "epoch": 0.17627677100494235, "grad_norm": 1.4037154922104396, "learning_rate": 2.815789473684211e-05, "loss": 0.573, "step": 107 }, { "epoch": 0.17792421746293247, "grad_norm": 0.8010466442306526, "learning_rate": 2.842105263157895e-05, "loss": 0.57, "step": 108 }, { "epoch": 0.17957166392092258, "grad_norm": 1.09060669413238, "learning_rate": 2.868421052631579e-05, "loss": 0.5595, "step": 109 }, { "epoch": 0.1812191103789127, "grad_norm": 0.879691420330049, "learning_rate": 2.8947368421052634e-05, "loss": 0.5644, "step": 110 }, { "epoch": 0.1828665568369028, "grad_norm": 0.7290189617880098, "learning_rate": 2.9210526315789477e-05, "loss": 0.5569, "step": 111 }, { "epoch": 0.18451400329489293, "grad_norm": 0.8235980505278054, "learning_rate": 2.9473684210526317e-05, "loss": 0.5608, "step": 112 }, { "epoch": 0.18616144975288304, "grad_norm": 1.202423085058067, "learning_rate": 2.973684210526316e-05, "loss": 0.5638, "step": 113 }, { "epoch": 0.18780889621087316, "grad_norm": 0.7576311183262442, "learning_rate": 3.0000000000000004e-05, "loss": 0.5613, "step": 114 }, { "epoch": 0.18945634266886327, "grad_norm": 1.2898308354668777, "learning_rate": 3.0263157894736844e-05, "loss": 0.562, "step": 115 }, { "epoch": 0.19110378912685339, "grad_norm": 1.2394716114835673, "learning_rate": 3.052631578947369e-05, "loss": 0.5559, "step": 116 }, { "epoch": 0.1927512355848435, "grad_norm": 1.018425291867, "learning_rate": 3.078947368421053e-05, "loss": 0.5553, "step": 117 }, { "epoch": 0.19439868204283361, "grad_norm": 0.7630734184108184, "learning_rate": 3.105263157894737e-05, "loss": 0.5536, "step": 118 }, { "epoch": 0.19604612850082373, "grad_norm": 0.7254727885011482, "learning_rate": 3.1315789473684215e-05, "loss": 0.5604, "step": 119 }, { "epoch": 0.19769357495881384, "grad_norm": 1.35994903255289, "learning_rate": 3.157894736842106e-05, "loss": 0.5684, "step": 120 }, { "epoch": 0.19934102141680396, "grad_norm": 0.982935383402244, "learning_rate": 3.1842105263157895e-05, "loss": 0.5716, "step": 121 }, { "epoch": 0.20098846787479407, "grad_norm": 1.301396257038502, "learning_rate": 3.210526315789474e-05, "loss": 0.5574, "step": 122 }, { "epoch": 0.2026359143327842, "grad_norm": 0.5801481715585196, "learning_rate": 3.236842105263158e-05, "loss": 0.5496, "step": 123 }, { "epoch": 0.2042833607907743, "grad_norm": 1.0155381099999612, "learning_rate": 3.2631578947368426e-05, "loss": 0.5505, "step": 124 }, { "epoch": 0.20593080724876442, "grad_norm": 1.0469752880441707, "learning_rate": 3.289473684210526e-05, "loss": 0.5503, "step": 125 }, { "epoch": 0.20757825370675453, "grad_norm": 1.0724214805779027, "learning_rate": 3.3157894736842106e-05, "loss": 0.5526, "step": 126 }, { "epoch": 0.20922570016474465, "grad_norm": 1.3170966673285873, "learning_rate": 3.342105263157895e-05, "loss": 0.5586, "step": 127 }, { "epoch": 0.21087314662273476, "grad_norm": 0.7609795614311489, "learning_rate": 3.368421052631579e-05, "loss": 0.549, "step": 128 }, { "epoch": 0.21252059308072488, "grad_norm": 1.1849926489099287, "learning_rate": 3.3947368421052636e-05, "loss": 0.5556, "step": 129 }, { "epoch": 0.214168039538715, "grad_norm": 1.2035749122347912, "learning_rate": 3.421052631578947e-05, "loss": 0.5566, "step": 130 }, { "epoch": 0.2158154859967051, "grad_norm": 1.6173966610508668, "learning_rate": 3.447368421052632e-05, "loss": 0.5671, "step": 131 }, { "epoch": 0.21746293245469522, "grad_norm": 0.8761042655367213, "learning_rate": 3.473684210526316e-05, "loss": 0.5513, "step": 132 }, { "epoch": 0.21911037891268534, "grad_norm": 2.044870385477803, "learning_rate": 3.5000000000000004e-05, "loss": 0.5754, "step": 133 }, { "epoch": 0.22075782537067545, "grad_norm": 1.2758804852721555, "learning_rate": 3.526315789473684e-05, "loss": 0.5582, "step": 134 }, { "epoch": 0.22240527182866557, "grad_norm": 2.0404980529727212, "learning_rate": 3.552631578947369e-05, "loss": 0.579, "step": 135 }, { "epoch": 0.22405271828665568, "grad_norm": 1.665963336665113, "learning_rate": 3.578947368421053e-05, "loss": 0.5662, "step": 136 }, { "epoch": 0.2257001647446458, "grad_norm": 1.9811587573344454, "learning_rate": 3.605263157894737e-05, "loss": 0.5548, "step": 137 }, { "epoch": 0.2273476112026359, "grad_norm": 1.4896909208576015, "learning_rate": 3.6315789473684214e-05, "loss": 0.5623, "step": 138 }, { "epoch": 0.22899505766062603, "grad_norm": 1.8916949300041206, "learning_rate": 3.657894736842106e-05, "loss": 0.561, "step": 139 }, { "epoch": 0.23064250411861614, "grad_norm": 0.9959614915975089, "learning_rate": 3.6842105263157895e-05, "loss": 0.5606, "step": 140 }, { "epoch": 0.23228995057660626, "grad_norm": 2.483652083739719, "learning_rate": 3.7105263157894745e-05, "loss": 0.5687, "step": 141 }, { "epoch": 0.23393739703459637, "grad_norm": 2.026757446306994, "learning_rate": 3.736842105263158e-05, "loss": 0.5631, "step": 142 }, { "epoch": 0.2355848434925865, "grad_norm": 1.4490195627203315, "learning_rate": 3.7631578947368425e-05, "loss": 0.5596, "step": 143 }, { "epoch": 0.2372322899505766, "grad_norm": 1.3561950735853467, "learning_rate": 3.789473684210526e-05, "loss": 0.5529, "step": 144 }, { "epoch": 0.23887973640856672, "grad_norm": 1.2461684284376127, "learning_rate": 3.815789473684211e-05, "loss": 0.5577, "step": 145 }, { "epoch": 0.24052718286655683, "grad_norm": 1.26255711260266, "learning_rate": 3.842105263157895e-05, "loss": 0.5505, "step": 146 }, { "epoch": 0.24217462932454695, "grad_norm": 1.153627928570833, "learning_rate": 3.868421052631579e-05, "loss": 0.5554, "step": 147 }, { "epoch": 0.24382207578253706, "grad_norm": 0.9797319869798142, "learning_rate": 3.8947368421052636e-05, "loss": 0.545, "step": 148 }, { "epoch": 0.24546952224052718, "grad_norm": 1.2965416509974745, "learning_rate": 3.921052631578948e-05, "loss": 0.5653, "step": 149 }, { "epoch": 0.2471169686985173, "grad_norm": 1.0390667051038687, "learning_rate": 3.9473684210526316e-05, "loss": 0.5533, "step": 150 }, { "epoch": 0.2487644151565074, "grad_norm": 1.4233149253840334, "learning_rate": 3.973684210526316e-05, "loss": 0.5594, "step": 151 }, { "epoch": 0.2504118616144975, "grad_norm": 0.849821635678982, "learning_rate": 4e-05, "loss": 0.5551, "step": 152 }, { "epoch": 0.25205930807248766, "grad_norm": 1.5577346170292459, "learning_rate": 4.026315789473685e-05, "loss": 0.554, "step": 153 }, { "epoch": 0.25370675453047775, "grad_norm": 1.148611630961357, "learning_rate": 4.0526315789473684e-05, "loss": 0.558, "step": 154 }, { "epoch": 0.2553542009884679, "grad_norm": 1.3144640789488864, "learning_rate": 4.078947368421053e-05, "loss": 0.5534, "step": 155 }, { "epoch": 0.257001647446458, "grad_norm": 1.153760952279454, "learning_rate": 4.105263157894738e-05, "loss": 0.5468, "step": 156 }, { "epoch": 0.2586490939044481, "grad_norm": 1.2030772433981922, "learning_rate": 4.1315789473684214e-05, "loss": 0.5527, "step": 157 }, { "epoch": 0.2602965403624382, "grad_norm": 1.123721724221617, "learning_rate": 4.157894736842106e-05, "loss": 0.5444, "step": 158 }, { "epoch": 0.26194398682042835, "grad_norm": 1.2555055135603268, "learning_rate": 4.1842105263157894e-05, "loss": 0.5477, "step": 159 }, { "epoch": 0.26359143327841844, "grad_norm": 1.1213466136443964, "learning_rate": 4.210526315789474e-05, "loss": 0.5598, "step": 160 }, { "epoch": 0.2652388797364086, "grad_norm": 0.7656707413042361, "learning_rate": 4.236842105263158e-05, "loss": 0.5441, "step": 161 }, { "epoch": 0.26688632619439867, "grad_norm": 0.9610244028784484, "learning_rate": 4.263157894736843e-05, "loss": 0.5463, "step": 162 }, { "epoch": 0.2685337726523888, "grad_norm": 1.0727177908338896, "learning_rate": 4.289473684210527e-05, "loss": 0.5463, "step": 163 }, { "epoch": 0.2701812191103789, "grad_norm": 1.711923801499504, "learning_rate": 4.315789473684211e-05, "loss": 0.5558, "step": 164 }, { "epoch": 0.27182866556836904, "grad_norm": 0.7058981593239295, "learning_rate": 4.342105263157895e-05, "loss": 0.55, "step": 165 }, { "epoch": 0.27347611202635913, "grad_norm": 2.26422593677963, "learning_rate": 4.368421052631579e-05, "loss": 0.5559, "step": 166 }, { "epoch": 0.2751235584843493, "grad_norm": 1.0601196130358697, "learning_rate": 4.394736842105263e-05, "loss": 0.5497, "step": 167 }, { "epoch": 0.27677100494233936, "grad_norm": 2.495656848713433, "learning_rate": 4.421052631578948e-05, "loss": 0.5639, "step": 168 }, { "epoch": 0.2784184514003295, "grad_norm": 2.170944749668105, "learning_rate": 4.447368421052632e-05, "loss": 0.5634, "step": 169 }, { "epoch": 0.2800658978583196, "grad_norm": 1.818126033995904, "learning_rate": 4.4736842105263166e-05, "loss": 0.5571, "step": 170 }, { "epoch": 0.28171334431630973, "grad_norm": 1.6937451235924088, "learning_rate": 4.5e-05, "loss": 0.5612, "step": 171 }, { "epoch": 0.2833607907742998, "grad_norm": 1.473736626044901, "learning_rate": 4.5263157894736846e-05, "loss": 0.5544, "step": 172 }, { "epoch": 0.28500823723228996, "grad_norm": 1.394628030780419, "learning_rate": 4.552631578947368e-05, "loss": 0.5533, "step": 173 }, { "epoch": 0.28665568369028005, "grad_norm": 1.0050012794552983, "learning_rate": 4.5789473684210527e-05, "loss": 0.5553, "step": 174 }, { "epoch": 0.2883031301482702, "grad_norm": 1.127083298598365, "learning_rate": 4.605263157894738e-05, "loss": 0.544, "step": 175 }, { "epoch": 0.2899505766062603, "grad_norm": 1.3984344059897187, "learning_rate": 4.6315789473684214e-05, "loss": 0.543, "step": 176 }, { "epoch": 0.2915980230642504, "grad_norm": 0.9775413462011177, "learning_rate": 4.657894736842106e-05, "loss": 0.5387, "step": 177 }, { "epoch": 0.2932454695222405, "grad_norm": 1.3977423762899852, "learning_rate": 4.68421052631579e-05, "loss": 0.5495, "step": 178 }, { "epoch": 0.29489291598023065, "grad_norm": 0.8021416421900878, "learning_rate": 4.710526315789474e-05, "loss": 0.5383, "step": 179 }, { "epoch": 0.29654036243822074, "grad_norm": 1.3422109888201612, "learning_rate": 4.736842105263158e-05, "loss": 0.5475, "step": 180 }, { "epoch": 0.2981878088962109, "grad_norm": 0.7799079156058146, "learning_rate": 4.763157894736843e-05, "loss": 0.5504, "step": 181 }, { "epoch": 0.29983525535420097, "grad_norm": 1.3281076880770857, "learning_rate": 4.789473684210527e-05, "loss": 0.5524, "step": 182 }, { "epoch": 0.3014827018121911, "grad_norm": 0.9006223334904306, "learning_rate": 4.815789473684211e-05, "loss": 0.546, "step": 183 }, { "epoch": 0.3031301482701812, "grad_norm": 1.314752176516327, "learning_rate": 4.842105263157895e-05, "loss": 0.5495, "step": 184 }, { "epoch": 0.30477759472817134, "grad_norm": 1.1626447428754518, "learning_rate": 4.868421052631579e-05, "loss": 0.5553, "step": 185 }, { "epoch": 0.30642504118616143, "grad_norm": 1.2750889477750524, "learning_rate": 4.8947368421052635e-05, "loss": 0.5527, "step": 186 }, { "epoch": 0.30807248764415157, "grad_norm": 1.0508165413397874, "learning_rate": 4.921052631578948e-05, "loss": 0.5447, "step": 187 }, { "epoch": 0.30971993410214166, "grad_norm": 1.1209321305206288, "learning_rate": 4.947368421052632e-05, "loss": 0.5395, "step": 188 }, { "epoch": 0.3113673805601318, "grad_norm": 1.262204330726308, "learning_rate": 4.9736842105263166e-05, "loss": 0.5482, "step": 189 }, { "epoch": 0.3130148270181219, "grad_norm": 1.514194804212302, "learning_rate": 5e-05, "loss": 0.5436, "step": 190 }, { "epoch": 0.31466227347611203, "grad_norm": 0.8839639175318805, "learning_rate": 5.0263157894736846e-05, "loss": 0.5355, "step": 191 }, { "epoch": 0.3163097199341021, "grad_norm": 1.533210670619606, "learning_rate": 5.052631578947368e-05, "loss": 0.5384, "step": 192 }, { "epoch": 0.31795716639209226, "grad_norm": 1.1703818506391075, "learning_rate": 5.0789473684210526e-05, "loss": 0.5414, "step": 193 }, { "epoch": 0.31960461285008235, "grad_norm": 1.1467765306350612, "learning_rate": 5.1052631578947376e-05, "loss": 0.5304, "step": 194 }, { "epoch": 0.3212520593080725, "grad_norm": 1.3133003334891802, "learning_rate": 5.131578947368422e-05, "loss": 0.5424, "step": 195 }, { "epoch": 0.3228995057660626, "grad_norm": 0.7753324575008489, "learning_rate": 5.157894736842106e-05, "loss": 0.5392, "step": 196 }, { "epoch": 0.3245469522240527, "grad_norm": 0.9500187703267653, "learning_rate": 5.18421052631579e-05, "loss": 0.5355, "step": 197 }, { "epoch": 0.3261943986820428, "grad_norm": 1.876143757871946, "learning_rate": 5.210526315789474e-05, "loss": 0.5525, "step": 198 }, { "epoch": 0.32784184514003295, "grad_norm": 0.9746710290768762, "learning_rate": 5.236842105263158e-05, "loss": 0.5348, "step": 199 }, { "epoch": 0.32948929159802304, "grad_norm": 1.81493153377205, "learning_rate": 5.263157894736843e-05, "loss": 0.5526, "step": 200 }, { "epoch": 0.3311367380560132, "grad_norm": 1.5743805242693867, "learning_rate": 5.289473684210527e-05, "loss": 0.5402, "step": 201 }, { "epoch": 0.33278418451400327, "grad_norm": 1.460714135119671, "learning_rate": 5.315789473684211e-05, "loss": 0.5383, "step": 202 }, { "epoch": 0.3344316309719934, "grad_norm": 1.4323452647537098, "learning_rate": 5.3421052631578954e-05, "loss": 0.5529, "step": 203 }, { "epoch": 0.33607907742998355, "grad_norm": 1.4511376835962562, "learning_rate": 5.368421052631579e-05, "loss": 0.5427, "step": 204 }, { "epoch": 0.33772652388797364, "grad_norm": 1.0617674459432958, "learning_rate": 5.3947368421052635e-05, "loss": 0.5358, "step": 205 }, { "epoch": 0.3393739703459638, "grad_norm": 1.7710187667223232, "learning_rate": 5.4210526315789485e-05, "loss": 0.5415, "step": 206 }, { "epoch": 0.34102141680395387, "grad_norm": 1.5633017652216616, "learning_rate": 5.447368421052632e-05, "loss": 0.5453, "step": 207 }, { "epoch": 0.342668863261944, "grad_norm": 1.1965990185998416, "learning_rate": 5.4736842105263165e-05, "loss": 0.5396, "step": 208 }, { "epoch": 0.3443163097199341, "grad_norm": 1.2220221632404487, "learning_rate": 5.5e-05, "loss": 0.5403, "step": 209 }, { "epoch": 0.34596375617792424, "grad_norm": 0.9153644236113267, "learning_rate": 5.5263157894736845e-05, "loss": 0.5368, "step": 210 }, { "epoch": 0.34761120263591433, "grad_norm": 1.1710160529409317, "learning_rate": 5.552631578947369e-05, "loss": 0.554, "step": 211 }, { "epoch": 0.34925864909390447, "grad_norm": 1.2417048800711934, "learning_rate": 5.5789473684210526e-05, "loss": 0.5438, "step": 212 }, { "epoch": 0.35090609555189456, "grad_norm": 1.0573213120989264, "learning_rate": 5.6052631578947376e-05, "loss": 0.5375, "step": 213 }, { "epoch": 0.3525535420098847, "grad_norm": 1.2656188872053546, "learning_rate": 5.631578947368422e-05, "loss": 0.5413, "step": 214 }, { "epoch": 0.3542009884678748, "grad_norm": 1.1407306295625592, "learning_rate": 5.6578947368421056e-05, "loss": 0.5341, "step": 215 }, { "epoch": 0.35584843492586493, "grad_norm": 1.3810101836456212, "learning_rate": 5.68421052631579e-05, "loss": 0.5433, "step": 216 }, { "epoch": 0.357495881383855, "grad_norm": 0.7797198168987864, "learning_rate": 5.7105263157894736e-05, "loss": 0.5419, "step": 217 }, { "epoch": 0.35914332784184516, "grad_norm": 0.8091875128902628, "learning_rate": 5.736842105263158e-05, "loss": 0.538, "step": 218 }, { "epoch": 0.36079077429983525, "grad_norm": 1.238166046889853, "learning_rate": 5.763157894736843e-05, "loss": 0.5389, "step": 219 }, { "epoch": 0.3624382207578254, "grad_norm": 1.1955135522690308, "learning_rate": 5.789473684210527e-05, "loss": 0.5365, "step": 220 }, { "epoch": 0.3640856672158155, "grad_norm": 1.1616383208018555, "learning_rate": 5.815789473684211e-05, "loss": 0.5428, "step": 221 }, { "epoch": 0.3657331136738056, "grad_norm": 1.252241470084541, "learning_rate": 5.8421052631578954e-05, "loss": 0.5345, "step": 222 }, { "epoch": 0.3673805601317957, "grad_norm": 0.9248349026133702, "learning_rate": 5.868421052631579e-05, "loss": 0.5321, "step": 223 }, { "epoch": 0.36902800658978585, "grad_norm": 0.7918578256688568, "learning_rate": 5.8947368421052634e-05, "loss": 0.534, "step": 224 }, { "epoch": 0.37067545304777594, "grad_norm": 1.0548817528440542, "learning_rate": 5.9210526315789485e-05, "loss": 0.5392, "step": 225 }, { "epoch": 0.3723228995057661, "grad_norm": 1.6182157271239737, "learning_rate": 5.947368421052632e-05, "loss": 0.5332, "step": 226 }, { "epoch": 0.37397034596375617, "grad_norm": 0.5944197365200183, "learning_rate": 5.9736842105263165e-05, "loss": 0.5327, "step": 227 }, { "epoch": 0.3756177924217463, "grad_norm": 1.3639127259300687, "learning_rate": 6.000000000000001e-05, "loss": 0.5462, "step": 228 }, { "epoch": 0.3772652388797364, "grad_norm": 1.247830906842987, "learning_rate": 6.0263157894736845e-05, "loss": 0.5448, "step": 229 }, { "epoch": 0.37891268533772654, "grad_norm": 0.8677905386167833, "learning_rate": 6.052631578947369e-05, "loss": 0.5323, "step": 230 }, { "epoch": 0.3805601317957166, "grad_norm": 1.7255643406043415, "learning_rate": 6.0789473684210525e-05, "loss": 0.5454, "step": 231 }, { "epoch": 0.38220757825370677, "grad_norm": 0.9707305601000185, "learning_rate": 6.105263157894738e-05, "loss": 0.544, "step": 232 }, { "epoch": 0.38385502471169686, "grad_norm": 1.521081423818225, "learning_rate": 6.131578947368421e-05, "loss": 0.5473, "step": 233 }, { "epoch": 0.385502471169687, "grad_norm": 0.8636820866874046, "learning_rate": 6.157894736842106e-05, "loss": 0.544, "step": 234 }, { "epoch": 0.3871499176276771, "grad_norm": 1.4850688014075228, "learning_rate": 6.18421052631579e-05, "loss": 0.5499, "step": 235 }, { "epoch": 0.38879736408566723, "grad_norm": 0.881460876262411, "learning_rate": 6.210526315789474e-05, "loss": 0.5311, "step": 236 }, { "epoch": 0.3904448105436573, "grad_norm": 1.3547867773187507, "learning_rate": 6.236842105263159e-05, "loss": 0.5475, "step": 237 }, { "epoch": 0.39209225700164746, "grad_norm": 1.0877085550135497, "learning_rate": 6.263157894736843e-05, "loss": 0.5381, "step": 238 }, { "epoch": 0.39373970345963755, "grad_norm": 1.2004148319119494, "learning_rate": 6.289473684210527e-05, "loss": 0.5456, "step": 239 }, { "epoch": 0.3953871499176277, "grad_norm": 1.1258901285570824, "learning_rate": 6.315789473684212e-05, "loss": 0.5479, "step": 240 }, { "epoch": 0.3970345963756178, "grad_norm": 1.2586771127441179, "learning_rate": 6.342105263157895e-05, "loss": 0.541, "step": 241 }, { "epoch": 0.3986820428336079, "grad_norm": 1.5848880094361493, "learning_rate": 6.368421052631579e-05, "loss": 0.5474, "step": 242 }, { "epoch": 0.400329489291598, "grad_norm": 0.7728515795171483, "learning_rate": 6.394736842105263e-05, "loss": 0.5369, "step": 243 }, { "epoch": 0.40197693574958815, "grad_norm": 1.3543650468926498, "learning_rate": 6.421052631578948e-05, "loss": 0.5278, "step": 244 }, { "epoch": 0.40362438220757824, "grad_norm": 1.4509202985179317, "learning_rate": 6.447368421052632e-05, "loss": 0.5472, "step": 245 }, { "epoch": 0.4052718286655684, "grad_norm": 0.8907307439443548, "learning_rate": 6.473684210526316e-05, "loss": 0.5298, "step": 246 }, { "epoch": 0.40691927512355847, "grad_norm": 1.2362276528947735, "learning_rate": 6.500000000000001e-05, "loss": 0.5352, "step": 247 }, { "epoch": 0.4085667215815486, "grad_norm": 0.9777746245553469, "learning_rate": 6.526315789473685e-05, "loss": 0.5389, "step": 248 }, { "epoch": 0.4102141680395387, "grad_norm": 0.9140398772096295, "learning_rate": 6.552631578947368e-05, "loss": 0.5279, "step": 249 }, { "epoch": 0.41186161449752884, "grad_norm": 1.247689479239907, "learning_rate": 6.578947368421052e-05, "loss": 0.5324, "step": 250 }, { "epoch": 0.4135090609555189, "grad_norm": 1.0766764965183868, "learning_rate": 6.605263157894738e-05, "loss": 0.5317, "step": 251 }, { "epoch": 0.41515650741350907, "grad_norm": 0.992629531904876, "learning_rate": 6.631578947368421e-05, "loss": 0.5268, "step": 252 }, { "epoch": 0.41680395387149916, "grad_norm": 1.4074494203526442, "learning_rate": 6.657894736842106e-05, "loss": 0.5476, "step": 253 }, { "epoch": 0.4184514003294893, "grad_norm": 1.1890553554892376, "learning_rate": 6.68421052631579e-05, "loss": 0.5446, "step": 254 }, { "epoch": 0.4200988467874794, "grad_norm": 1.1093401185356067, "learning_rate": 6.710526315789474e-05, "loss": 0.5238, "step": 255 }, { "epoch": 0.42174629324546953, "grad_norm": 0.8498991694650931, "learning_rate": 6.736842105263159e-05, "loss": 0.5307, "step": 256 }, { "epoch": 0.4233937397034596, "grad_norm": 1.070029619987952, "learning_rate": 6.763157894736843e-05, "loss": 0.5247, "step": 257 }, { "epoch": 0.42504118616144976, "grad_norm": 1.6001743624626465, "learning_rate": 6.789473684210527e-05, "loss": 0.5446, "step": 258 }, { "epoch": 0.42668863261943984, "grad_norm": 1.2542639644920668, "learning_rate": 6.815789473684212e-05, "loss": 0.5309, "step": 259 }, { "epoch": 0.42833607907743, "grad_norm": 1.1132915715640095, "learning_rate": 6.842105263157895e-05, "loss": 0.5394, "step": 260 }, { "epoch": 0.4299835255354201, "grad_norm": 1.6765083951076916, "learning_rate": 6.868421052631579e-05, "loss": 0.5404, "step": 261 }, { "epoch": 0.4316309719934102, "grad_norm": 1.098175110396909, "learning_rate": 6.894736842105263e-05, "loss": 0.535, "step": 262 }, { "epoch": 0.4332784184514003, "grad_norm": 1.4091784246381398, "learning_rate": 6.921052631578948e-05, "loss": 0.5446, "step": 263 }, { "epoch": 0.43492586490939045, "grad_norm": 1.3823759578977397, "learning_rate": 6.947368421052632e-05, "loss": 0.5343, "step": 264 }, { "epoch": 0.43657331136738053, "grad_norm": 1.2663714213322648, "learning_rate": 6.973684210526316e-05, "loss": 0.5328, "step": 265 }, { "epoch": 0.4382207578253707, "grad_norm": 1.0186721167708686, "learning_rate": 7.000000000000001e-05, "loss": 0.5294, "step": 266 }, { "epoch": 0.43986820428336076, "grad_norm": 1.6883933876454165, "learning_rate": 7.026315789473685e-05, "loss": 0.5402, "step": 267 }, { "epoch": 0.4415156507413509, "grad_norm": 0.8737471684570032, "learning_rate": 7.052631578947368e-05, "loss": 0.5397, "step": 268 }, { "epoch": 0.443163097199341, "grad_norm": 1.4176033066080103, "learning_rate": 7.078947368421052e-05, "loss": 0.5411, "step": 269 }, { "epoch": 0.44481054365733114, "grad_norm": 0.9032999479435903, "learning_rate": 7.105263157894738e-05, "loss": 0.5471, "step": 270 }, { "epoch": 0.4464579901153213, "grad_norm": 1.1748858081478766, "learning_rate": 7.131578947368422e-05, "loss": 0.547, "step": 271 }, { "epoch": 0.44810543657331137, "grad_norm": 1.2110511831555077, "learning_rate": 7.157894736842105e-05, "loss": 0.5447, "step": 272 }, { "epoch": 0.4497528830313015, "grad_norm": 0.9387198085528272, "learning_rate": 7.18421052631579e-05, "loss": 0.5251, "step": 273 }, { "epoch": 0.4514003294892916, "grad_norm": 1.1609641067261898, "learning_rate": 7.210526315789474e-05, "loss": 0.5296, "step": 274 }, { "epoch": 0.45304777594728174, "grad_norm": 1.0177052891315888, "learning_rate": 7.236842105263159e-05, "loss": 0.5366, "step": 275 }, { "epoch": 0.4546952224052718, "grad_norm": 1.1108403568559186, "learning_rate": 7.263157894736843e-05, "loss": 0.5378, "step": 276 }, { "epoch": 0.45634266886326197, "grad_norm": 1.2679340913591304, "learning_rate": 7.289473684210527e-05, "loss": 0.5308, "step": 277 }, { "epoch": 0.45799011532125206, "grad_norm": 1.0097682867968898, "learning_rate": 7.315789473684212e-05, "loss": 0.5187, "step": 278 }, { "epoch": 0.4596375617792422, "grad_norm": 1.1608793787877074, "learning_rate": 7.342105263157896e-05, "loss": 0.5442, "step": 279 }, { "epoch": 0.4612850082372323, "grad_norm": 0.842447438286422, "learning_rate": 7.368421052631579e-05, "loss": 0.5225, "step": 280 }, { "epoch": 0.46293245469522243, "grad_norm": 0.9049515547945904, "learning_rate": 7.394736842105263e-05, "loss": 0.5299, "step": 281 }, { "epoch": 0.4645799011532125, "grad_norm": 1.133595723803308, "learning_rate": 7.421052631578949e-05, "loss": 0.5202, "step": 282 }, { "epoch": 0.46622734761120266, "grad_norm": 1.1812242948453218, "learning_rate": 7.447368421052632e-05, "loss": 0.53, "step": 283 }, { "epoch": 0.46787479406919275, "grad_norm": 0.7548067927800733, "learning_rate": 7.473684210526316e-05, "loss": 0.5342, "step": 284 }, { "epoch": 0.4695222405271829, "grad_norm": 1.0092214821092191, "learning_rate": 7.500000000000001e-05, "loss": 0.524, "step": 285 }, { "epoch": 0.471169686985173, "grad_norm": 1.0639864792458116, "learning_rate": 7.526315789473685e-05, "loss": 0.5385, "step": 286 }, { "epoch": 0.4728171334431631, "grad_norm": 1.2770004201857859, "learning_rate": 7.55263157894737e-05, "loss": 0.5283, "step": 287 }, { "epoch": 0.4744645799011532, "grad_norm": 1.4461712202777124, "learning_rate": 7.578947368421052e-05, "loss": 0.5314, "step": 288 }, { "epoch": 0.47611202635914335, "grad_norm": 0.924776165068712, "learning_rate": 7.605263157894738e-05, "loss": 0.5239, "step": 289 }, { "epoch": 0.47775947281713343, "grad_norm": 1.1168119100557468, "learning_rate": 7.631578947368422e-05, "loss": 0.5255, "step": 290 }, { "epoch": 0.4794069192751236, "grad_norm": 1.1135510804134192, "learning_rate": 7.657894736842105e-05, "loss": 0.536, "step": 291 }, { "epoch": 0.48105436573311366, "grad_norm": 1.184119920327576, "learning_rate": 7.68421052631579e-05, "loss": 0.5227, "step": 292 }, { "epoch": 0.4827018121911038, "grad_norm": 1.0280836585668538, "learning_rate": 7.710526315789474e-05, "loss": 0.5187, "step": 293 }, { "epoch": 0.4843492586490939, "grad_norm": 1.329336375126176, "learning_rate": 7.736842105263159e-05, "loss": 0.5335, "step": 294 }, { "epoch": 0.48599670510708404, "grad_norm": 1.127707095512435, "learning_rate": 7.763157894736843e-05, "loss": 0.5242, "step": 295 }, { "epoch": 0.4876441515650741, "grad_norm": 1.1390939541659473, "learning_rate": 7.789473684210527e-05, "loss": 0.5241, "step": 296 }, { "epoch": 0.48929159802306427, "grad_norm": 0.8202812538426012, "learning_rate": 7.815789473684212e-05, "loss": 0.5209, "step": 297 }, { "epoch": 0.49093904448105435, "grad_norm": 1.3213617568989764, "learning_rate": 7.842105263157896e-05, "loss": 0.5212, "step": 298 }, { "epoch": 0.4925864909390445, "grad_norm": 1.1577284157639693, "learning_rate": 7.868421052631579e-05, "loss": 0.5226, "step": 299 }, { "epoch": 0.4942339373970346, "grad_norm": 0.8743336112665082, "learning_rate": 7.894736842105263e-05, "loss": 0.52, "step": 300 }, { "epoch": 0.4958813838550247, "grad_norm": 0.7299856083111257, "learning_rate": 7.921052631578949e-05, "loss": 0.5242, "step": 301 }, { "epoch": 0.4975288303130148, "grad_norm": 0.9382587784964266, "learning_rate": 7.947368421052632e-05, "loss": 0.5233, "step": 302 }, { "epoch": 0.49917627677100496, "grad_norm": 1.4919156102329723, "learning_rate": 7.973684210526316e-05, "loss": 0.5318, "step": 303 }, { "epoch": 0.500823723228995, "grad_norm": 0.923310611720198, "learning_rate": 8e-05, "loss": 0.5312, "step": 304 }, { "epoch": 0.5024711696869851, "grad_norm": 1.3612424577088045, "learning_rate": 7.999997353412281e-05, "loss": 0.5225, "step": 305 }, { "epoch": 0.5041186161449753, "grad_norm": 0.8689290740961477, "learning_rate": 7.999989413652627e-05, "loss": 0.5271, "step": 306 }, { "epoch": 0.5057660626029654, "grad_norm": 1.1606635672703784, "learning_rate": 7.999976180731545e-05, "loss": 0.5372, "step": 307 }, { "epoch": 0.5074135090609555, "grad_norm": 0.9889093679221572, "learning_rate": 7.999957654666544e-05, "loss": 0.5154, "step": 308 }, { "epoch": 0.5090609555189456, "grad_norm": 0.9864423170760949, "learning_rate": 7.999933835482139e-05, "loss": 0.5212, "step": 309 }, { "epoch": 0.5107084019769358, "grad_norm": 1.4039305435721747, "learning_rate": 7.999904723209854e-05, "loss": 0.5322, "step": 310 }, { "epoch": 0.5123558484349259, "grad_norm": 0.9484525665920988, "learning_rate": 7.999870317888208e-05, "loss": 0.5152, "step": 311 }, { "epoch": 0.514003294892916, "grad_norm": 1.0714166199118904, "learning_rate": 7.999830619562734e-05, "loss": 0.5177, "step": 312 }, { "epoch": 0.515650741350906, "grad_norm": 1.1404147048534778, "learning_rate": 7.999785628285961e-05, "loss": 0.5221, "step": 313 }, { "epoch": 0.5172981878088962, "grad_norm": 1.118962727055061, "learning_rate": 7.999735344117428e-05, "loss": 0.5259, "step": 314 }, { "epoch": 0.5189456342668863, "grad_norm": 1.2790283336506818, "learning_rate": 7.999679767123675e-05, "loss": 0.5222, "step": 315 }, { "epoch": 0.5205930807248764, "grad_norm": 0.7169162862337511, "learning_rate": 7.999618897378245e-05, "loss": 0.5184, "step": 316 }, { "epoch": 0.5222405271828665, "grad_norm": 1.0149073548635643, "learning_rate": 7.999552734961689e-05, "loss": 0.5195, "step": 317 }, { "epoch": 0.5238879736408567, "grad_norm": 1.2400644667343028, "learning_rate": 7.999481279961558e-05, "loss": 0.5278, "step": 318 }, { "epoch": 0.5255354200988468, "grad_norm": 0.6296882861623729, "learning_rate": 7.999404532472408e-05, "loss": 0.5285, "step": 319 }, { "epoch": 0.5271828665568369, "grad_norm": 0.8225231079573617, "learning_rate": 7.9993224925958e-05, "loss": 0.5169, "step": 320 }, { "epoch": 0.528830313014827, "grad_norm": 0.8588819010841828, "learning_rate": 7.999235160440296e-05, "loss": 0.5208, "step": 321 }, { "epoch": 0.5304777594728172, "grad_norm": 1.0453371245427425, "learning_rate": 7.99914253612146e-05, "loss": 0.5183, "step": 322 }, { "epoch": 0.5321252059308073, "grad_norm": 1.4261916728958957, "learning_rate": 7.999044619761863e-05, "loss": 0.5227, "step": 323 }, { "epoch": 0.5337726523887973, "grad_norm": 0.7489368705694665, "learning_rate": 7.998941411491079e-05, "loss": 0.5175, "step": 324 }, { "epoch": 0.5354200988467874, "grad_norm": 0.9170422074808176, "learning_rate": 7.998832911445679e-05, "loss": 0.5201, "step": 325 }, { "epoch": 0.5370675453047776, "grad_norm": 0.8346490143407053, "learning_rate": 7.998719119769244e-05, "loss": 0.5249, "step": 326 }, { "epoch": 0.5387149917627677, "grad_norm": 0.9924495275277523, "learning_rate": 7.99860003661235e-05, "loss": 0.5259, "step": 327 }, { "epoch": 0.5403624382207578, "grad_norm": 1.6895949961599812, "learning_rate": 7.998475662132583e-05, "loss": 0.5299, "step": 328 }, { "epoch": 0.5420098846787479, "grad_norm": 0.6883533196808757, "learning_rate": 7.998345996494525e-05, "loss": 0.5214, "step": 329 }, { "epoch": 0.5436573311367381, "grad_norm": 1.726101788831882, "learning_rate": 7.998211039869762e-05, "loss": 0.5266, "step": 330 }, { "epoch": 0.5453047775947282, "grad_norm": 0.9917123841749651, "learning_rate": 7.99807079243688e-05, "loss": 0.521, "step": 331 }, { "epoch": 0.5469522240527183, "grad_norm": 1.5809676478151404, "learning_rate": 7.997925254381469e-05, "loss": 0.5382, "step": 332 }, { "epoch": 0.5485996705107083, "grad_norm": 1.3297921220543507, "learning_rate": 7.99777442589612e-05, "loss": 0.5279, "step": 333 }, { "epoch": 0.5502471169686985, "grad_norm": 1.0569007044511725, "learning_rate": 7.997618307180421e-05, "loss": 0.5244, "step": 334 }, { "epoch": 0.5518945634266886, "grad_norm": 1.1033129620278634, "learning_rate": 7.997456898440964e-05, "loss": 0.5292, "step": 335 }, { "epoch": 0.5535420098846787, "grad_norm": 0.9902205011412782, "learning_rate": 7.99729019989134e-05, "loss": 0.5291, "step": 336 }, { "epoch": 0.5551894563426688, "grad_norm": 1.1869904678969239, "learning_rate": 7.99711821175214e-05, "loss": 0.5292, "step": 337 }, { "epoch": 0.556836902800659, "grad_norm": 0.7214130677386668, "learning_rate": 7.996940934250955e-05, "loss": 0.5181, "step": 338 }, { "epoch": 0.5584843492586491, "grad_norm": 0.8164650914370875, "learning_rate": 7.996758367622374e-05, "loss": 0.519, "step": 339 }, { "epoch": 0.5601317957166392, "grad_norm": 0.6293338320533983, "learning_rate": 7.996570512107989e-05, "loss": 0.5258, "step": 340 }, { "epoch": 0.5617792421746294, "grad_norm": 0.6715458670912386, "learning_rate": 7.996377367956386e-05, "loss": 0.5275, "step": 341 }, { "epoch": 0.5634266886326195, "grad_norm": 0.8261456816432898, "learning_rate": 7.996178935423152e-05, "loss": 0.515, "step": 342 }, { "epoch": 0.5650741350906096, "grad_norm": 0.7935835064513209, "learning_rate": 7.995975214770871e-05, "loss": 0.5184, "step": 343 }, { "epoch": 0.5667215815485996, "grad_norm": 1.2083430499033625, "learning_rate": 7.995766206269127e-05, "loss": 0.5157, "step": 344 }, { "epoch": 0.5683690280065898, "grad_norm": 0.7937592662749647, "learning_rate": 7.995551910194498e-05, "loss": 0.5255, "step": 345 }, { "epoch": 0.5700164744645799, "grad_norm": 1.0416300264502805, "learning_rate": 7.995332326830561e-05, "loss": 0.5188, "step": 346 }, { "epoch": 0.57166392092257, "grad_norm": 1.1610964681214184, "learning_rate": 7.99510745646789e-05, "loss": 0.5194, "step": 347 }, { "epoch": 0.5733113673805601, "grad_norm": 1.1465585632626158, "learning_rate": 7.994877299404054e-05, "loss": 0.516, "step": 348 }, { "epoch": 0.5749588138385503, "grad_norm": 0.8026311318647906, "learning_rate": 7.994641855943618e-05, "loss": 0.5108, "step": 349 }, { "epoch": 0.5766062602965404, "grad_norm": 0.6728960523860341, "learning_rate": 7.994401126398144e-05, "loss": 0.5141, "step": 350 }, { "epoch": 0.5782537067545305, "grad_norm": 0.6147938090955936, "learning_rate": 7.994155111086187e-05, "loss": 0.5245, "step": 351 }, { "epoch": 0.5799011532125206, "grad_norm": 0.7947465551745193, "learning_rate": 7.993903810333297e-05, "loss": 0.5199, "step": 352 }, { "epoch": 0.5815485996705108, "grad_norm": 1.029450977256308, "learning_rate": 7.993647224472022e-05, "loss": 0.5133, "step": 353 }, { "epoch": 0.5831960461285008, "grad_norm": 1.2548202279826193, "learning_rate": 7.993385353841896e-05, "loss": 0.523, "step": 354 }, { "epoch": 0.5848434925864909, "grad_norm": 0.8951063729123311, "learning_rate": 7.993118198789454e-05, "loss": 0.5132, "step": 355 }, { "epoch": 0.586490939044481, "grad_norm": 0.8660429079117785, "learning_rate": 7.992845759668218e-05, "loss": 0.5086, "step": 356 }, { "epoch": 0.5881383855024712, "grad_norm": 0.7422724386578905, "learning_rate": 7.992568036838708e-05, "loss": 0.5221, "step": 357 }, { "epoch": 0.5897858319604613, "grad_norm": 0.7433075178684124, "learning_rate": 7.992285030668431e-05, "loss": 0.516, "step": 358 }, { "epoch": 0.5914332784184514, "grad_norm": 1.2809902751015905, "learning_rate": 7.991996741531888e-05, "loss": 0.5183, "step": 359 }, { "epoch": 0.5930807248764415, "grad_norm": 1.0204447342842737, "learning_rate": 7.991703169810572e-05, "loss": 0.5111, "step": 360 }, { "epoch": 0.5947281713344317, "grad_norm": 0.9340507852077303, "learning_rate": 7.99140431589296e-05, "loss": 0.5163, "step": 361 }, { "epoch": 0.5963756177924218, "grad_norm": 0.8518404985626287, "learning_rate": 7.991100180174526e-05, "loss": 0.5042, "step": 362 }, { "epoch": 0.5980230642504119, "grad_norm": 0.7617587859499003, "learning_rate": 7.990790763057732e-05, "loss": 0.5146, "step": 363 }, { "epoch": 0.5996705107084019, "grad_norm": 0.7589117443952316, "learning_rate": 7.990476064952027e-05, "loss": 0.5163, "step": 364 }, { "epoch": 0.6013179571663921, "grad_norm": 0.9371058693388724, "learning_rate": 7.99015608627385e-05, "loss": 0.5155, "step": 365 }, { "epoch": 0.6029654036243822, "grad_norm": 1.4317195146923891, "learning_rate": 7.989830827446626e-05, "loss": 0.5208, "step": 366 }, { "epoch": 0.6046128500823723, "grad_norm": 0.6965192583281499, "learning_rate": 7.989500288900767e-05, "loss": 0.5086, "step": 367 }, { "epoch": 0.6062602965403624, "grad_norm": 0.9018096557423727, "learning_rate": 7.989164471073673e-05, "loss": 0.5068, "step": 368 }, { "epoch": 0.6079077429983526, "grad_norm": 1.384877633023159, "learning_rate": 7.988823374409731e-05, "loss": 0.5246, "step": 369 }, { "epoch": 0.6095551894563427, "grad_norm": 0.9156676788008109, "learning_rate": 7.988476999360311e-05, "loss": 0.5109, "step": 370 }, { "epoch": 0.6112026359143328, "grad_norm": 1.0957192403527365, "learning_rate": 7.988125346383769e-05, "loss": 0.5168, "step": 371 }, { "epoch": 0.6128500823723229, "grad_norm": 0.5991202929034163, "learning_rate": 7.987768415945445e-05, "loss": 0.5229, "step": 372 }, { "epoch": 0.6144975288303131, "grad_norm": 0.8311989142283502, "learning_rate": 7.987406208517664e-05, "loss": 0.5101, "step": 373 }, { "epoch": 0.6161449752883031, "grad_norm": 1.142859609934249, "learning_rate": 7.987038724579732e-05, "loss": 0.4982, "step": 374 }, { "epoch": 0.6177924217462932, "grad_norm": 0.6792373658417231, "learning_rate": 7.986665964617938e-05, "loss": 0.5176, "step": 375 }, { "epoch": 0.6194398682042833, "grad_norm": 0.8401404543253332, "learning_rate": 7.986287929125554e-05, "loss": 0.521, "step": 376 }, { "epoch": 0.6210873146622735, "grad_norm": 0.774746223814845, "learning_rate": 7.985904618602833e-05, "loss": 0.516, "step": 377 }, { "epoch": 0.6227347611202636, "grad_norm": 0.8073196335672899, "learning_rate": 7.985516033557003e-05, "loss": 0.512, "step": 378 }, { "epoch": 0.6243822075782537, "grad_norm": 0.8199524625986636, "learning_rate": 7.985122174502282e-05, "loss": 0.5054, "step": 379 }, { "epoch": 0.6260296540362438, "grad_norm": 1.1961353409460591, "learning_rate": 7.984723041959857e-05, "loss": 0.5196, "step": 380 }, { "epoch": 0.627677100494234, "grad_norm": 1.0428291714273572, "learning_rate": 7.984318636457899e-05, "loss": 0.5072, "step": 381 }, { "epoch": 0.6293245469522241, "grad_norm": 0.9834180108279613, "learning_rate": 7.983908958531554e-05, "loss": 0.5122, "step": 382 }, { "epoch": 0.6309719934102141, "grad_norm": 0.9640590394429475, "learning_rate": 7.98349400872295e-05, "loss": 0.5095, "step": 383 }, { "epoch": 0.6326194398682042, "grad_norm": 0.7840215856333954, "learning_rate": 7.983073787581184e-05, "loss": 0.5108, "step": 384 }, { "epoch": 0.6342668863261944, "grad_norm": 1.0033807568236506, "learning_rate": 7.982648295662333e-05, "loss": 0.5226, "step": 385 }, { "epoch": 0.6359143327841845, "grad_norm": 1.2376750971585369, "learning_rate": 7.98221753352945e-05, "loss": 0.5095, "step": 386 }, { "epoch": 0.6375617792421746, "grad_norm": 0.6261905400379791, "learning_rate": 7.981781501752557e-05, "loss": 0.512, "step": 387 }, { "epoch": 0.6392092257001647, "grad_norm": 0.5804089075463489, "learning_rate": 7.981340200908652e-05, "loss": 0.5066, "step": 388 }, { "epoch": 0.6408566721581549, "grad_norm": 0.8032529288039301, "learning_rate": 7.980893631581707e-05, "loss": 0.5149, "step": 389 }, { "epoch": 0.642504118616145, "grad_norm": 1.0334000179982814, "learning_rate": 7.980441794362664e-05, "loss": 0.5061, "step": 390 }, { "epoch": 0.6441515650741351, "grad_norm": 1.1714084367302269, "learning_rate": 7.979984689849438e-05, "loss": 0.5119, "step": 391 }, { "epoch": 0.6457990115321252, "grad_norm": 0.747467534239943, "learning_rate": 7.979522318646912e-05, "loss": 0.5092, "step": 392 }, { "epoch": 0.6474464579901154, "grad_norm": 0.5932933989216198, "learning_rate": 7.979054681366937e-05, "loss": 0.5116, "step": 393 }, { "epoch": 0.6490939044481054, "grad_norm": 0.68004606937973, "learning_rate": 7.978581778628334e-05, "loss": 0.5155, "step": 394 }, { "epoch": 0.6507413509060955, "grad_norm": 0.7250683607680413, "learning_rate": 7.978103611056898e-05, "loss": 0.5042, "step": 395 }, { "epoch": 0.6523887973640856, "grad_norm": 0.8014337404639501, "learning_rate": 7.977620179285378e-05, "loss": 0.5127, "step": 396 }, { "epoch": 0.6540362438220758, "grad_norm": 1.1583141371906298, "learning_rate": 7.9771314839535e-05, "loss": 0.5125, "step": 397 }, { "epoch": 0.6556836902800659, "grad_norm": 1.152948982572663, "learning_rate": 7.976637525707953e-05, "loss": 0.5013, "step": 398 }, { "epoch": 0.657331136738056, "grad_norm": 0.7203658429986884, "learning_rate": 7.976138305202385e-05, "loss": 0.503, "step": 399 }, { "epoch": 0.6589785831960461, "grad_norm": 0.6300951096680435, "learning_rate": 7.975633823097415e-05, "loss": 0.5122, "step": 400 }, { "epoch": 0.6606260296540363, "grad_norm": 0.6951058070512307, "learning_rate": 7.97512408006062e-05, "loss": 0.5039, "step": 401 }, { "epoch": 0.6622734761120264, "grad_norm": 0.5872912411047928, "learning_rate": 7.974609076766538e-05, "loss": 0.5135, "step": 402 }, { "epoch": 0.6639209225700164, "grad_norm": 0.6418645067611943, "learning_rate": 7.974088813896671e-05, "loss": 0.5087, "step": 403 }, { "epoch": 0.6655683690280065, "grad_norm": 0.6841820242009199, "learning_rate": 7.97356329213948e-05, "loss": 0.513, "step": 404 }, { "epoch": 0.6672158154859967, "grad_norm": 0.5267387695605658, "learning_rate": 7.973032512190384e-05, "loss": 0.5066, "step": 405 }, { "epoch": 0.6688632619439868, "grad_norm": 0.4379047613233503, "learning_rate": 7.972496474751763e-05, "loss": 0.5052, "step": 406 }, { "epoch": 0.6705107084019769, "grad_norm": 0.5241935416943498, "learning_rate": 7.971955180532948e-05, "loss": 0.523, "step": 407 }, { "epoch": 0.6721581548599671, "grad_norm": 0.6825475656873093, "learning_rate": 7.971408630250235e-05, "loss": 0.5068, "step": 408 }, { "epoch": 0.6738056013179572, "grad_norm": 1.0104922516981727, "learning_rate": 7.970856824626866e-05, "loss": 0.4985, "step": 409 }, { "epoch": 0.6754530477759473, "grad_norm": 1.4403899450312274, "learning_rate": 7.970299764393046e-05, "loss": 0.515, "step": 410 }, { "epoch": 0.6771004942339374, "grad_norm": 0.7227476610297536, "learning_rate": 7.969737450285927e-05, "loss": 0.5048, "step": 411 }, { "epoch": 0.6787479406919276, "grad_norm": 1.126524115713911, "learning_rate": 7.969169883049618e-05, "loss": 0.5053, "step": 412 }, { "epoch": 0.6803953871499177, "grad_norm": 0.9646245476161667, "learning_rate": 7.968597063435174e-05, "loss": 0.5152, "step": 413 }, { "epoch": 0.6820428336079077, "grad_norm": 0.8165251917776502, "learning_rate": 7.968018992200606e-05, "loss": 0.5129, "step": 414 }, { "epoch": 0.6836902800658978, "grad_norm": 1.0098147472897987, "learning_rate": 7.967435670110871e-05, "loss": 0.5021, "step": 415 }, { "epoch": 0.685337726523888, "grad_norm": 1.3157783588037077, "learning_rate": 7.966847097937877e-05, "loss": 0.5135, "step": 416 }, { "epoch": 0.6869851729818781, "grad_norm": 0.6721232468580657, "learning_rate": 7.966253276460474e-05, "loss": 0.5023, "step": 417 }, { "epoch": 0.6886326194398682, "grad_norm": 0.8423041643940689, "learning_rate": 7.965654206464469e-05, "loss": 0.4984, "step": 418 }, { "epoch": 0.6902800658978583, "grad_norm": 0.9229444347515319, "learning_rate": 7.965049888742602e-05, "loss": 0.5058, "step": 419 }, { "epoch": 0.6919275123558485, "grad_norm": 0.9965550694019097, "learning_rate": 7.964440324094566e-05, "loss": 0.5089, "step": 420 }, { "epoch": 0.6935749588138386, "grad_norm": 1.1758967296887113, "learning_rate": 7.963825513326992e-05, "loss": 0.5157, "step": 421 }, { "epoch": 0.6952224052718287, "grad_norm": 0.7140489597171833, "learning_rate": 7.963205457253457e-05, "loss": 0.5111, "step": 422 }, { "epoch": 0.6968698517298187, "grad_norm": 0.8853700100643332, "learning_rate": 7.962580156694477e-05, "loss": 0.5149, "step": 423 }, { "epoch": 0.6985172981878089, "grad_norm": 1.1140046709442315, "learning_rate": 7.961949612477508e-05, "loss": 0.5096, "step": 424 }, { "epoch": 0.700164744645799, "grad_norm": 0.7880344934227025, "learning_rate": 7.961313825436944e-05, "loss": 0.5024, "step": 425 }, { "epoch": 0.7018121911037891, "grad_norm": 0.9682931674981861, "learning_rate": 7.960672796414122e-05, "loss": 0.5168, "step": 426 }, { "epoch": 0.7034596375617792, "grad_norm": 1.0340785185936634, "learning_rate": 7.960026526257308e-05, "loss": 0.5136, "step": 427 }, { "epoch": 0.7051070840197694, "grad_norm": 1.08177247514608, "learning_rate": 7.959375015821709e-05, "loss": 0.5091, "step": 428 }, { "epoch": 0.7067545304777595, "grad_norm": 1.024564301953538, "learning_rate": 7.958718265969463e-05, "loss": 0.5001, "step": 429 }, { "epoch": 0.7084019769357496, "grad_norm": 0.9500053743886274, "learning_rate": 7.958056277569647e-05, "loss": 0.5102, "step": 430 }, { "epoch": 0.7100494233937397, "grad_norm": 0.6872522885224973, "learning_rate": 7.957389051498261e-05, "loss": 0.5116, "step": 431 }, { "epoch": 0.7116968698517299, "grad_norm": 0.7338401829260178, "learning_rate": 7.956716588638245e-05, "loss": 0.511, "step": 432 }, { "epoch": 0.71334431630972, "grad_norm": 0.7067580039330683, "learning_rate": 7.956038889879464e-05, "loss": 0.5054, "step": 433 }, { "epoch": 0.71499176276771, "grad_norm": 0.5358420079364553, "learning_rate": 7.955355956118711e-05, "loss": 0.4981, "step": 434 }, { "epoch": 0.7166392092257001, "grad_norm": 0.7179512257483249, "learning_rate": 7.954667788259712e-05, "loss": 0.5102, "step": 435 }, { "epoch": 0.7182866556836903, "grad_norm": 0.720599553623129, "learning_rate": 7.95397438721311e-05, "loss": 0.5152, "step": 436 }, { "epoch": 0.7199341021416804, "grad_norm": 0.7400268680919058, "learning_rate": 7.953275753896481e-05, "loss": 0.5076, "step": 437 }, { "epoch": 0.7215815485996705, "grad_norm": 0.8376707510817792, "learning_rate": 7.952571889234325e-05, "loss": 0.4978, "step": 438 }, { "epoch": 0.7232289950576606, "grad_norm": 0.8968418851482163, "learning_rate": 7.951862794158056e-05, "loss": 0.5022, "step": 439 }, { "epoch": 0.7248764415156508, "grad_norm": 1.11633095995493, "learning_rate": 7.95114846960602e-05, "loss": 0.5051, "step": 440 }, { "epoch": 0.7265238879736409, "grad_norm": 0.7664570048332363, "learning_rate": 7.950428916523476e-05, "loss": 0.5033, "step": 441 }, { "epoch": 0.728171334431631, "grad_norm": 0.6063681914705618, "learning_rate": 7.949704135862604e-05, "loss": 0.498, "step": 442 }, { "epoch": 0.729818780889621, "grad_norm": 0.5398681134596442, "learning_rate": 7.948974128582504e-05, "loss": 0.4948, "step": 443 }, { "epoch": 0.7314662273476112, "grad_norm": 0.5083501384204666, "learning_rate": 7.948238895649187e-05, "loss": 0.5035, "step": 444 }, { "epoch": 0.7331136738056013, "grad_norm": 0.5765558095878606, "learning_rate": 7.947498438035585e-05, "loss": 0.5093, "step": 445 }, { "epoch": 0.7347611202635914, "grad_norm": 0.6801368209498564, "learning_rate": 7.94675275672154e-05, "loss": 0.5065, "step": 446 }, { "epoch": 0.7364085667215815, "grad_norm": 0.7312589499837889, "learning_rate": 7.946001852693806e-05, "loss": 0.505, "step": 447 }, { "epoch": 0.7380560131795717, "grad_norm": 0.877288097059011, "learning_rate": 7.945245726946052e-05, "loss": 0.5035, "step": 448 }, { "epoch": 0.7397034596375618, "grad_norm": 0.947155460630744, "learning_rate": 7.944484380478853e-05, "loss": 0.5043, "step": 449 }, { "epoch": 0.7413509060955519, "grad_norm": 1.0197887258804348, "learning_rate": 7.943717814299694e-05, "loss": 0.5075, "step": 450 }, { "epoch": 0.742998352553542, "grad_norm": 0.9961138480073677, "learning_rate": 7.94294602942297e-05, "loss": 0.4944, "step": 451 }, { "epoch": 0.7446457990115322, "grad_norm": 0.9812164109034337, "learning_rate": 7.942169026869975e-05, "loss": 0.5088, "step": 452 }, { "epoch": 0.7462932454695222, "grad_norm": 0.9132900000349808, "learning_rate": 7.941386807668915e-05, "loss": 0.5068, "step": 453 }, { "epoch": 0.7479406919275123, "grad_norm": 0.7718638731934556, "learning_rate": 7.940599372854895e-05, "loss": 0.5061, "step": 454 }, { "epoch": 0.7495881383855024, "grad_norm": 0.6705600545787145, "learning_rate": 7.939806723469923e-05, "loss": 0.5033, "step": 455 }, { "epoch": 0.7512355848434926, "grad_norm": 0.682760978202661, "learning_rate": 7.939008860562906e-05, "loss": 0.4878, "step": 456 }, { "epoch": 0.7528830313014827, "grad_norm": 0.6830632564253406, "learning_rate": 7.938205785189651e-05, "loss": 0.4976, "step": 457 }, { "epoch": 0.7545304777594728, "grad_norm": 0.6498816858747322, "learning_rate": 7.937397498412863e-05, "loss": 0.5081, "step": 458 }, { "epoch": 0.7561779242174629, "grad_norm": 0.7922582255073181, "learning_rate": 7.936584001302145e-05, "loss": 0.4912, "step": 459 }, { "epoch": 0.7578253706754531, "grad_norm": 0.878457656673209, "learning_rate": 7.93576529493399e-05, "loss": 0.5023, "step": 460 }, { "epoch": 0.7594728171334432, "grad_norm": 0.6720956800895923, "learning_rate": 7.934941380391786e-05, "loss": 0.5069, "step": 461 }, { "epoch": 0.7611202635914333, "grad_norm": 0.5620704754091453, "learning_rate": 7.934112258765818e-05, "loss": 0.5133, "step": 462 }, { "epoch": 0.7627677100494233, "grad_norm": 0.6255099960314313, "learning_rate": 7.933277931153256e-05, "loss": 0.5048, "step": 463 }, { "epoch": 0.7644151565074135, "grad_norm": 0.6307340747087921, "learning_rate": 7.932438398658159e-05, "loss": 0.5036, "step": 464 }, { "epoch": 0.7660626029654036, "grad_norm": 0.6742490854621627, "learning_rate": 7.931593662391477e-05, "loss": 0.5037, "step": 465 }, { "epoch": 0.7677100494233937, "grad_norm": 0.9333635737148888, "learning_rate": 7.930743723471045e-05, "loss": 0.4931, "step": 466 }, { "epoch": 0.7693574958813838, "grad_norm": 1.0790383427913521, "learning_rate": 7.92988858302158e-05, "loss": 0.4911, "step": 467 }, { "epoch": 0.771004942339374, "grad_norm": 0.8266327512569094, "learning_rate": 7.929028242174685e-05, "loss": 0.4994, "step": 468 }, { "epoch": 0.7726523887973641, "grad_norm": 0.7901768281910359, "learning_rate": 7.928162702068842e-05, "loss": 0.5001, "step": 469 }, { "epoch": 0.7742998352553542, "grad_norm": 0.935172622930111, "learning_rate": 7.927291963849418e-05, "loss": 0.4978, "step": 470 }, { "epoch": 0.7759472817133443, "grad_norm": 1.1974849742570035, "learning_rate": 7.926416028668653e-05, "loss": 0.505, "step": 471 }, { "epoch": 0.7775947281713345, "grad_norm": 0.8169336661956909, "learning_rate": 7.925534897685669e-05, "loss": 0.4955, "step": 472 }, { "epoch": 0.7792421746293245, "grad_norm": 0.6283164165546093, "learning_rate": 7.92464857206646e-05, "loss": 0.4959, "step": 473 }, { "epoch": 0.7808896210873146, "grad_norm": 0.5197344185550543, "learning_rate": 7.923757052983894e-05, "loss": 0.501, "step": 474 }, { "epoch": 0.7825370675453048, "grad_norm": 0.5823980522653582, "learning_rate": 7.922860341617714e-05, "loss": 0.4907, "step": 475 }, { "epoch": 0.7841845140032949, "grad_norm": 0.8201455505105524, "learning_rate": 7.921958439154534e-05, "loss": 0.5095, "step": 476 }, { "epoch": 0.785831960461285, "grad_norm": 0.9727013211719665, "learning_rate": 7.921051346787833e-05, "loss": 0.4907, "step": 477 }, { "epoch": 0.7874794069192751, "grad_norm": 1.038414133559853, "learning_rate": 7.920139065717962e-05, "loss": 0.5038, "step": 478 }, { "epoch": 0.7891268533772653, "grad_norm": 0.9437550796972872, "learning_rate": 7.919221597152138e-05, "loss": 0.4997, "step": 479 }, { "epoch": 0.7907742998352554, "grad_norm": 0.8901717542868928, "learning_rate": 7.918298942304442e-05, "loss": 0.5016, "step": 480 }, { "epoch": 0.7924217462932455, "grad_norm": 0.816948709688145, "learning_rate": 7.917371102395814e-05, "loss": 0.502, "step": 481 }, { "epoch": 0.7940691927512356, "grad_norm": 0.733349100521176, "learning_rate": 7.916438078654062e-05, "loss": 0.4977, "step": 482 }, { "epoch": 0.7957166392092258, "grad_norm": 0.6222685741760782, "learning_rate": 7.91549987231385e-05, "loss": 0.5059, "step": 483 }, { "epoch": 0.7973640856672158, "grad_norm": 0.6248581294094404, "learning_rate": 7.9145564846167e-05, "loss": 0.5031, "step": 484 }, { "epoch": 0.7990115321252059, "grad_norm": 0.6393823715403113, "learning_rate": 7.913607916810992e-05, "loss": 0.5063, "step": 485 }, { "epoch": 0.800658978583196, "grad_norm": 0.5900887711277748, "learning_rate": 7.912654170151959e-05, "loss": 0.5007, "step": 486 }, { "epoch": 0.8023064250411862, "grad_norm": 0.67263571241631, "learning_rate": 7.911695245901688e-05, "loss": 0.5006, "step": 487 }, { "epoch": 0.8039538714991763, "grad_norm": 0.8217689915843339, "learning_rate": 7.910731145329119e-05, "loss": 0.5074, "step": 488 }, { "epoch": 0.8056013179571664, "grad_norm": 1.078720416247937, "learning_rate": 7.909761869710039e-05, "loss": 0.5034, "step": 489 }, { "epoch": 0.8072487644151565, "grad_norm": 1.0863414809711716, "learning_rate": 7.908787420327085e-05, "loss": 0.4997, "step": 490 }, { "epoch": 0.8088962108731467, "grad_norm": 0.8707316557339475, "learning_rate": 7.90780779846974e-05, "loss": 0.5057, "step": 491 }, { "epoch": 0.8105436573311368, "grad_norm": 0.8346848099923421, "learning_rate": 7.906823005434331e-05, "loss": 0.4965, "step": 492 }, { "epoch": 0.8121911037891268, "grad_norm": 0.7458176843870734, "learning_rate": 7.90583304252403e-05, "loss": 0.5024, "step": 493 }, { "epoch": 0.8138385502471169, "grad_norm": 0.6000091288771064, "learning_rate": 7.904837911048848e-05, "loss": 0.5027, "step": 494 }, { "epoch": 0.8154859967051071, "grad_norm": 0.45276683418457747, "learning_rate": 7.903837612325634e-05, "loss": 0.4972, "step": 495 }, { "epoch": 0.8171334431630972, "grad_norm": 0.5198546272570085, "learning_rate": 7.902832147678083e-05, "loss": 0.5064, "step": 496 }, { "epoch": 0.8187808896210873, "grad_norm": 0.636628364664917, "learning_rate": 7.901821518436713e-05, "loss": 0.4977, "step": 497 }, { "epoch": 0.8204283360790774, "grad_norm": 0.6625746623193247, "learning_rate": 7.900805725938888e-05, "loss": 0.495, "step": 498 }, { "epoch": 0.8220757825370676, "grad_norm": 0.5878964851909593, "learning_rate": 7.899784771528801e-05, "loss": 0.4987, "step": 499 }, { "epoch": 0.8237232289950577, "grad_norm": 0.5614429170055827, "learning_rate": 7.898758656557471e-05, "loss": 0.505, "step": 500 }, { "epoch": 0.8253706754530478, "grad_norm": 0.6266307728650651, "learning_rate": 7.897727382382752e-05, "loss": 0.4929, "step": 501 }, { "epoch": 0.8270181219110379, "grad_norm": 0.765079787891555, "learning_rate": 7.896690950369321e-05, "loss": 0.5045, "step": 502 }, { "epoch": 0.828665568369028, "grad_norm": 1.0141765745727456, "learning_rate": 7.895649361888685e-05, "loss": 0.4977, "step": 503 }, { "epoch": 0.8303130148270181, "grad_norm": 1.207816835928132, "learning_rate": 7.894602618319168e-05, "loss": 0.5053, "step": 504 }, { "epoch": 0.8319604612850082, "grad_norm": 0.7552701354200756, "learning_rate": 7.893550721045924e-05, "loss": 0.4963, "step": 505 }, { "epoch": 0.8336079077429983, "grad_norm": 0.6839288616797118, "learning_rate": 7.892493671460916e-05, "loss": 0.4984, "step": 506 }, { "epoch": 0.8352553542009885, "grad_norm": 0.627289683113441, "learning_rate": 7.891431470962938e-05, "loss": 0.5044, "step": 507 }, { "epoch": 0.8369028006589786, "grad_norm": 0.5063239934079624, "learning_rate": 7.890364120957587e-05, "loss": 0.4926, "step": 508 }, { "epoch": 0.8385502471169687, "grad_norm": 0.5334806639636591, "learning_rate": 7.889291622857286e-05, "loss": 0.4881, "step": 509 }, { "epoch": 0.8401976935749588, "grad_norm": 0.7167027239832525, "learning_rate": 7.888213978081262e-05, "loss": 0.4922, "step": 510 }, { "epoch": 0.841845140032949, "grad_norm": 0.9138558055819944, "learning_rate": 7.887131188055557e-05, "loss": 0.5046, "step": 511 }, { "epoch": 0.8434925864909391, "grad_norm": 1.131284439185704, "learning_rate": 7.88604325421302e-05, "loss": 0.4932, "step": 512 }, { "epoch": 0.8451400329489291, "grad_norm": 0.7551327553676401, "learning_rate": 7.884950177993308e-05, "loss": 0.4992, "step": 513 }, { "epoch": 0.8467874794069192, "grad_norm": 0.5860637654684694, "learning_rate": 7.88385196084288e-05, "loss": 0.499, "step": 514 }, { "epoch": 0.8484349258649094, "grad_norm": 0.8061464715229663, "learning_rate": 7.882748604215002e-05, "loss": 0.5026, "step": 515 }, { "epoch": 0.8500823723228995, "grad_norm": 0.9709095775581241, "learning_rate": 7.881640109569739e-05, "loss": 0.4886, "step": 516 }, { "epoch": 0.8517298187808896, "grad_norm": 0.9508817452534596, "learning_rate": 7.880526478373952e-05, "loss": 0.4936, "step": 517 }, { "epoch": 0.8533772652388797, "grad_norm": 0.9166653683470548, "learning_rate": 7.879407712101307e-05, "loss": 0.5027, "step": 518 }, { "epoch": 0.8550247116968699, "grad_norm": 1.0392172294731887, "learning_rate": 7.878283812232257e-05, "loss": 0.4944, "step": 519 }, { "epoch": 0.85667215815486, "grad_norm": 0.9086147032946227, "learning_rate": 7.877154780254053e-05, "loss": 0.4996, "step": 520 }, { "epoch": 0.8583196046128501, "grad_norm": 0.8958985356840683, "learning_rate": 7.876020617660737e-05, "loss": 0.4931, "step": 521 }, { "epoch": 0.8599670510708401, "grad_norm": 0.7771623552315035, "learning_rate": 7.874881325953139e-05, "loss": 0.493, "step": 522 }, { "epoch": 0.8616144975288303, "grad_norm": 0.5854267968931518, "learning_rate": 7.873736906638877e-05, "loss": 0.4976, "step": 523 }, { "epoch": 0.8632619439868204, "grad_norm": 0.7698383893596155, "learning_rate": 7.872587361232353e-05, "loss": 0.4934, "step": 524 }, { "epoch": 0.8649093904448105, "grad_norm": 0.7830688801046889, "learning_rate": 7.871432691254752e-05, "loss": 0.5031, "step": 525 }, { "epoch": 0.8665568369028006, "grad_norm": 0.5897847779886887, "learning_rate": 7.870272898234044e-05, "loss": 0.4973, "step": 526 }, { "epoch": 0.8682042833607908, "grad_norm": 0.534474546446405, "learning_rate": 7.869107983704977e-05, "loss": 0.4931, "step": 527 }, { "epoch": 0.8698517298187809, "grad_norm": 0.5559747499184235, "learning_rate": 7.867937949209073e-05, "loss": 0.4921, "step": 528 }, { "epoch": 0.871499176276771, "grad_norm": 0.4884620098407904, "learning_rate": 7.866762796294632e-05, "loss": 0.4982, "step": 529 }, { "epoch": 0.8731466227347611, "grad_norm": 0.3650587769114016, "learning_rate": 7.865582526516726e-05, "loss": 0.4919, "step": 530 }, { "epoch": 0.8747940691927513, "grad_norm": 0.48762775538387576, "learning_rate": 7.864397141437201e-05, "loss": 0.5023, "step": 531 }, { "epoch": 0.8764415156507414, "grad_norm": 0.5508162970086452, "learning_rate": 7.863206642624667e-05, "loss": 0.4971, "step": 532 }, { "epoch": 0.8780889621087314, "grad_norm": 0.6271662705218596, "learning_rate": 7.862011031654506e-05, "loss": 0.4864, "step": 533 }, { "epoch": 0.8797364085667215, "grad_norm": 0.636217891217825, "learning_rate": 7.860810310108861e-05, "loss": 0.501, "step": 534 }, { "epoch": 0.8813838550247117, "grad_norm": 0.6091328201510771, "learning_rate": 7.85960447957664e-05, "loss": 0.4889, "step": 535 }, { "epoch": 0.8830313014827018, "grad_norm": 0.6887108679148758, "learning_rate": 7.858393541653513e-05, "loss": 0.5002, "step": 536 }, { "epoch": 0.8846787479406919, "grad_norm": 0.9838021573101505, "learning_rate": 7.857177497941903e-05, "loss": 0.4993, "step": 537 }, { "epoch": 0.886326194398682, "grad_norm": 1.435866676550646, "learning_rate": 7.855956350050997e-05, "loss": 0.4953, "step": 538 }, { "epoch": 0.8879736408566722, "grad_norm": 0.3779319319617201, "learning_rate": 7.854730099596729e-05, "loss": 0.4865, "step": 539 }, { "epoch": 0.8896210873146623, "grad_norm": 1.093590745748496, "learning_rate": 7.853498748201791e-05, "loss": 0.5101, "step": 540 }, { "epoch": 0.8912685337726524, "grad_norm": 1.4398210119936876, "learning_rate": 7.852262297495623e-05, "loss": 0.4947, "step": 541 }, { "epoch": 0.8929159802306426, "grad_norm": 0.5195245524035972, "learning_rate": 7.851020749114412e-05, "loss": 0.4995, "step": 542 }, { "epoch": 0.8945634266886326, "grad_norm": 1.5203652010583948, "learning_rate": 7.849774104701091e-05, "loss": 0.4983, "step": 543 }, { "epoch": 0.8962108731466227, "grad_norm": 0.45816274447899324, "learning_rate": 7.848522365905336e-05, "loss": 0.5015, "step": 544 }, { "epoch": 0.8978583196046128, "grad_norm": 1.2212063081286775, "learning_rate": 7.847265534383568e-05, "loss": 0.4973, "step": 545 }, { "epoch": 0.899505766062603, "grad_norm": 0.6620821384905953, "learning_rate": 7.846003611798941e-05, "loss": 0.5008, "step": 546 }, { "epoch": 0.9011532125205931, "grad_norm": 0.6923439333277621, "learning_rate": 7.844736599821351e-05, "loss": 0.4966, "step": 547 }, { "epoch": 0.9028006589785832, "grad_norm": 0.7825461054432196, "learning_rate": 7.843464500127429e-05, "loss": 0.4903, "step": 548 }, { "epoch": 0.9044481054365733, "grad_norm": 0.5701753086631366, "learning_rate": 7.842187314400534e-05, "loss": 0.4936, "step": 549 }, { "epoch": 0.9060955518945635, "grad_norm": 0.6005417569018022, "learning_rate": 7.84090504433076e-05, "loss": 0.4898, "step": 550 }, { "epoch": 0.9077429983525536, "grad_norm": 0.5698502249415275, "learning_rate": 7.839617691614925e-05, "loss": 0.4877, "step": 551 }, { "epoch": 0.9093904448105437, "grad_norm": 0.569237180845273, "learning_rate": 7.838325257956575e-05, "loss": 0.4928, "step": 552 }, { "epoch": 0.9110378912685337, "grad_norm": 0.5486154619840518, "learning_rate": 7.837027745065984e-05, "loss": 0.4955, "step": 553 }, { "epoch": 0.9126853377265239, "grad_norm": 0.5447790571941457, "learning_rate": 7.835725154660137e-05, "loss": 0.4935, "step": 554 }, { "epoch": 0.914332784184514, "grad_norm": 0.5792286850777264, "learning_rate": 7.834417488462747e-05, "loss": 0.4933, "step": 555 }, { "epoch": 0.9159802306425041, "grad_norm": 0.5750681659185475, "learning_rate": 7.83310474820424e-05, "loss": 0.4897, "step": 556 }, { "epoch": 0.9176276771004942, "grad_norm": 0.46704371594788674, "learning_rate": 7.831786935621758e-05, "loss": 0.4972, "step": 557 }, { "epoch": 0.9192751235584844, "grad_norm": 0.554985799331578, "learning_rate": 7.830464052459151e-05, "loss": 0.4969, "step": 558 }, { "epoch": 0.9209225700164745, "grad_norm": 0.5643974426982462, "learning_rate": 7.829136100466987e-05, "loss": 0.4989, "step": 559 }, { "epoch": 0.9225700164744646, "grad_norm": 0.4916168616881338, "learning_rate": 7.827803081402533e-05, "loss": 0.4992, "step": 560 }, { "epoch": 0.9242174629324547, "grad_norm": 0.6255314287544853, "learning_rate": 7.826464997029767e-05, "loss": 0.4901, "step": 561 }, { "epoch": 0.9258649093904449, "grad_norm": 0.8277334736607517, "learning_rate": 7.825121849119368e-05, "loss": 0.4869, "step": 562 }, { "epoch": 0.9275123558484349, "grad_norm": 1.152194517581131, "learning_rate": 7.823773639448713e-05, "loss": 0.4945, "step": 563 }, { "epoch": 0.929159802306425, "grad_norm": 1.0453712110813866, "learning_rate": 7.822420369801881e-05, "loss": 0.494, "step": 564 }, { "epoch": 0.9308072487644151, "grad_norm": 0.9970467794717086, "learning_rate": 7.821062041969646e-05, "loss": 0.4952, "step": 565 }, { "epoch": 0.9324546952224053, "grad_norm": 1.014325800700853, "learning_rate": 7.819698657749474e-05, "loss": 0.499, "step": 566 }, { "epoch": 0.9341021416803954, "grad_norm": 1.0127292820205311, "learning_rate": 7.818330218945526e-05, "loss": 0.5059, "step": 567 }, { "epoch": 0.9357495881383855, "grad_norm": 0.8096861684242187, "learning_rate": 7.816956727368642e-05, "loss": 0.5019, "step": 568 }, { "epoch": 0.9373970345963756, "grad_norm": 0.6282242280965742, "learning_rate": 7.815578184836361e-05, "loss": 0.4907, "step": 569 }, { "epoch": 0.9390444810543658, "grad_norm": 0.5616309818370421, "learning_rate": 7.814194593172897e-05, "loss": 0.4876, "step": 570 }, { "epoch": 0.9406919275123559, "grad_norm": 0.5127715753749568, "learning_rate": 7.81280595420915e-05, "loss": 0.492, "step": 571 }, { "epoch": 0.942339373970346, "grad_norm": 0.5311916955911262, "learning_rate": 7.811412269782695e-05, "loss": 0.4978, "step": 572 }, { "epoch": 0.943986820428336, "grad_norm": 0.4575298342097976, "learning_rate": 7.810013541737789e-05, "loss": 0.4949, "step": 573 }, { "epoch": 0.9456342668863262, "grad_norm": 0.4451781885984368, "learning_rate": 7.80860977192536e-05, "loss": 0.4907, "step": 574 }, { "epoch": 0.9472817133443163, "grad_norm": 0.4701210517119293, "learning_rate": 7.807200962203005e-05, "loss": 0.4842, "step": 575 }, { "epoch": 0.9489291598023064, "grad_norm": 0.5247025557659611, "learning_rate": 7.805787114434996e-05, "loss": 0.4958, "step": 576 }, { "epoch": 0.9505766062602965, "grad_norm": 0.5765742480676608, "learning_rate": 7.804368230492268e-05, "loss": 0.4895, "step": 577 }, { "epoch": 0.9522240527182867, "grad_norm": 0.5517191616083444, "learning_rate": 7.80294431225242e-05, "loss": 0.4933, "step": 578 }, { "epoch": 0.9538714991762768, "grad_norm": 0.6631964509718249, "learning_rate": 7.801515361599718e-05, "loss": 0.4989, "step": 579 }, { "epoch": 0.9555189456342669, "grad_norm": 0.7888649537408527, "learning_rate": 7.80008138042508e-05, "loss": 0.4959, "step": 580 }, { "epoch": 0.957166392092257, "grad_norm": 0.9397708567013766, "learning_rate": 7.798642370626087e-05, "loss": 0.4972, "step": 581 }, { "epoch": 0.9588138385502472, "grad_norm": 1.0166449929025212, "learning_rate": 7.797198334106971e-05, "loss": 0.493, "step": 582 }, { "epoch": 0.9604612850082372, "grad_norm": 0.9459718640408957, "learning_rate": 7.795749272778616e-05, "loss": 0.4948, "step": 583 }, { "epoch": 0.9621087314662273, "grad_norm": 0.8390456330488232, "learning_rate": 7.794295188558557e-05, "loss": 0.4909, "step": 584 }, { "epoch": 0.9637561779242174, "grad_norm": 0.803204373975808, "learning_rate": 7.792836083370973e-05, "loss": 0.4981, "step": 585 }, { "epoch": 0.9654036243822076, "grad_norm": 0.7750826215137762, "learning_rate": 7.79137195914669e-05, "loss": 0.4934, "step": 586 }, { "epoch": 0.9670510708401977, "grad_norm": 0.632936216893055, "learning_rate": 7.789902817823174e-05, "loss": 0.4838, "step": 587 }, { "epoch": 0.9686985172981878, "grad_norm": 0.5340232761073261, "learning_rate": 7.788428661344533e-05, "loss": 0.5008, "step": 588 }, { "epoch": 0.9703459637561779, "grad_norm": 0.5752002691606822, "learning_rate": 7.786949491661506e-05, "loss": 0.4953, "step": 589 }, { "epoch": 0.9719934102141681, "grad_norm": 0.6116293344392506, "learning_rate": 7.785465310731472e-05, "loss": 0.4862, "step": 590 }, { "epoch": 0.9736408566721582, "grad_norm": 0.5510687205353862, "learning_rate": 7.783976120518437e-05, "loss": 0.4926, "step": 591 }, { "epoch": 0.9752883031301482, "grad_norm": 0.4529591878886322, "learning_rate": 7.782481922993037e-05, "loss": 0.4991, "step": 592 }, { "epoch": 0.9769357495881383, "grad_norm": 0.4153902597835111, "learning_rate": 7.780982720132535e-05, "loss": 0.4953, "step": 593 }, { "epoch": 0.9785831960461285, "grad_norm": 0.4484743845470241, "learning_rate": 7.779478513920817e-05, "loss": 0.4916, "step": 594 }, { "epoch": 0.9802306425041186, "grad_norm": 0.4891840167312842, "learning_rate": 7.77796930634839e-05, "loss": 0.493, "step": 595 }, { "epoch": 0.9818780889621087, "grad_norm": 0.549826233680089, "learning_rate": 7.776455099412378e-05, "loss": 0.4933, "step": 596 }, { "epoch": 0.9835255354200988, "grad_norm": 0.604210713149151, "learning_rate": 7.774935895116524e-05, "loss": 0.4859, "step": 597 }, { "epoch": 0.985172981878089, "grad_norm": 0.644200131065205, "learning_rate": 7.773411695471178e-05, "loss": 0.4911, "step": 598 }, { "epoch": 0.9868204283360791, "grad_norm": 0.6617627399937894, "learning_rate": 7.771882502493307e-05, "loss": 0.489, "step": 599 }, { "epoch": 0.9884678747940692, "grad_norm": 0.827116565769663, "learning_rate": 7.770348318206485e-05, "loss": 0.4935, "step": 600 }, { "epoch": 0.9901153212520593, "grad_norm": 1.1121491494840383, "learning_rate": 7.768809144640883e-05, "loss": 0.5013, "step": 601 }, { "epoch": 0.9917627677100495, "grad_norm": 1.0552604076911278, "learning_rate": 7.767264983833281e-05, "loss": 0.4838, "step": 602 }, { "epoch": 0.9934102141680395, "grad_norm": 0.8461180844367668, "learning_rate": 7.765715837827061e-05, "loss": 0.4937, "step": 603 }, { "epoch": 0.9950576606260296, "grad_norm": 0.6424920220783643, "learning_rate": 7.764161708672196e-05, "loss": 0.5021, "step": 604 }, { "epoch": 0.9967051070840197, "grad_norm": 0.5683340283670179, "learning_rate": 7.762602598425256e-05, "loss": 0.4852, "step": 605 }, { "epoch": 0.9983525535420099, "grad_norm": 0.6673849025761969, "learning_rate": 7.761038509149402e-05, "loss": 0.4889, "step": 606 }, { "epoch": 1.0, "grad_norm": 0.5783753051212488, "learning_rate": 7.759469442914384e-05, "loss": 0.4908, "step": 607 }, { "epoch": 1.00164744645799, "grad_norm": 0.484367310159223, "learning_rate": 7.757895401796537e-05, "loss": 0.4681, "step": 608 }, { "epoch": 1.0032948929159802, "grad_norm": 0.48793173642057824, "learning_rate": 7.756316387878781e-05, "loss": 0.4774, "step": 609 }, { "epoch": 1.0049423393739703, "grad_norm": 0.5150825132775501, "learning_rate": 7.754732403250614e-05, "loss": 0.4632, "step": 610 }, { "epoch": 1.0065897858319606, "grad_norm": 0.5439165273788066, "learning_rate": 7.753143450008115e-05, "loss": 0.4766, "step": 611 }, { "epoch": 1.0082372322899507, "grad_norm": 0.45655615975090846, "learning_rate": 7.751549530253935e-05, "loss": 0.4779, "step": 612 }, { "epoch": 1.0098846787479407, "grad_norm": 0.45219825553864784, "learning_rate": 7.749950646097297e-05, "loss": 0.4805, "step": 613 }, { "epoch": 1.0115321252059308, "grad_norm": 0.587397519698661, "learning_rate": 7.748346799653998e-05, "loss": 0.4787, "step": 614 }, { "epoch": 1.013179571663921, "grad_norm": 0.6601755514898814, "learning_rate": 7.746737993046394e-05, "loss": 0.4694, "step": 615 }, { "epoch": 1.014827018121911, "grad_norm": 0.6190693559478082, "learning_rate": 7.745124228403412e-05, "loss": 0.4761, "step": 616 }, { "epoch": 1.016474464579901, "grad_norm": 0.601813106756462, "learning_rate": 7.743505507860535e-05, "loss": 0.4685, "step": 617 }, { "epoch": 1.0181219110378912, "grad_norm": 0.7011852124703152, "learning_rate": 7.741881833559808e-05, "loss": 0.4723, "step": 618 }, { "epoch": 1.0197693574958815, "grad_norm": 0.779779029776774, "learning_rate": 7.740253207649826e-05, "loss": 0.4644, "step": 619 }, { "epoch": 1.0214168039538716, "grad_norm": 0.7504481289647391, "learning_rate": 7.738619632285742e-05, "loss": 0.4724, "step": 620 }, { "epoch": 1.0230642504118617, "grad_norm": 0.6946617275173862, "learning_rate": 7.736981109629257e-05, "loss": 0.4734, "step": 621 }, { "epoch": 1.0247116968698518, "grad_norm": 0.6857934463402909, "learning_rate": 7.735337641848617e-05, "loss": 0.4658, "step": 622 }, { "epoch": 1.0263591433278418, "grad_norm": 0.7078077303655529, "learning_rate": 7.733689231118611e-05, "loss": 0.4758, "step": 623 }, { "epoch": 1.028006589785832, "grad_norm": 0.8285829132063434, "learning_rate": 7.732035879620574e-05, "loss": 0.4783, "step": 624 }, { "epoch": 1.029654036243822, "grad_norm": 0.8431970518805209, "learning_rate": 7.730377589542373e-05, "loss": 0.4725, "step": 625 }, { "epoch": 1.031301482701812, "grad_norm": 0.6906527316405107, "learning_rate": 7.728714363078415e-05, "loss": 0.4717, "step": 626 }, { "epoch": 1.0329489291598024, "grad_norm": 0.5977493370744916, "learning_rate": 7.727046202429637e-05, "loss": 0.4696, "step": 627 }, { "epoch": 1.0345963756177925, "grad_norm": 0.701641922994449, "learning_rate": 7.725373109803504e-05, "loss": 0.469, "step": 628 }, { "epoch": 1.0362438220757826, "grad_norm": 0.9140697231104569, "learning_rate": 7.723695087414013e-05, "loss": 0.4685, "step": 629 }, { "epoch": 1.0378912685337727, "grad_norm": 0.9851872311426808, "learning_rate": 7.722012137481678e-05, "loss": 0.475, "step": 630 }, { "epoch": 1.0395387149917628, "grad_norm": 0.9729452404964329, "learning_rate": 7.720324262233536e-05, "loss": 0.4768, "step": 631 }, { "epoch": 1.0411861614497528, "grad_norm": 0.8055438218006098, "learning_rate": 7.718631463903143e-05, "loss": 0.4753, "step": 632 }, { "epoch": 1.042833607907743, "grad_norm": 0.6251005974159219, "learning_rate": 7.716933744730569e-05, "loss": 0.477, "step": 633 }, { "epoch": 1.044481054365733, "grad_norm": 0.5431479826452262, "learning_rate": 7.715231106962393e-05, "loss": 0.4733, "step": 634 }, { "epoch": 1.0461285008237233, "grad_norm": 0.43336464030263755, "learning_rate": 7.713523552851707e-05, "loss": 0.4745, "step": 635 }, { "epoch": 1.0477759472817134, "grad_norm": 0.5144596771050097, "learning_rate": 7.711811084658108e-05, "loss": 0.4695, "step": 636 }, { "epoch": 1.0494233937397035, "grad_norm": 0.6905510074036125, "learning_rate": 7.710093704647693e-05, "loss": 0.4668, "step": 637 }, { "epoch": 1.0510708401976936, "grad_norm": 0.7613679478225978, "learning_rate": 7.70837141509306e-05, "loss": 0.4695, "step": 638 }, { "epoch": 1.0527182866556837, "grad_norm": 0.7986391370800051, "learning_rate": 7.706644218273305e-05, "loss": 0.4747, "step": 639 }, { "epoch": 1.0543657331136738, "grad_norm": 0.8608263410730893, "learning_rate": 7.704912116474017e-05, "loss": 0.4728, "step": 640 }, { "epoch": 1.0560131795716639, "grad_norm": 0.9381181622506664, "learning_rate": 7.703175111987276e-05, "loss": 0.474, "step": 641 }, { "epoch": 1.057660626029654, "grad_norm": 0.9515232699199122, "learning_rate": 7.701433207111649e-05, "loss": 0.4766, "step": 642 }, { "epoch": 1.0593080724876442, "grad_norm": 0.8376979385107007, "learning_rate": 7.699686404152188e-05, "loss": 0.4737, "step": 643 }, { "epoch": 1.0609555189456343, "grad_norm": 0.5866255360378559, "learning_rate": 7.697934705420425e-05, "loss": 0.4749, "step": 644 }, { "epoch": 1.0626029654036244, "grad_norm": 0.48718101493992105, "learning_rate": 7.696178113234376e-05, "loss": 0.469, "step": 645 }, { "epoch": 1.0642504118616145, "grad_norm": 0.45210691252466306, "learning_rate": 7.694416629918524e-05, "loss": 0.4673, "step": 646 }, { "epoch": 1.0658978583196046, "grad_norm": 0.37945492767436684, "learning_rate": 7.692650257803832e-05, "loss": 0.4643, "step": 647 }, { "epoch": 1.0675453047775947, "grad_norm": 0.3651276367742703, "learning_rate": 7.69087899922773e-05, "loss": 0.4788, "step": 648 }, { "epoch": 1.0691927512355848, "grad_norm": 0.445728042334283, "learning_rate": 7.689102856534113e-05, "loss": 0.4682, "step": 649 }, { "epoch": 1.0708401976935749, "grad_norm": 0.5276117238099935, "learning_rate": 7.687321832073338e-05, "loss": 0.473, "step": 650 }, { "epoch": 1.0724876441515652, "grad_norm": 0.6559209561940338, "learning_rate": 7.685535928202225e-05, "loss": 0.4693, "step": 651 }, { "epoch": 1.0741350906095553, "grad_norm": 0.8176528137430009, "learning_rate": 7.683745147284049e-05, "loss": 0.4743, "step": 652 }, { "epoch": 1.0757825370675453, "grad_norm": 0.8244528868776456, "learning_rate": 7.68194949168854e-05, "loss": 0.47, "step": 653 }, { "epoch": 1.0774299835255354, "grad_norm": 0.7262534631126601, "learning_rate": 7.680148963791878e-05, "loss": 0.4615, "step": 654 }, { "epoch": 1.0790774299835255, "grad_norm": 0.5081916648436801, "learning_rate": 7.67834356597669e-05, "loss": 0.4686, "step": 655 }, { "epoch": 1.0807248764415156, "grad_norm": 0.4021475380864586, "learning_rate": 7.676533300632049e-05, "loss": 0.4718, "step": 656 }, { "epoch": 1.0823723228995057, "grad_norm": 0.42918978947250463, "learning_rate": 7.674718170153468e-05, "loss": 0.4745, "step": 657 }, { "epoch": 1.084019769357496, "grad_norm": 0.43905493633016, "learning_rate": 7.672898176942897e-05, "loss": 0.4677, "step": 658 }, { "epoch": 1.085667215815486, "grad_norm": 0.49561418603810903, "learning_rate": 7.67107332340872e-05, "loss": 0.4763, "step": 659 }, { "epoch": 1.0873146622734762, "grad_norm": 0.4958099254838728, "learning_rate": 7.669243611965759e-05, "loss": 0.4657, "step": 660 }, { "epoch": 1.0889621087314663, "grad_norm": 0.45678207684581273, "learning_rate": 7.667409045035258e-05, "loss": 0.4668, "step": 661 }, { "epoch": 1.0906095551894563, "grad_norm": 0.4002799959893767, "learning_rate": 7.665569625044886e-05, "loss": 0.467, "step": 662 }, { "epoch": 1.0922570016474464, "grad_norm": 0.4596712680497988, "learning_rate": 7.663725354428738e-05, "loss": 0.4759, "step": 663 }, { "epoch": 1.0939044481054365, "grad_norm": 0.6522519880793987, "learning_rate": 7.661876235627326e-05, "loss": 0.465, "step": 664 }, { "epoch": 1.0955518945634266, "grad_norm": 0.779090932307406, "learning_rate": 7.660022271087577e-05, "loss": 0.47, "step": 665 }, { "epoch": 1.0971993410214167, "grad_norm": 0.8758758698504628, "learning_rate": 7.658163463262831e-05, "loss": 0.4771, "step": 666 }, { "epoch": 1.098846787479407, "grad_norm": 0.9254189279937258, "learning_rate": 7.656299814612839e-05, "loss": 0.4698, "step": 667 }, { "epoch": 1.100494233937397, "grad_norm": 0.9028910545837401, "learning_rate": 7.654431327603753e-05, "loss": 0.4775, "step": 668 }, { "epoch": 1.1021416803953872, "grad_norm": 0.715912445107084, "learning_rate": 7.652558004708132e-05, "loss": 0.4806, "step": 669 }, { "epoch": 1.1037891268533773, "grad_norm": 0.594015585463618, "learning_rate": 7.650679848404931e-05, "loss": 0.4746, "step": 670 }, { "epoch": 1.1054365733113674, "grad_norm": 0.6906169224307781, "learning_rate": 7.648796861179505e-05, "loss": 0.4736, "step": 671 }, { "epoch": 1.1070840197693574, "grad_norm": 0.8995208926152166, "learning_rate": 7.646909045523598e-05, "loss": 0.4747, "step": 672 }, { "epoch": 1.1087314662273475, "grad_norm": 0.8942465626948667, "learning_rate": 7.645016403935345e-05, "loss": 0.4714, "step": 673 }, { "epoch": 1.1103789126853378, "grad_norm": 0.6313010277586429, "learning_rate": 7.643118938919267e-05, "loss": 0.4701, "step": 674 }, { "epoch": 1.112026359143328, "grad_norm": 0.5004610394405297, "learning_rate": 7.641216652986268e-05, "loss": 0.4698, "step": 675 }, { "epoch": 1.113673805601318, "grad_norm": 0.5174291237066692, "learning_rate": 7.639309548653632e-05, "loss": 0.4705, "step": 676 }, { "epoch": 1.115321252059308, "grad_norm": 0.49542524870771065, "learning_rate": 7.637397628445017e-05, "loss": 0.4652, "step": 677 }, { "epoch": 1.1169686985172982, "grad_norm": 0.4929277491644954, "learning_rate": 7.635480894890455e-05, "loss": 0.4764, "step": 678 }, { "epoch": 1.1186161449752883, "grad_norm": 0.4810857440874895, "learning_rate": 7.63355935052635e-05, "loss": 0.4754, "step": 679 }, { "epoch": 1.1202635914332784, "grad_norm": 0.48903324157751643, "learning_rate": 7.631632997895469e-05, "loss": 0.4773, "step": 680 }, { "epoch": 1.1219110378912684, "grad_norm": 0.5191344533535098, "learning_rate": 7.629701839546941e-05, "loss": 0.4674, "step": 681 }, { "epoch": 1.1235584843492585, "grad_norm": 0.583639488357799, "learning_rate": 7.627765878036258e-05, "loss": 0.4776, "step": 682 }, { "epoch": 1.1252059308072488, "grad_norm": 0.7586018510359961, "learning_rate": 7.625825115925266e-05, "loss": 0.4761, "step": 683 }, { "epoch": 1.126853377265239, "grad_norm": 1.0102348415709785, "learning_rate": 7.62387955578216e-05, "loss": 0.4744, "step": 684 }, { "epoch": 1.128500823723229, "grad_norm": 1.2904628031935859, "learning_rate": 7.621929200181492e-05, "loss": 0.4747, "step": 685 }, { "epoch": 1.130148270181219, "grad_norm": 0.7085839542396242, "learning_rate": 7.619974051704155e-05, "loss": 0.472, "step": 686 }, { "epoch": 1.1317957166392092, "grad_norm": 1.0544713876254228, "learning_rate": 7.618014112937383e-05, "loss": 0.4756, "step": 687 }, { "epoch": 1.1334431630971993, "grad_norm": 1.1559487272427895, "learning_rate": 7.616049386474752e-05, "loss": 0.4804, "step": 688 }, { "epoch": 1.1350906095551894, "grad_norm": 0.9262798149511274, "learning_rate": 7.614079874916173e-05, "loss": 0.4683, "step": 689 }, { "epoch": 1.1367380560131797, "grad_norm": 1.0682191367928102, "learning_rate": 7.612105580867887e-05, "loss": 0.4759, "step": 690 }, { "epoch": 1.1383855024711698, "grad_norm": 0.7637450158176542, "learning_rate": 7.610126506942467e-05, "loss": 0.4732, "step": 691 }, { "epoch": 1.1400329489291599, "grad_norm": 0.8697003092246957, "learning_rate": 7.608142655758809e-05, "loss": 0.4743, "step": 692 }, { "epoch": 1.14168039538715, "grad_norm": 0.70401029054979, "learning_rate": 7.606154029942131e-05, "loss": 0.4721, "step": 693 }, { "epoch": 1.14332784184514, "grad_norm": 0.6635785252983647, "learning_rate": 7.604160632123967e-05, "loss": 0.4782, "step": 694 }, { "epoch": 1.1449752883031301, "grad_norm": 0.6589312286369478, "learning_rate": 7.602162464942173e-05, "loss": 0.4755, "step": 695 }, { "epoch": 1.1466227347611202, "grad_norm": 0.5832789811053308, "learning_rate": 7.600159531040906e-05, "loss": 0.4725, "step": 696 }, { "epoch": 1.1482701812191103, "grad_norm": 0.6497644130875783, "learning_rate": 7.598151833070639e-05, "loss": 0.4701, "step": 697 }, { "epoch": 1.1499176276771004, "grad_norm": 0.6345333740476443, "learning_rate": 7.596139373688146e-05, "loss": 0.474, "step": 698 }, { "epoch": 1.1515650741350907, "grad_norm": 0.5542607805766171, "learning_rate": 7.594122155556503e-05, "loss": 0.4738, "step": 699 }, { "epoch": 1.1532125205930808, "grad_norm": 0.4148709803166888, "learning_rate": 7.592100181345082e-05, "loss": 0.4778, "step": 700 }, { "epoch": 1.1548599670510709, "grad_norm": 0.42328619360246555, "learning_rate": 7.590073453729548e-05, "loss": 0.4677, "step": 701 }, { "epoch": 1.156507413509061, "grad_norm": 0.5338292977653165, "learning_rate": 7.588041975391855e-05, "loss": 0.4664, "step": 702 }, { "epoch": 1.158154859967051, "grad_norm": 0.4473760354460952, "learning_rate": 7.586005749020252e-05, "loss": 0.4727, "step": 703 }, { "epoch": 1.1598023064250411, "grad_norm": 0.44835530765928217, "learning_rate": 7.583964777309262e-05, "loss": 0.4659, "step": 704 }, { "epoch": 1.1614497528830312, "grad_norm": 0.4274663369635527, "learning_rate": 7.581919062959687e-05, "loss": 0.466, "step": 705 }, { "epoch": 1.1630971993410215, "grad_norm": 0.4372945639089772, "learning_rate": 7.579868608678612e-05, "loss": 0.4746, "step": 706 }, { "epoch": 1.1647446457990116, "grad_norm": 0.46261945907025515, "learning_rate": 7.577813417179389e-05, "loss": 0.4689, "step": 707 }, { "epoch": 1.1663920922570017, "grad_norm": 0.4979458653431761, "learning_rate": 7.575753491181642e-05, "loss": 0.4677, "step": 708 }, { "epoch": 1.1680395387149918, "grad_norm": 0.45611335358615795, "learning_rate": 7.573688833411258e-05, "loss": 0.4635, "step": 709 }, { "epoch": 1.1696869851729819, "grad_norm": 0.5101370066381531, "learning_rate": 7.571619446600384e-05, "loss": 0.467, "step": 710 }, { "epoch": 1.171334431630972, "grad_norm": 0.5960776231432636, "learning_rate": 7.569545333487428e-05, "loss": 0.4791, "step": 711 }, { "epoch": 1.172981878088962, "grad_norm": 0.7103692613688294, "learning_rate": 7.567466496817051e-05, "loss": 0.4772, "step": 712 }, { "epoch": 1.1746293245469523, "grad_norm": 0.8570176625108956, "learning_rate": 7.565382939340164e-05, "loss": 0.4792, "step": 713 }, { "epoch": 1.1762767710049424, "grad_norm": 0.8452965277389102, "learning_rate": 7.563294663813929e-05, "loss": 0.4758, "step": 714 }, { "epoch": 1.1779242174629325, "grad_norm": 0.8049148568157238, "learning_rate": 7.561201673001746e-05, "loss": 0.4597, "step": 715 }, { "epoch": 1.1795716639209226, "grad_norm": 0.7493449989727806, "learning_rate": 7.559103969673257e-05, "loss": 0.474, "step": 716 }, { "epoch": 1.1812191103789127, "grad_norm": 0.6615501006955038, "learning_rate": 7.557001556604338e-05, "loss": 0.4668, "step": 717 }, { "epoch": 1.1828665568369028, "grad_norm": 0.6212178289721325, "learning_rate": 7.554894436577104e-05, "loss": 0.4671, "step": 718 }, { "epoch": 1.1845140032948929, "grad_norm": 0.5107768081615388, "learning_rate": 7.552782612379888e-05, "loss": 0.4677, "step": 719 }, { "epoch": 1.186161449752883, "grad_norm": 0.45139740896926017, "learning_rate": 7.550666086807258e-05, "loss": 0.4676, "step": 720 }, { "epoch": 1.187808896210873, "grad_norm": 0.5351945313630997, "learning_rate": 7.54854486266e-05, "loss": 0.4679, "step": 721 }, { "epoch": 1.1894563426688634, "grad_norm": 0.6129299949093029, "learning_rate": 7.546418942745114e-05, "loss": 0.4794, "step": 722 }, { "epoch": 1.1911037891268534, "grad_norm": 0.6528570723937129, "learning_rate": 7.544288329875818e-05, "loss": 0.4642, "step": 723 }, { "epoch": 1.1927512355848435, "grad_norm": 0.665994730681475, "learning_rate": 7.54215302687154e-05, "loss": 0.4643, "step": 724 }, { "epoch": 1.1943986820428336, "grad_norm": 0.6291440966482923, "learning_rate": 7.540013036557911e-05, "loss": 0.468, "step": 725 }, { "epoch": 1.1960461285008237, "grad_norm": 0.6470151440084622, "learning_rate": 7.537868361766769e-05, "loss": 0.4685, "step": 726 }, { "epoch": 1.1976935749588138, "grad_norm": 0.6507905800755579, "learning_rate": 7.535719005336151e-05, "loss": 0.4628, "step": 727 }, { "epoch": 1.1993410214168039, "grad_norm": 0.6454339652951312, "learning_rate": 7.533564970110281e-05, "loss": 0.4716, "step": 728 }, { "epoch": 1.2009884678747942, "grad_norm": 0.6455336237910918, "learning_rate": 7.531406258939586e-05, "loss": 0.4661, "step": 729 }, { "epoch": 1.2026359143327843, "grad_norm": 0.5797507506842737, "learning_rate": 7.529242874680672e-05, "loss": 0.4741, "step": 730 }, { "epoch": 1.2042833607907744, "grad_norm": 0.45533080335699033, "learning_rate": 7.527074820196333e-05, "loss": 0.4632, "step": 731 }, { "epoch": 1.2059308072487644, "grad_norm": 0.3579248391677038, "learning_rate": 7.524902098355545e-05, "loss": 0.4662, "step": 732 }, { "epoch": 1.2075782537067545, "grad_norm": 0.37763457169681347, "learning_rate": 7.522724712033453e-05, "loss": 0.47, "step": 733 }, { "epoch": 1.2092257001647446, "grad_norm": 0.4831138890918233, "learning_rate": 7.520542664111383e-05, "loss": 0.4733, "step": 734 }, { "epoch": 1.2108731466227347, "grad_norm": 0.5098333013955642, "learning_rate": 7.518355957476824e-05, "loss": 0.4733, "step": 735 }, { "epoch": 1.2125205930807248, "grad_norm": 0.5166085902269995, "learning_rate": 7.51616459502343e-05, "loss": 0.4695, "step": 736 }, { "epoch": 1.2141680395387149, "grad_norm": 0.4578522294933931, "learning_rate": 7.513968579651018e-05, "loss": 0.4728, "step": 737 }, { "epoch": 1.2158154859967052, "grad_norm": 0.4995329305409402, "learning_rate": 7.511767914265563e-05, "loss": 0.4686, "step": 738 }, { "epoch": 1.2174629324546953, "grad_norm": 0.6466658043607805, "learning_rate": 7.509562601779191e-05, "loss": 0.4772, "step": 739 }, { "epoch": 1.2191103789126854, "grad_norm": 0.6920768713310551, "learning_rate": 7.507352645110179e-05, "loss": 0.4695, "step": 740 }, { "epoch": 1.2207578253706755, "grad_norm": 0.5753375021345695, "learning_rate": 7.505138047182948e-05, "loss": 0.4617, "step": 741 }, { "epoch": 1.2224052718286655, "grad_norm": 0.49464228217929745, "learning_rate": 7.502918810928064e-05, "loss": 0.4717, "step": 742 }, { "epoch": 1.2240527182866556, "grad_norm": 0.5551739997462773, "learning_rate": 7.500694939282229e-05, "loss": 0.4697, "step": 743 }, { "epoch": 1.2257001647446457, "grad_norm": 0.6265137299662427, "learning_rate": 7.498466435188274e-05, "loss": 0.4601, "step": 744 }, { "epoch": 1.227347611202636, "grad_norm": 0.6971545568714137, "learning_rate": 7.496233301595169e-05, "loss": 0.4788, "step": 745 }, { "epoch": 1.2289950576606261, "grad_norm": 0.7167998826305005, "learning_rate": 7.493995541458005e-05, "loss": 0.4675, "step": 746 }, { "epoch": 1.2306425041186162, "grad_norm": 0.6592300269230261, "learning_rate": 7.491753157737996e-05, "loss": 0.4606, "step": 747 }, { "epoch": 1.2322899505766063, "grad_norm": 0.5988959428446587, "learning_rate": 7.489506153402474e-05, "loss": 0.4694, "step": 748 }, { "epoch": 1.2339373970345964, "grad_norm": 0.5747144404504834, "learning_rate": 7.487254531424889e-05, "loss": 0.4714, "step": 749 }, { "epoch": 1.2355848434925865, "grad_norm": 0.6497532597600497, "learning_rate": 7.484998294784794e-05, "loss": 0.4678, "step": 750 }, { "epoch": 1.2372322899505765, "grad_norm": 0.7180518675038088, "learning_rate": 7.482737446467855e-05, "loss": 0.47, "step": 751 }, { "epoch": 1.2388797364085666, "grad_norm": 0.6345224677445507, "learning_rate": 7.480471989465841e-05, "loss": 0.4672, "step": 752 }, { "epoch": 1.2405271828665567, "grad_norm": 0.6794971757734701, "learning_rate": 7.478201926776615e-05, "loss": 0.4683, "step": 753 }, { "epoch": 1.242174629324547, "grad_norm": 0.8463029789927891, "learning_rate": 7.475927261404136e-05, "loss": 0.4692, "step": 754 }, { "epoch": 1.2438220757825371, "grad_norm": 0.8175680664124694, "learning_rate": 7.473647996358458e-05, "loss": 0.4691, "step": 755 }, { "epoch": 1.2454695222405272, "grad_norm": 0.6305931401480672, "learning_rate": 7.471364134655716e-05, "loss": 0.4736, "step": 756 }, { "epoch": 1.2471169686985173, "grad_norm": 0.4397078700221855, "learning_rate": 7.469075679318131e-05, "loss": 0.472, "step": 757 }, { "epoch": 1.2487644151565074, "grad_norm": 0.3214344580533025, "learning_rate": 7.466782633374001e-05, "loss": 0.4752, "step": 758 }, { "epoch": 1.2504118616144975, "grad_norm": 0.42648710107084675, "learning_rate": 7.4644849998577e-05, "loss": 0.4625, "step": 759 }, { "epoch": 1.2520593080724876, "grad_norm": 0.4782348864902326, "learning_rate": 7.462182781809673e-05, "loss": 0.4621, "step": 760 }, { "epoch": 1.2537067545304779, "grad_norm": 0.5126855073002898, "learning_rate": 7.459875982276431e-05, "loss": 0.4748, "step": 761 }, { "epoch": 1.255354200988468, "grad_norm": 0.5054562655380781, "learning_rate": 7.457564604310547e-05, "loss": 0.4712, "step": 762 }, { "epoch": 1.257001647446458, "grad_norm": 0.4899176201948403, "learning_rate": 7.455248650970653e-05, "loss": 0.4784, "step": 763 }, { "epoch": 1.2586490939044481, "grad_norm": 0.4154742634196543, "learning_rate": 7.452928125321438e-05, "loss": 0.4675, "step": 764 }, { "epoch": 1.2602965403624382, "grad_norm": 0.35146092081798386, "learning_rate": 7.450603030433636e-05, "loss": 0.4723, "step": 765 }, { "epoch": 1.2619439868204283, "grad_norm": 0.2593912528758498, "learning_rate": 7.448273369384034e-05, "loss": 0.4695, "step": 766 }, { "epoch": 1.2635914332784184, "grad_norm": 0.3304020200948066, "learning_rate": 7.445939145255455e-05, "loss": 0.4755, "step": 767 }, { "epoch": 1.2652388797364087, "grad_norm": 0.5002331439270244, "learning_rate": 7.443600361136765e-05, "loss": 0.4713, "step": 768 }, { "epoch": 1.2668863261943986, "grad_norm": 0.5437208703602878, "learning_rate": 7.441257020122865e-05, "loss": 0.4673, "step": 769 }, { "epoch": 1.2685337726523889, "grad_norm": 0.6049125052547121, "learning_rate": 7.438909125314679e-05, "loss": 0.4698, "step": 770 }, { "epoch": 1.270181219110379, "grad_norm": 0.6643016543557881, "learning_rate": 7.436556679819166e-05, "loss": 0.4649, "step": 771 }, { "epoch": 1.271828665568369, "grad_norm": 0.7566146016037285, "learning_rate": 7.434199686749301e-05, "loss": 0.4739, "step": 772 }, { "epoch": 1.2734761120263591, "grad_norm": 0.7705538256612012, "learning_rate": 7.431838149224078e-05, "loss": 0.4705, "step": 773 }, { "epoch": 1.2751235584843492, "grad_norm": 0.8518717290385467, "learning_rate": 7.429472070368505e-05, "loss": 0.4762, "step": 774 }, { "epoch": 1.2767710049423393, "grad_norm": 0.9421602787522935, "learning_rate": 7.427101453313601e-05, "loss": 0.4702, "step": 775 }, { "epoch": 1.2784184514003294, "grad_norm": 0.9476088862102958, "learning_rate": 7.424726301196387e-05, "loss": 0.4639, "step": 776 }, { "epoch": 1.2800658978583197, "grad_norm": 0.7753659736325391, "learning_rate": 7.42234661715989e-05, "loss": 0.4703, "step": 777 }, { "epoch": 1.2817133443163098, "grad_norm": 0.5364638516966361, "learning_rate": 7.41996240435313e-05, "loss": 0.4587, "step": 778 }, { "epoch": 1.2833607907742999, "grad_norm": 0.42014949722390876, "learning_rate": 7.417573665931119e-05, "loss": 0.4743, "step": 779 }, { "epoch": 1.28500823723229, "grad_norm": 0.5198343054666379, "learning_rate": 7.415180405054862e-05, "loss": 0.4695, "step": 780 }, { "epoch": 1.28665568369028, "grad_norm": 0.6088368359857622, "learning_rate": 7.412782624891346e-05, "loss": 0.4697, "step": 781 }, { "epoch": 1.2883031301482701, "grad_norm": 0.5896132922814521, "learning_rate": 7.410380328613541e-05, "loss": 0.4754, "step": 782 }, { "epoch": 1.2899505766062602, "grad_norm": 0.5538287205578329, "learning_rate": 7.407973519400387e-05, "loss": 0.4651, "step": 783 }, { "epoch": 1.2915980230642505, "grad_norm": 0.4984495505027331, "learning_rate": 7.405562200436802e-05, "loss": 0.463, "step": 784 }, { "epoch": 1.2932454695222404, "grad_norm": 0.429012466573384, "learning_rate": 7.403146374913668e-05, "loss": 0.4689, "step": 785 }, { "epoch": 1.2948929159802307, "grad_norm": 0.4136496832136937, "learning_rate": 7.400726046027836e-05, "loss": 0.4665, "step": 786 }, { "epoch": 1.2965403624382208, "grad_norm": 0.46294257604944544, "learning_rate": 7.398301216982107e-05, "loss": 0.4621, "step": 787 }, { "epoch": 1.2981878088962109, "grad_norm": 0.48281436180804815, "learning_rate": 7.395871890985246e-05, "loss": 0.4698, "step": 788 }, { "epoch": 1.299835255354201, "grad_norm": 0.5087577795960315, "learning_rate": 7.393438071251967e-05, "loss": 0.4757, "step": 789 }, { "epoch": 1.301482701812191, "grad_norm": 0.5418467551685757, "learning_rate": 7.390999761002923e-05, "loss": 0.4738, "step": 790 }, { "epoch": 1.3031301482701811, "grad_norm": 0.6143085586717707, "learning_rate": 7.388556963464718e-05, "loss": 0.4648, "step": 791 }, { "epoch": 1.3047775947281712, "grad_norm": 0.7391844183599157, "learning_rate": 7.386109681869891e-05, "loss": 0.475, "step": 792 }, { "epoch": 1.3064250411861615, "grad_norm": 0.87290265719599, "learning_rate": 7.383657919456915e-05, "loss": 0.4744, "step": 793 }, { "epoch": 1.3080724876441516, "grad_norm": 0.9330849217299079, "learning_rate": 7.381201679470192e-05, "loss": 0.4649, "step": 794 }, { "epoch": 1.3097199341021417, "grad_norm": 0.8150777687514055, "learning_rate": 7.378740965160049e-05, "loss": 0.47, "step": 795 }, { "epoch": 1.3113673805601318, "grad_norm": 0.575722843822115, "learning_rate": 7.376275779782734e-05, "loss": 0.4773, "step": 796 }, { "epoch": 1.3130148270181219, "grad_norm": 0.5049248098515596, "learning_rate": 7.373806126600412e-05, "loss": 0.4672, "step": 797 }, { "epoch": 1.314662273476112, "grad_norm": 0.5804995345060519, "learning_rate": 7.37133200888116e-05, "loss": 0.4668, "step": 798 }, { "epoch": 1.316309719934102, "grad_norm": 0.5693831669255286, "learning_rate": 7.368853429898962e-05, "loss": 0.4627, "step": 799 }, { "epoch": 1.3179571663920924, "grad_norm": 0.5644125867767062, "learning_rate": 7.366370392933708e-05, "loss": 0.4674, "step": 800 }, { "epoch": 1.3196046128500822, "grad_norm": 0.5417820144789035, "learning_rate": 7.363882901271183e-05, "loss": 0.4657, "step": 801 }, { "epoch": 1.3212520593080725, "grad_norm": 0.4865420460302095, "learning_rate": 7.361390958203071e-05, "loss": 0.4775, "step": 802 }, { "epoch": 1.3228995057660626, "grad_norm": 0.5759168321293057, "learning_rate": 7.358894567026944e-05, "loss": 0.4719, "step": 803 }, { "epoch": 1.3245469522240527, "grad_norm": 0.5236586367865232, "learning_rate": 7.356393731046264e-05, "loss": 0.4677, "step": 804 }, { "epoch": 1.3261943986820428, "grad_norm": 0.3960020898522966, "learning_rate": 7.353888453570369e-05, "loss": 0.4672, "step": 805 }, { "epoch": 1.327841845140033, "grad_norm": 0.3576961763144241, "learning_rate": 7.35137873791448e-05, "loss": 0.4616, "step": 806 }, { "epoch": 1.329489291598023, "grad_norm": 0.38439297639330294, "learning_rate": 7.348864587399683e-05, "loss": 0.4824, "step": 807 }, { "epoch": 1.331136738056013, "grad_norm": 0.3557478554060588, "learning_rate": 7.346346005352944e-05, "loss": 0.4702, "step": 808 }, { "epoch": 1.3327841845140034, "grad_norm": 0.33621531141359845, "learning_rate": 7.343822995107084e-05, "loss": 0.4649, "step": 809 }, { "epoch": 1.3344316309719935, "grad_norm": 0.3207730023088494, "learning_rate": 7.341295560000786e-05, "loss": 0.4707, "step": 810 }, { "epoch": 1.3360790774299836, "grad_norm": 0.3279281793608847, "learning_rate": 7.338763703378593e-05, "loss": 0.4751, "step": 811 }, { "epoch": 1.3377265238879736, "grad_norm": 0.37983040587927613, "learning_rate": 7.336227428590892e-05, "loss": 0.464, "step": 812 }, { "epoch": 1.3393739703459637, "grad_norm": 0.4356095449814453, "learning_rate": 7.333686738993923e-05, "loss": 0.4661, "step": 813 }, { "epoch": 1.3410214168039538, "grad_norm": 0.44004321795679346, "learning_rate": 7.33114163794976e-05, "loss": 0.4693, "step": 814 }, { "epoch": 1.342668863261944, "grad_norm": 0.47539585501049203, "learning_rate": 7.328592128826325e-05, "loss": 0.461, "step": 815 }, { "epoch": 1.3443163097199342, "grad_norm": 0.5656531631952192, "learning_rate": 7.326038214997365e-05, "loss": 0.4639, "step": 816 }, { "epoch": 1.3459637561779243, "grad_norm": 0.650059097224674, "learning_rate": 7.323479899842458e-05, "loss": 0.4774, "step": 817 }, { "epoch": 1.3476112026359144, "grad_norm": 0.7420899028278646, "learning_rate": 7.320917186747008e-05, "loss": 0.4679, "step": 818 }, { "epoch": 1.3492586490939045, "grad_norm": 0.7601239742697705, "learning_rate": 7.318350079102237e-05, "loss": 0.457, "step": 819 }, { "epoch": 1.3509060955518946, "grad_norm": 0.6700389301417483, "learning_rate": 7.315778580305184e-05, "loss": 0.4689, "step": 820 }, { "epoch": 1.3525535420098846, "grad_norm": 0.5561571030084574, "learning_rate": 7.313202693758694e-05, "loss": 0.4608, "step": 821 }, { "epoch": 1.3542009884678747, "grad_norm": 0.5166731090423742, "learning_rate": 7.310622422871426e-05, "loss": 0.4741, "step": 822 }, { "epoch": 1.355848434925865, "grad_norm": 0.6184405959941028, "learning_rate": 7.308037771057836e-05, "loss": 0.4718, "step": 823 }, { "epoch": 1.357495881383855, "grad_norm": 0.6820773417606982, "learning_rate": 7.305448741738175e-05, "loss": 0.4648, "step": 824 }, { "epoch": 1.3591433278418452, "grad_norm": 0.6975516800299573, "learning_rate": 7.302855338338491e-05, "loss": 0.4593, "step": 825 }, { "epoch": 1.3607907742998353, "grad_norm": 0.7099554392355418, "learning_rate": 7.30025756429062e-05, "loss": 0.4659, "step": 826 }, { "epoch": 1.3624382207578254, "grad_norm": 0.5859583280626293, "learning_rate": 7.297655423032181e-05, "loss": 0.47, "step": 827 }, { "epoch": 1.3640856672158155, "grad_norm": 0.42912452089421177, "learning_rate": 7.295048918006569e-05, "loss": 0.4698, "step": 828 }, { "epoch": 1.3657331136738056, "grad_norm": 0.3785695687926479, "learning_rate": 7.292438052662956e-05, "loss": 0.4686, "step": 829 }, { "epoch": 1.3673805601317957, "grad_norm": 0.4414042811564011, "learning_rate": 7.289822830456287e-05, "loss": 0.4672, "step": 830 }, { "epoch": 1.3690280065897857, "grad_norm": 0.5827607583146401, "learning_rate": 7.287203254847267e-05, "loss": 0.4704, "step": 831 }, { "epoch": 1.370675453047776, "grad_norm": 0.5837719495973097, "learning_rate": 7.284579329302365e-05, "loss": 0.4683, "step": 832 }, { "epoch": 1.3723228995057661, "grad_norm": 0.42419745001021847, "learning_rate": 7.281951057293808e-05, "loss": 0.4746, "step": 833 }, { "epoch": 1.3739703459637562, "grad_norm": 0.3261445619644663, "learning_rate": 7.279318442299567e-05, "loss": 0.4691, "step": 834 }, { "epoch": 1.3756177924217463, "grad_norm": 0.38204961493141, "learning_rate": 7.27668148780337e-05, "loss": 0.4698, "step": 835 }, { "epoch": 1.3772652388797364, "grad_norm": 0.5007399614846824, "learning_rate": 7.274040197294679e-05, "loss": 0.4621, "step": 836 }, { "epoch": 1.3789126853377265, "grad_norm": 0.4609822958537807, "learning_rate": 7.271394574268701e-05, "loss": 0.4727, "step": 837 }, { "epoch": 1.3805601317957166, "grad_norm": 0.39753075085132245, "learning_rate": 7.26874462222637e-05, "loss": 0.465, "step": 838 }, { "epoch": 1.3822075782537069, "grad_norm": 0.4696064782171551, "learning_rate": 7.266090344674354e-05, "loss": 0.4762, "step": 839 }, { "epoch": 1.3838550247116967, "grad_norm": 0.624000107208019, "learning_rate": 7.26343174512504e-05, "loss": 0.4608, "step": 840 }, { "epoch": 1.385502471169687, "grad_norm": 0.7159439712932061, "learning_rate": 7.260768827096536e-05, "loss": 0.4703, "step": 841 }, { "epoch": 1.3871499176276771, "grad_norm": 0.7012627541203976, "learning_rate": 7.258101594112666e-05, "loss": 0.4672, "step": 842 }, { "epoch": 1.3887973640856672, "grad_norm": 0.7032135298693188, "learning_rate": 7.255430049702964e-05, "loss": 0.471, "step": 843 }, { "epoch": 1.3904448105436573, "grad_norm": 0.7361763656025458, "learning_rate": 7.252754197402667e-05, "loss": 0.4777, "step": 844 }, { "epoch": 1.3920922570016474, "grad_norm": 0.7174138394914908, "learning_rate": 7.250074040752715e-05, "loss": 0.4712, "step": 845 }, { "epoch": 1.3937397034596375, "grad_norm": 0.6361612235587935, "learning_rate": 7.247389583299742e-05, "loss": 0.4684, "step": 846 }, { "epoch": 1.3953871499176276, "grad_norm": 0.5590201201254801, "learning_rate": 7.244700828596074e-05, "loss": 0.4647, "step": 847 }, { "epoch": 1.3970345963756179, "grad_norm": 0.5674349582353517, "learning_rate": 7.242007780199723e-05, "loss": 0.4733, "step": 848 }, { "epoch": 1.398682042833608, "grad_norm": 0.6889943956382971, "learning_rate": 7.239310441674386e-05, "loss": 0.4674, "step": 849 }, { "epoch": 1.400329489291598, "grad_norm": 0.7103844070608117, "learning_rate": 7.236608816589432e-05, "loss": 0.4691, "step": 850 }, { "epoch": 1.4019769357495881, "grad_norm": 0.4955158974980977, "learning_rate": 7.233902908519904e-05, "loss": 0.4589, "step": 851 }, { "epoch": 1.4036243822075782, "grad_norm": 0.3962928066309894, "learning_rate": 7.231192721046518e-05, "loss": 0.4682, "step": 852 }, { "epoch": 1.4052718286655683, "grad_norm": 0.5117120980762494, "learning_rate": 7.228478257755643e-05, "loss": 0.4651, "step": 853 }, { "epoch": 1.4069192751235584, "grad_norm": 0.5582976662091699, "learning_rate": 7.225759522239315e-05, "loss": 0.4763, "step": 854 }, { "epoch": 1.4085667215815487, "grad_norm": 0.40824795463441976, "learning_rate": 7.223036518095219e-05, "loss": 0.4721, "step": 855 }, { "epoch": 1.4102141680395386, "grad_norm": 0.37729607421137556, "learning_rate": 7.220309248926692e-05, "loss": 0.4638, "step": 856 }, { "epoch": 1.411861614497529, "grad_norm": 0.47361069397820676, "learning_rate": 7.217577718342708e-05, "loss": 0.4609, "step": 857 }, { "epoch": 1.413509060955519, "grad_norm": 0.5250279727930812, "learning_rate": 7.214841929957888e-05, "loss": 0.4615, "step": 858 }, { "epoch": 1.415156507413509, "grad_norm": 0.628370448814308, "learning_rate": 7.212101887392483e-05, "loss": 0.4693, "step": 859 }, { "epoch": 1.4168039538714992, "grad_norm": 0.6907084742967239, "learning_rate": 7.209357594272375e-05, "loss": 0.4702, "step": 860 }, { "epoch": 1.4184514003294892, "grad_norm": 0.6964726444400706, "learning_rate": 7.206609054229069e-05, "loss": 0.4685, "step": 861 }, { "epoch": 1.4200988467874793, "grad_norm": 0.8087447518362598, "learning_rate": 7.203856270899692e-05, "loss": 0.4733, "step": 862 }, { "epoch": 1.4217462932454694, "grad_norm": 0.9959362991192034, "learning_rate": 7.201099247926985e-05, "loss": 0.4757, "step": 863 }, { "epoch": 1.4233937397034597, "grad_norm": 1.094020510535325, "learning_rate": 7.1983379889593e-05, "loss": 0.4653, "step": 864 }, { "epoch": 1.4250411861614498, "grad_norm": 0.7369728349002672, "learning_rate": 7.195572497650595e-05, "loss": 0.4715, "step": 865 }, { "epoch": 1.42668863261944, "grad_norm": 0.4703383666513801, "learning_rate": 7.192802777660425e-05, "loss": 0.4767, "step": 866 }, { "epoch": 1.42833607907743, "grad_norm": 0.4524780194081046, "learning_rate": 7.190028832653947e-05, "loss": 0.4733, "step": 867 }, { "epoch": 1.42998352553542, "grad_norm": 0.7003851958535532, "learning_rate": 7.1872506663019e-05, "loss": 0.4685, "step": 868 }, { "epoch": 1.4316309719934102, "grad_norm": 0.8504661781231301, "learning_rate": 7.18446828228062e-05, "loss": 0.4757, "step": 869 }, { "epoch": 1.4332784184514002, "grad_norm": 0.7084425413077257, "learning_rate": 7.181681684272016e-05, "loss": 0.4703, "step": 870 }, { "epoch": 1.4349258649093906, "grad_norm": 0.5560197359232483, "learning_rate": 7.178890875963576e-05, "loss": 0.4703, "step": 871 }, { "epoch": 1.4365733113673804, "grad_norm": 0.41652883646714123, "learning_rate": 7.176095861048363e-05, "loss": 0.4678, "step": 872 }, { "epoch": 1.4382207578253707, "grad_norm": 0.3894199488357972, "learning_rate": 7.173296643224997e-05, "loss": 0.4762, "step": 873 }, { "epoch": 1.4398682042833608, "grad_norm": 0.5564014499724059, "learning_rate": 7.17049322619767e-05, "loss": 0.4691, "step": 874 }, { "epoch": 1.441515650741351, "grad_norm": 0.597707881862813, "learning_rate": 7.167685613676124e-05, "loss": 0.4678, "step": 875 }, { "epoch": 1.443163097199341, "grad_norm": 0.49955026358318827, "learning_rate": 7.164873809375658e-05, "loss": 0.4594, "step": 876 }, { "epoch": 1.444810543657331, "grad_norm": 0.36499270433855713, "learning_rate": 7.162057817017114e-05, "loss": 0.4599, "step": 877 }, { "epoch": 1.4464579901153214, "grad_norm": 0.34575130269516346, "learning_rate": 7.159237640326877e-05, "loss": 0.4598, "step": 878 }, { "epoch": 1.4481054365733113, "grad_norm": 0.37109019266795545, "learning_rate": 7.156413283036871e-05, "loss": 0.4565, "step": 879 }, { "epoch": 1.4497528830313016, "grad_norm": 0.37708341426339065, "learning_rate": 7.15358474888455e-05, "loss": 0.4652, "step": 880 }, { "epoch": 1.4514003294892917, "grad_norm": 0.3700168208561387, "learning_rate": 7.150752041612895e-05, "loss": 0.4676, "step": 881 }, { "epoch": 1.4530477759472817, "grad_norm": 0.4019582376620918, "learning_rate": 7.14791516497041e-05, "loss": 0.4662, "step": 882 }, { "epoch": 1.4546952224052718, "grad_norm": 0.39247408388676763, "learning_rate": 7.14507412271112e-05, "loss": 0.4714, "step": 883 }, { "epoch": 1.456342668863262, "grad_norm": 0.4846036570076252, "learning_rate": 7.142228918594553e-05, "loss": 0.4662, "step": 884 }, { "epoch": 1.457990115321252, "grad_norm": 0.6229691621251094, "learning_rate": 7.139379556385753e-05, "loss": 0.4704, "step": 885 }, { "epoch": 1.459637561779242, "grad_norm": 0.601614716322314, "learning_rate": 7.136526039855265e-05, "loss": 0.4672, "step": 886 }, { "epoch": 1.4612850082372324, "grad_norm": 0.6268330423672124, "learning_rate": 7.133668372779129e-05, "loss": 0.4727, "step": 887 }, { "epoch": 1.4629324546952225, "grad_norm": 0.7034650467510409, "learning_rate": 7.130806558938875e-05, "loss": 0.4729, "step": 888 }, { "epoch": 1.4645799011532126, "grad_norm": 0.7685157247547795, "learning_rate": 7.127940602121531e-05, "loss": 0.4732, "step": 889 }, { "epoch": 1.4662273476112027, "grad_norm": 0.8020592444582061, "learning_rate": 7.125070506119592e-05, "loss": 0.4684, "step": 890 }, { "epoch": 1.4678747940691927, "grad_norm": 0.8283770726564867, "learning_rate": 7.122196274731042e-05, "loss": 0.4641, "step": 891 }, { "epoch": 1.4695222405271828, "grad_norm": 0.6638744385097497, "learning_rate": 7.119317911759336e-05, "loss": 0.4699, "step": 892 }, { "epoch": 1.471169686985173, "grad_norm": 0.474251741800077, "learning_rate": 7.11643542101339e-05, "loss": 0.475, "step": 893 }, { "epoch": 1.4728171334431632, "grad_norm": 0.5201518191487113, "learning_rate": 7.113548806307588e-05, "loss": 0.4632, "step": 894 }, { "epoch": 1.474464579901153, "grad_norm": 0.46117636855857463, "learning_rate": 7.110658071461772e-05, "loss": 0.4652, "step": 895 }, { "epoch": 1.4761120263591434, "grad_norm": 0.30676400241384033, "learning_rate": 7.107763220301228e-05, "loss": 0.4629, "step": 896 }, { "epoch": 1.4777594728171335, "grad_norm": 0.4217690499991569, "learning_rate": 7.104864256656699e-05, "loss": 0.4612, "step": 897 }, { "epoch": 1.4794069192751236, "grad_norm": 0.5157388673533592, "learning_rate": 7.101961184364365e-05, "loss": 0.4674, "step": 898 }, { "epoch": 1.4810543657331137, "grad_norm": 0.5182782618912538, "learning_rate": 7.099054007265844e-05, "loss": 0.4661, "step": 899 }, { "epoch": 1.4827018121911038, "grad_norm": 0.394567803517057, "learning_rate": 7.096142729208184e-05, "loss": 0.4634, "step": 900 }, { "epoch": 1.4843492586490938, "grad_norm": 0.3394314088595935, "learning_rate": 7.093227354043864e-05, "loss": 0.4702, "step": 901 }, { "epoch": 1.485996705107084, "grad_norm": 0.40475766502652283, "learning_rate": 7.09030788563078e-05, "loss": 0.4668, "step": 902 }, { "epoch": 1.4876441515650742, "grad_norm": 0.41711171012495163, "learning_rate": 7.087384327832248e-05, "loss": 0.4755, "step": 903 }, { "epoch": 1.4892915980230643, "grad_norm": 0.43194118753499744, "learning_rate": 7.084456684516991e-05, "loss": 0.4628, "step": 904 }, { "epoch": 1.4909390444810544, "grad_norm": 0.48534080081797626, "learning_rate": 7.081524959559146e-05, "loss": 0.4731, "step": 905 }, { "epoch": 1.4925864909390445, "grad_norm": 0.5172428631740005, "learning_rate": 7.078589156838243e-05, "loss": 0.4679, "step": 906 }, { "epoch": 1.4942339373970346, "grad_norm": 0.5093370696766526, "learning_rate": 7.075649280239213e-05, "loss": 0.4685, "step": 907 }, { "epoch": 1.4958813838550247, "grad_norm": 0.5117948444523397, "learning_rate": 7.072705333652377e-05, "loss": 0.469, "step": 908 }, { "epoch": 1.4975288303130148, "grad_norm": 0.5078787582611883, "learning_rate": 7.069757320973442e-05, "loss": 0.4643, "step": 909 }, { "epoch": 1.499176276771005, "grad_norm": 0.539402014492225, "learning_rate": 7.066805246103493e-05, "loss": 0.4582, "step": 910 }, { "epoch": 1.500823723228995, "grad_norm": 0.6652845291140941, "learning_rate": 7.063849112948994e-05, "loss": 0.4616, "step": 911 }, { "epoch": 1.5024711696869852, "grad_norm": 0.7492782363052848, "learning_rate": 7.060888925421777e-05, "loss": 0.4646, "step": 912 }, { "epoch": 1.5041186161449753, "grad_norm": 0.7393465267503111, "learning_rate": 7.057924687439043e-05, "loss": 0.4701, "step": 913 }, { "epoch": 1.5057660626029654, "grad_norm": 0.6886154379341712, "learning_rate": 7.054956402923345e-05, "loss": 0.4761, "step": 914 }, { "epoch": 1.5074135090609555, "grad_norm": 0.5920999943052413, "learning_rate": 7.051984075802599e-05, "loss": 0.4699, "step": 915 }, { "epoch": 1.5090609555189456, "grad_norm": 0.4931403228206967, "learning_rate": 7.049007710010067e-05, "loss": 0.4671, "step": 916 }, { "epoch": 1.510708401976936, "grad_norm": 0.474621414788173, "learning_rate": 7.046027309484353e-05, "loss": 0.4626, "step": 917 }, { "epoch": 1.5123558484349258, "grad_norm": 0.4289785651886632, "learning_rate": 7.043042878169407e-05, "loss": 0.4686, "step": 918 }, { "epoch": 1.514003294892916, "grad_norm": 0.4691170350626838, "learning_rate": 7.040054420014506e-05, "loss": 0.4682, "step": 919 }, { "epoch": 1.515650741350906, "grad_norm": 0.47460415991864763, "learning_rate": 7.037061938974259e-05, "loss": 0.4656, "step": 920 }, { "epoch": 1.5172981878088962, "grad_norm": 0.4903005948926247, "learning_rate": 7.034065439008595e-05, "loss": 0.4711, "step": 921 }, { "epoch": 1.5189456342668863, "grad_norm": 0.4661840606035265, "learning_rate": 7.03106492408277e-05, "loss": 0.4671, "step": 922 }, { "epoch": 1.5205930807248764, "grad_norm": 0.4278864217645539, "learning_rate": 7.02806039816734e-05, "loss": 0.4707, "step": 923 }, { "epoch": 1.5222405271828665, "grad_norm": 0.3152252578246817, "learning_rate": 7.025051865238181e-05, "loss": 0.4612, "step": 924 }, { "epoch": 1.5238879736408566, "grad_norm": 0.35262324307113035, "learning_rate": 7.022039329276464e-05, "loss": 0.4626, "step": 925 }, { "epoch": 1.525535420098847, "grad_norm": 0.48093222971460076, "learning_rate": 7.01902279426866e-05, "loss": 0.4688, "step": 926 }, { "epoch": 1.5271828665568368, "grad_norm": 0.5491120163051847, "learning_rate": 7.01600226420653e-05, "loss": 0.4629, "step": 927 }, { "epoch": 1.528830313014827, "grad_norm": 0.5847560499512879, "learning_rate": 7.012977743087123e-05, "loss": 0.465, "step": 928 }, { "epoch": 1.5304777594728172, "grad_norm": 0.609825561809004, "learning_rate": 7.009949234912772e-05, "loss": 0.4687, "step": 929 }, { "epoch": 1.5321252059308073, "grad_norm": 0.6347720131273823, "learning_rate": 7.00691674369108e-05, "loss": 0.4576, "step": 930 }, { "epoch": 1.5337726523887973, "grad_norm": 0.717106409197699, "learning_rate": 7.003880273434926e-05, "loss": 0.4652, "step": 931 }, { "epoch": 1.5354200988467874, "grad_norm": 0.8668474635819055, "learning_rate": 7.00083982816245e-05, "loss": 0.4661, "step": 932 }, { "epoch": 1.5370675453047777, "grad_norm": 0.8371671010184105, "learning_rate": 6.997795411897057e-05, "loss": 0.4643, "step": 933 }, { "epoch": 1.5387149917627676, "grad_norm": 0.7783127479873807, "learning_rate": 6.994747028667405e-05, "loss": 0.4634, "step": 934 }, { "epoch": 1.540362438220758, "grad_norm": 0.6519103778594328, "learning_rate": 6.991694682507399e-05, "loss": 0.4642, "step": 935 }, { "epoch": 1.5420098846787478, "grad_norm": 0.49704909318997115, "learning_rate": 6.98863837745619e-05, "loss": 0.4659, "step": 936 }, { "epoch": 1.543657331136738, "grad_norm": 0.45164358680737265, "learning_rate": 6.985578117558167e-05, "loss": 0.4619, "step": 937 }, { "epoch": 1.5453047775947282, "grad_norm": 0.5662186711294476, "learning_rate": 6.982513906862955e-05, "loss": 0.4644, "step": 938 }, { "epoch": 1.5469522240527183, "grad_norm": 0.6397040401330021, "learning_rate": 6.979445749425404e-05, "loss": 0.4636, "step": 939 }, { "epoch": 1.5485996705107083, "grad_norm": 0.5817020535491507, "learning_rate": 6.97637364930559e-05, "loss": 0.4754, "step": 940 }, { "epoch": 1.5502471169686984, "grad_norm": 0.4355687425973465, "learning_rate": 6.9732976105688e-05, "loss": 0.459, "step": 941 }, { "epoch": 1.5518945634266887, "grad_norm": 0.4078123747811945, "learning_rate": 6.970217637285543e-05, "loss": 0.4658, "step": 942 }, { "epoch": 1.5535420098846786, "grad_norm": 0.4168019524760106, "learning_rate": 6.967133733531522e-05, "loss": 0.4678, "step": 943 }, { "epoch": 1.555189456342669, "grad_norm": 0.40693060550142385, "learning_rate": 6.964045903387652e-05, "loss": 0.4584, "step": 944 }, { "epoch": 1.556836902800659, "grad_norm": 0.43277842101582925, "learning_rate": 6.96095415094004e-05, "loss": 0.4664, "step": 945 }, { "epoch": 1.558484349258649, "grad_norm": 0.4754516517719869, "learning_rate": 6.957858480279982e-05, "loss": 0.4663, "step": 946 }, { "epoch": 1.5601317957166392, "grad_norm": 0.5335947063054358, "learning_rate": 6.95475889550396e-05, "loss": 0.4648, "step": 947 }, { "epoch": 1.5617792421746293, "grad_norm": 0.5479553541100616, "learning_rate": 6.951655400713634e-05, "loss": 0.4663, "step": 948 }, { "epoch": 1.5634266886326196, "grad_norm": 0.5453680705643691, "learning_rate": 6.948548000015842e-05, "loss": 0.4681, "step": 949 }, { "epoch": 1.5650741350906094, "grad_norm": 0.5951148479025303, "learning_rate": 6.945436697522587e-05, "loss": 0.4749, "step": 950 }, { "epoch": 1.5667215815485998, "grad_norm": 0.5892449087934744, "learning_rate": 6.942321497351038e-05, "loss": 0.4668, "step": 951 }, { "epoch": 1.5683690280065898, "grad_norm": 0.5047248564876605, "learning_rate": 6.939202403623519e-05, "loss": 0.4601, "step": 952 }, { "epoch": 1.57001647446458, "grad_norm": 0.432024017202012, "learning_rate": 6.936079420467506e-05, "loss": 0.4696, "step": 953 }, { "epoch": 1.57166392092257, "grad_norm": 0.44838271295810767, "learning_rate": 6.932952552015627e-05, "loss": 0.4616, "step": 954 }, { "epoch": 1.57331136738056, "grad_norm": 0.4584692095997759, "learning_rate": 6.929821802405645e-05, "loss": 0.4578, "step": 955 }, { "epoch": 1.5749588138385504, "grad_norm": 0.44527918580000864, "learning_rate": 6.926687175780463e-05, "loss": 0.4661, "step": 956 }, { "epoch": 1.5766062602965403, "grad_norm": 0.43816138860936055, "learning_rate": 6.923548676288112e-05, "loss": 0.4695, "step": 957 }, { "epoch": 1.5782537067545306, "grad_norm": 0.5177925876568732, "learning_rate": 6.92040630808175e-05, "loss": 0.4684, "step": 958 }, { "epoch": 1.5799011532125204, "grad_norm": 0.7515864647569738, "learning_rate": 6.917260075319655e-05, "loss": 0.4697, "step": 959 }, { "epoch": 1.5815485996705108, "grad_norm": 0.8567002481829906, "learning_rate": 6.914109982165213e-05, "loss": 0.4623, "step": 960 }, { "epoch": 1.5831960461285008, "grad_norm": 0.8228965559872766, "learning_rate": 6.910956032786929e-05, "loss": 0.4662, "step": 961 }, { "epoch": 1.584843492586491, "grad_norm": 0.7077832468641867, "learning_rate": 6.907798231358401e-05, "loss": 0.4638, "step": 962 }, { "epoch": 1.586490939044481, "grad_norm": 0.49061315397233707, "learning_rate": 6.904636582058327e-05, "loss": 0.4616, "step": 963 }, { "epoch": 1.588138385502471, "grad_norm": 0.547976471880321, "learning_rate": 6.901471089070501e-05, "loss": 0.4588, "step": 964 }, { "epoch": 1.5897858319604614, "grad_norm": 0.6870635839218852, "learning_rate": 6.898301756583801e-05, "loss": 0.4662, "step": 965 }, { "epoch": 1.5914332784184513, "grad_norm": 0.674051175977031, "learning_rate": 6.895128588792183e-05, "loss": 0.4647, "step": 966 }, { "epoch": 1.5930807248764416, "grad_norm": 0.48788512583952287, "learning_rate": 6.891951589894681e-05, "loss": 0.4621, "step": 967 }, { "epoch": 1.5947281713344317, "grad_norm": 0.33427663333353297, "learning_rate": 6.888770764095398e-05, "loss": 0.457, "step": 968 }, { "epoch": 1.5963756177924218, "grad_norm": 0.4183525945318603, "learning_rate": 6.885586115603502e-05, "loss": 0.4716, "step": 969 }, { "epoch": 1.5980230642504119, "grad_norm": 0.4612411851314692, "learning_rate": 6.882397648633218e-05, "loss": 0.4628, "step": 970 }, { "epoch": 1.599670510708402, "grad_norm": 0.37148636467269397, "learning_rate": 6.879205367403826e-05, "loss": 0.4655, "step": 971 }, { "epoch": 1.6013179571663922, "grad_norm": 0.33026344339313024, "learning_rate": 6.87600927613965e-05, "loss": 0.4659, "step": 972 }, { "epoch": 1.6029654036243821, "grad_norm": 0.4247374122514578, "learning_rate": 6.87280937907006e-05, "loss": 0.4571, "step": 973 }, { "epoch": 1.6046128500823724, "grad_norm": 0.5488674801337392, "learning_rate": 6.869605680429459e-05, "loss": 0.4721, "step": 974 }, { "epoch": 1.6062602965403623, "grad_norm": 0.5377641462995398, "learning_rate": 6.866398184457282e-05, "loss": 0.4665, "step": 975 }, { "epoch": 1.6079077429983526, "grad_norm": 0.4596010928397761, "learning_rate": 6.86318689539799e-05, "loss": 0.4651, "step": 976 }, { "epoch": 1.6095551894563427, "grad_norm": 0.42275903413433, "learning_rate": 6.859971817501059e-05, "loss": 0.4671, "step": 977 }, { "epoch": 1.6112026359143328, "grad_norm": 0.43236317657755235, "learning_rate": 6.856752955020986e-05, "loss": 0.4652, "step": 978 }, { "epoch": 1.6128500823723229, "grad_norm": 0.480886726208894, "learning_rate": 6.853530312217267e-05, "loss": 0.4602, "step": 979 }, { "epoch": 1.614497528830313, "grad_norm": 0.5063040990091658, "learning_rate": 6.850303893354411e-05, "loss": 0.4653, "step": 980 }, { "epoch": 1.6161449752883033, "grad_norm": 0.45712726939188625, "learning_rate": 6.847073702701914e-05, "loss": 0.4627, "step": 981 }, { "epoch": 1.6177924217462931, "grad_norm": 0.44458211041437884, "learning_rate": 6.843839744534268e-05, "loss": 0.4677, "step": 982 }, { "epoch": 1.6194398682042834, "grad_norm": 0.4197418835980473, "learning_rate": 6.840602023130953e-05, "loss": 0.469, "step": 983 }, { "epoch": 1.6210873146622735, "grad_norm": 0.35961261260582833, "learning_rate": 6.837360542776421e-05, "loss": 0.4602, "step": 984 }, { "epoch": 1.6227347611202636, "grad_norm": 0.33343848463733167, "learning_rate": 6.834115307760108e-05, "loss": 0.4741, "step": 985 }, { "epoch": 1.6243822075782537, "grad_norm": 0.32435148783862855, "learning_rate": 6.830866322376411e-05, "loss": 0.4651, "step": 986 }, { "epoch": 1.6260296540362438, "grad_norm": 0.4188714632957181, "learning_rate": 6.827613590924692e-05, "loss": 0.4672, "step": 987 }, { "epoch": 1.627677100494234, "grad_norm": 0.5670430056161974, "learning_rate": 6.824357117709271e-05, "loss": 0.4569, "step": 988 }, { "epoch": 1.629324546952224, "grad_norm": 0.6570167129019133, "learning_rate": 6.821096907039421e-05, "loss": 0.4619, "step": 989 }, { "epoch": 1.6309719934102143, "grad_norm": 0.714198820748596, "learning_rate": 6.817832963229356e-05, "loss": 0.4748, "step": 990 }, { "epoch": 1.6326194398682041, "grad_norm": 0.7379340504183475, "learning_rate": 6.814565290598232e-05, "loss": 0.4686, "step": 991 }, { "epoch": 1.6342668863261944, "grad_norm": 0.7165257305094961, "learning_rate": 6.811293893470145e-05, "loss": 0.4621, "step": 992 }, { "epoch": 1.6359143327841845, "grad_norm": 0.618034582638155, "learning_rate": 6.80801877617411e-05, "loss": 0.459, "step": 993 }, { "epoch": 1.6375617792421746, "grad_norm": 0.5079983856180039, "learning_rate": 6.804739943044072e-05, "loss": 0.4644, "step": 994 }, { "epoch": 1.6392092257001647, "grad_norm": 0.44148269369120824, "learning_rate": 6.801457398418889e-05, "loss": 0.4628, "step": 995 }, { "epoch": 1.6408566721581548, "grad_norm": 0.37695011971384307, "learning_rate": 6.798171146642334e-05, "loss": 0.4657, "step": 996 }, { "epoch": 1.642504118616145, "grad_norm": 0.40312157350904987, "learning_rate": 6.794881192063085e-05, "loss": 0.4587, "step": 997 }, { "epoch": 1.644151565074135, "grad_norm": 0.4960257950449315, "learning_rate": 6.791587539034714e-05, "loss": 0.4619, "step": 998 }, { "epoch": 1.6457990115321253, "grad_norm": 0.5814120520455907, "learning_rate": 6.788290191915694e-05, "loss": 0.4614, "step": 999 }, { "epoch": 1.6474464579901154, "grad_norm": 0.548884472890275, "learning_rate": 6.784989155069386e-05, "loss": 0.4574, "step": 1000 }, { "epoch": 1.6490939044481054, "grad_norm": 0.480559222517083, "learning_rate": 6.781684432864032e-05, "loss": 0.4661, "step": 1001 }, { "epoch": 1.6507413509060955, "grad_norm": 0.4187893146491566, "learning_rate": 6.778376029672747e-05, "loss": 0.4659, "step": 1002 }, { "epoch": 1.6523887973640856, "grad_norm": 0.4232072670833389, "learning_rate": 6.775063949873524e-05, "loss": 0.4624, "step": 1003 }, { "epoch": 1.654036243822076, "grad_norm": 0.41902671321029045, "learning_rate": 6.771748197849215e-05, "loss": 0.4672, "step": 1004 }, { "epoch": 1.6556836902800658, "grad_norm": 0.43439418081549896, "learning_rate": 6.768428777987537e-05, "loss": 0.4548, "step": 1005 }, { "epoch": 1.657331136738056, "grad_norm": 0.4301288737681747, "learning_rate": 6.765105694681057e-05, "loss": 0.4643, "step": 1006 }, { "epoch": 1.658978583196046, "grad_norm": 0.46393005596967274, "learning_rate": 6.76177895232719e-05, "loss": 0.4616, "step": 1007 }, { "epoch": 1.6606260296540363, "grad_norm": 0.4001689650101954, "learning_rate": 6.758448555328194e-05, "loss": 0.4639, "step": 1008 }, { "epoch": 1.6622734761120264, "grad_norm": 0.3377385607322078, "learning_rate": 6.755114508091164e-05, "loss": 0.4711, "step": 1009 }, { "epoch": 1.6639209225700164, "grad_norm": 0.30541064673952495, "learning_rate": 6.751776815028024e-05, "loss": 0.4696, "step": 1010 }, { "epoch": 1.6655683690280065, "grad_norm": 0.3401156309536302, "learning_rate": 6.74843548055552e-05, "loss": 0.4603, "step": 1011 }, { "epoch": 1.6672158154859966, "grad_norm": 0.41985461736190594, "learning_rate": 6.745090509095223e-05, "loss": 0.4695, "step": 1012 }, { "epoch": 1.668863261943987, "grad_norm": 0.43322019981159743, "learning_rate": 6.74174190507351e-05, "loss": 0.4619, "step": 1013 }, { "epoch": 1.6705107084019768, "grad_norm": 0.3226543560571379, "learning_rate": 6.738389672921573e-05, "loss": 0.4563, "step": 1014 }, { "epoch": 1.672158154859967, "grad_norm": 0.26988900639328345, "learning_rate": 6.735033817075396e-05, "loss": 0.4659, "step": 1015 }, { "epoch": 1.6738056013179572, "grad_norm": 0.3371234724937315, "learning_rate": 6.731674341975763e-05, "loss": 0.4634, "step": 1016 }, { "epoch": 1.6754530477759473, "grad_norm": 0.3776552137675123, "learning_rate": 6.728311252068247e-05, "loss": 0.4623, "step": 1017 }, { "epoch": 1.6771004942339374, "grad_norm": 0.41248503542061415, "learning_rate": 6.724944551803206e-05, "loss": 0.4559, "step": 1018 }, { "epoch": 1.6787479406919275, "grad_norm": 0.4567086869345682, "learning_rate": 6.72157424563577e-05, "loss": 0.4662, "step": 1019 }, { "epoch": 1.6803953871499178, "grad_norm": 0.468060185457701, "learning_rate": 6.718200338025848e-05, "loss": 0.4629, "step": 1020 }, { "epoch": 1.6820428336079076, "grad_norm": 0.5391295179023253, "learning_rate": 6.71482283343811e-05, "loss": 0.4675, "step": 1021 }, { "epoch": 1.683690280065898, "grad_norm": 0.6598419653157457, "learning_rate": 6.711441736341987e-05, "loss": 0.4601, "step": 1022 }, { "epoch": 1.685337726523888, "grad_norm": 0.765597238661721, "learning_rate": 6.708057051211664e-05, "loss": 0.4576, "step": 1023 }, { "epoch": 1.6869851729818781, "grad_norm": 0.9164604905537336, "learning_rate": 6.704668782526073e-05, "loss": 0.468, "step": 1024 }, { "epoch": 1.6886326194398682, "grad_norm": 1.0426512437940783, "learning_rate": 6.701276934768892e-05, "loss": 0.4627, "step": 1025 }, { "epoch": 1.6902800658978583, "grad_norm": 0.8339245211098827, "learning_rate": 6.697881512428531e-05, "loss": 0.4604, "step": 1026 }, { "epoch": 1.6919275123558486, "grad_norm": 0.5569553385872161, "learning_rate": 6.694482519998129e-05, "loss": 0.4652, "step": 1027 }, { "epoch": 1.6935749588138385, "grad_norm": 0.457761103349558, "learning_rate": 6.691079961975556e-05, "loss": 0.4703, "step": 1028 }, { "epoch": 1.6952224052718288, "grad_norm": 0.5137004667936551, "learning_rate": 6.687673842863393e-05, "loss": 0.4715, "step": 1029 }, { "epoch": 1.6968698517298186, "grad_norm": 0.5402007267471655, "learning_rate": 6.684264167168938e-05, "loss": 0.4668, "step": 1030 }, { "epoch": 1.698517298187809, "grad_norm": 0.5001082415884353, "learning_rate": 6.680850939404194e-05, "loss": 0.4652, "step": 1031 }, { "epoch": 1.700164744645799, "grad_norm": 0.42909289222596947, "learning_rate": 6.677434164085862e-05, "loss": 0.4643, "step": 1032 }, { "epoch": 1.7018121911037891, "grad_norm": 0.417378975967374, "learning_rate": 6.674013845735343e-05, "loss": 0.4671, "step": 1033 }, { "epoch": 1.7034596375617792, "grad_norm": 0.41101635041708123, "learning_rate": 6.670589988878722e-05, "loss": 0.4696, "step": 1034 }, { "epoch": 1.7051070840197693, "grad_norm": 0.37150976646825684, "learning_rate": 6.667162598046767e-05, "loss": 0.4589, "step": 1035 }, { "epoch": 1.7067545304777596, "grad_norm": 0.38117114937248586, "learning_rate": 6.663731677774925e-05, "loss": 0.453, "step": 1036 }, { "epoch": 1.7084019769357495, "grad_norm": 0.4282831672949578, "learning_rate": 6.66029723260331e-05, "loss": 0.4573, "step": 1037 }, { "epoch": 1.7100494233937398, "grad_norm": 0.5379657915384326, "learning_rate": 6.656859267076701e-05, "loss": 0.4666, "step": 1038 }, { "epoch": 1.7116968698517299, "grad_norm": 0.593436001758686, "learning_rate": 6.653417785744542e-05, "loss": 0.4646, "step": 1039 }, { "epoch": 1.71334431630972, "grad_norm": 0.5626980867208237, "learning_rate": 6.649972793160917e-05, "loss": 0.4562, "step": 1040 }, { "epoch": 1.71499176276771, "grad_norm": 0.5944930167299609, "learning_rate": 6.64652429388457e-05, "loss": 0.4624, "step": 1041 }, { "epoch": 1.7166392092257001, "grad_norm": 0.7348412660875432, "learning_rate": 6.643072292478874e-05, "loss": 0.4597, "step": 1042 }, { "epoch": 1.7182866556836904, "grad_norm": 0.6903862440623245, "learning_rate": 6.639616793511845e-05, "loss": 0.4636, "step": 1043 }, { "epoch": 1.7199341021416803, "grad_norm": 0.507722520846339, "learning_rate": 6.636157801556122e-05, "loss": 0.4597, "step": 1044 }, { "epoch": 1.7215815485996706, "grad_norm": 0.5110784147656474, "learning_rate": 6.632695321188966e-05, "loss": 0.4629, "step": 1045 }, { "epoch": 1.7232289950576605, "grad_norm": 0.6124051358297173, "learning_rate": 6.629229356992258e-05, "loss": 0.46, "step": 1046 }, { "epoch": 1.7248764415156508, "grad_norm": 0.688749242395514, "learning_rate": 6.625759913552488e-05, "loss": 0.4617, "step": 1047 }, { "epoch": 1.7265238879736409, "grad_norm": 0.54080344050438, "learning_rate": 6.622286995460747e-05, "loss": 0.4653, "step": 1048 }, { "epoch": 1.728171334431631, "grad_norm": 0.3857167906819182, "learning_rate": 6.618810607312729e-05, "loss": 0.4573, "step": 1049 }, { "epoch": 1.729818780889621, "grad_norm": 0.42114636508343034, "learning_rate": 6.615330753708713e-05, "loss": 0.4618, "step": 1050 }, { "epoch": 1.7314662273476111, "grad_norm": 0.5537932832511983, "learning_rate": 6.611847439253572e-05, "loss": 0.4656, "step": 1051 }, { "epoch": 1.7331136738056014, "grad_norm": 0.6041465426997026, "learning_rate": 6.608360668556753e-05, "loss": 0.4531, "step": 1052 }, { "epoch": 1.7347611202635913, "grad_norm": 0.6565047683077643, "learning_rate": 6.604870446232276e-05, "loss": 0.4683, "step": 1053 }, { "epoch": 1.7364085667215816, "grad_norm": 0.6296860110786352, "learning_rate": 6.601376776898735e-05, "loss": 0.4594, "step": 1054 }, { "epoch": 1.7380560131795717, "grad_norm": 0.6457659424502807, "learning_rate": 6.597879665179278e-05, "loss": 0.4688, "step": 1055 }, { "epoch": 1.7397034596375618, "grad_norm": 0.600080950314993, "learning_rate": 6.594379115701614e-05, "loss": 0.4591, "step": 1056 }, { "epoch": 1.7413509060955519, "grad_norm": 0.4722562614503477, "learning_rate": 6.590875133097995e-05, "loss": 0.4695, "step": 1057 }, { "epoch": 1.742998352553542, "grad_norm": 0.43466854022369844, "learning_rate": 6.587367722005222e-05, "loss": 0.4606, "step": 1058 }, { "epoch": 1.7446457990115323, "grad_norm": 0.40425478933264, "learning_rate": 6.583856887064632e-05, "loss": 0.4644, "step": 1059 }, { "epoch": 1.7462932454695221, "grad_norm": 0.3624525485188207, "learning_rate": 6.580342632922089e-05, "loss": 0.4638, "step": 1060 }, { "epoch": 1.7479406919275124, "grad_norm": 0.26579637039054355, "learning_rate": 6.576824964227985e-05, "loss": 0.4619, "step": 1061 }, { "epoch": 1.7495881383855023, "grad_norm": 0.3061576337294293, "learning_rate": 6.573303885637228e-05, "loss": 0.4584, "step": 1062 }, { "epoch": 1.7512355848434926, "grad_norm": 0.3797285737711541, "learning_rate": 6.569779401809241e-05, "loss": 0.4616, "step": 1063 }, { "epoch": 1.7528830313014827, "grad_norm": 0.43843233589550473, "learning_rate": 6.566251517407953e-05, "loss": 0.4637, "step": 1064 }, { "epoch": 1.7545304777594728, "grad_norm": 0.37230362637519093, "learning_rate": 6.562720237101788e-05, "loss": 0.4608, "step": 1065 }, { "epoch": 1.7561779242174629, "grad_norm": 0.3162829857318855, "learning_rate": 6.55918556556367e-05, "loss": 0.4646, "step": 1066 }, { "epoch": 1.757825370675453, "grad_norm": 0.3512218911414887, "learning_rate": 6.555647507471009e-05, "loss": 0.4618, "step": 1067 }, { "epoch": 1.7594728171334433, "grad_norm": 0.46109386292464505, "learning_rate": 6.552106067505694e-05, "loss": 0.4651, "step": 1068 }, { "epoch": 1.7611202635914331, "grad_norm": 0.5305994166919885, "learning_rate": 6.548561250354092e-05, "loss": 0.4647, "step": 1069 }, { "epoch": 1.7627677100494235, "grad_norm": 0.536796634024583, "learning_rate": 6.545013060707036e-05, "loss": 0.4631, "step": 1070 }, { "epoch": 1.7644151565074135, "grad_norm": 0.5107814884737902, "learning_rate": 6.541461503259825e-05, "loss": 0.4607, "step": 1071 }, { "epoch": 1.7660626029654036, "grad_norm": 0.45438886794174105, "learning_rate": 6.537906582712213e-05, "loss": 0.463, "step": 1072 }, { "epoch": 1.7677100494233937, "grad_norm": 0.3796769547801077, "learning_rate": 6.534348303768404e-05, "loss": 0.4648, "step": 1073 }, { "epoch": 1.7693574958813838, "grad_norm": 0.39009779627899605, "learning_rate": 6.530786671137048e-05, "loss": 0.4513, "step": 1074 }, { "epoch": 1.771004942339374, "grad_norm": 0.4943591588375057, "learning_rate": 6.527221689531229e-05, "loss": 0.4559, "step": 1075 }, { "epoch": 1.772652388797364, "grad_norm": 0.5656360735753063, "learning_rate": 6.523653363668468e-05, "loss": 0.4656, "step": 1076 }, { "epoch": 1.7742998352553543, "grad_norm": 0.6344717802714285, "learning_rate": 6.520081698270707e-05, "loss": 0.4685, "step": 1077 }, { "epoch": 1.7759472817133442, "grad_norm": 0.6651521534878464, "learning_rate": 6.516506698064308e-05, "loss": 0.4594, "step": 1078 }, { "epoch": 1.7775947281713345, "grad_norm": 0.6344322608854728, "learning_rate": 6.512928367780051e-05, "loss": 0.4636, "step": 1079 }, { "epoch": 1.7792421746293245, "grad_norm": 0.5976863647815895, "learning_rate": 6.509346712153113e-05, "loss": 0.4523, "step": 1080 }, { "epoch": 1.7808896210873146, "grad_norm": 0.5093051325030807, "learning_rate": 6.50576173592308e-05, "loss": 0.4655, "step": 1081 }, { "epoch": 1.782537067545305, "grad_norm": 0.40380831620111113, "learning_rate": 6.502173443833931e-05, "loss": 0.4642, "step": 1082 }, { "epoch": 1.7841845140032948, "grad_norm": 0.4714254856027105, "learning_rate": 6.498581840634027e-05, "loss": 0.4633, "step": 1083 }, { "epoch": 1.7858319604612851, "grad_norm": 0.5368328216603431, "learning_rate": 6.494986931076117e-05, "loss": 0.4657, "step": 1084 }, { "epoch": 1.787479406919275, "grad_norm": 0.4701861234520461, "learning_rate": 6.49138871991732e-05, "loss": 0.4638, "step": 1085 }, { "epoch": 1.7891268533772653, "grad_norm": 0.3364906637255847, "learning_rate": 6.48778721191913e-05, "loss": 0.4567, "step": 1086 }, { "epoch": 1.7907742998352554, "grad_norm": 0.3431631655500182, "learning_rate": 6.484182411847398e-05, "loss": 0.4773, "step": 1087 }, { "epoch": 1.7924217462932455, "grad_norm": 0.4475858571349268, "learning_rate": 6.480574324472335e-05, "loss": 0.458, "step": 1088 }, { "epoch": 1.7940691927512356, "grad_norm": 0.5079165252037537, "learning_rate": 6.476962954568501e-05, "loss": 0.455, "step": 1089 }, { "epoch": 1.7957166392092256, "grad_norm": 0.4693346081743462, "learning_rate": 6.473348306914797e-05, "loss": 0.4534, "step": 1090 }, { "epoch": 1.797364085667216, "grad_norm": 0.37240262610363445, "learning_rate": 6.469730386294469e-05, "loss": 0.4642, "step": 1091 }, { "epoch": 1.7990115321252058, "grad_norm": 0.2848157848763826, "learning_rate": 6.466109197495085e-05, "loss": 0.4625, "step": 1092 }, { "epoch": 1.8006589785831961, "grad_norm": 0.3387859123956077, "learning_rate": 6.462484745308545e-05, "loss": 0.4668, "step": 1093 }, { "epoch": 1.8023064250411862, "grad_norm": 0.4928030567710776, "learning_rate": 6.45885703453106e-05, "loss": 0.4545, "step": 1094 }, { "epoch": 1.8039538714991763, "grad_norm": 0.6358016202813764, "learning_rate": 6.45522606996316e-05, "loss": 0.4607, "step": 1095 }, { "epoch": 1.8056013179571664, "grad_norm": 0.6797031843095763, "learning_rate": 6.45159185640968e-05, "loss": 0.4553, "step": 1096 }, { "epoch": 1.8072487644151565, "grad_norm": 0.6758860309352107, "learning_rate": 6.44795439867975e-05, "loss": 0.4565, "step": 1097 }, { "epoch": 1.8088962108731468, "grad_norm": 0.6503130957615556, "learning_rate": 6.444313701586795e-05, "loss": 0.4642, "step": 1098 }, { "epoch": 1.8105436573311366, "grad_norm": 0.5932487728538745, "learning_rate": 6.44066976994853e-05, "loss": 0.4656, "step": 1099 }, { "epoch": 1.812191103789127, "grad_norm": 0.6123417759545811, "learning_rate": 6.437022608586945e-05, "loss": 0.4603, "step": 1100 }, { "epoch": 1.8138385502471168, "grad_norm": 0.7326403746622969, "learning_rate": 6.433372222328306e-05, "loss": 0.4612, "step": 1101 }, { "epoch": 1.8154859967051071, "grad_norm": 0.6210527450157869, "learning_rate": 6.429718616003148e-05, "loss": 0.4604, "step": 1102 }, { "epoch": 1.8171334431630972, "grad_norm": 0.4969713648391882, "learning_rate": 6.426061794446265e-05, "loss": 0.4568, "step": 1103 }, { "epoch": 1.8187808896210873, "grad_norm": 0.5085708471271931, "learning_rate": 6.422401762496707e-05, "loss": 0.4614, "step": 1104 }, { "epoch": 1.8204283360790774, "grad_norm": 0.535815533340169, "learning_rate": 6.418738524997771e-05, "loss": 0.4553, "step": 1105 }, { "epoch": 1.8220757825370675, "grad_norm": 0.568448689924414, "learning_rate": 6.415072086796999e-05, "loss": 0.4604, "step": 1106 }, { "epoch": 1.8237232289950578, "grad_norm": 0.5763049131146702, "learning_rate": 6.411402452746162e-05, "loss": 0.4652, "step": 1107 }, { "epoch": 1.8253706754530477, "grad_norm": 0.5021644494677876, "learning_rate": 6.407729627701269e-05, "loss": 0.4663, "step": 1108 }, { "epoch": 1.827018121911038, "grad_norm": 0.40070751338944777, "learning_rate": 6.404053616522543e-05, "loss": 0.4567, "step": 1109 }, { "epoch": 1.828665568369028, "grad_norm": 0.43981143844191073, "learning_rate": 6.400374424074429e-05, "loss": 0.4654, "step": 1110 }, { "epoch": 1.8303130148270181, "grad_norm": 0.4647953225983057, "learning_rate": 6.396692055225579e-05, "loss": 0.463, "step": 1111 }, { "epoch": 1.8319604612850082, "grad_norm": 0.3976868957746323, "learning_rate": 6.39300651484885e-05, "loss": 0.4605, "step": 1112 }, { "epoch": 1.8336079077429983, "grad_norm": 0.36218364528134855, "learning_rate": 6.389317807821294e-05, "loss": 0.4598, "step": 1113 }, { "epoch": 1.8352553542009886, "grad_norm": 0.32248968146597234, "learning_rate": 6.385625939024154e-05, "loss": 0.4584, "step": 1114 }, { "epoch": 1.8369028006589785, "grad_norm": 0.36643981777816925, "learning_rate": 6.381930913342858e-05, "loss": 0.4591, "step": 1115 }, { "epoch": 1.8385502471169688, "grad_norm": 0.43584337453515914, "learning_rate": 6.378232735667013e-05, "loss": 0.4575, "step": 1116 }, { "epoch": 1.8401976935749587, "grad_norm": 0.40858004990265784, "learning_rate": 6.37453141089039e-05, "loss": 0.4541, "step": 1117 }, { "epoch": 1.841845140032949, "grad_norm": 0.40212930141258474, "learning_rate": 6.370826943910934e-05, "loss": 0.4626, "step": 1118 }, { "epoch": 1.843492586490939, "grad_norm": 0.4715351816444081, "learning_rate": 6.367119339630739e-05, "loss": 0.4533, "step": 1119 }, { "epoch": 1.8451400329489291, "grad_norm": 0.5002369895051356, "learning_rate": 6.363408602956059e-05, "loss": 0.4733, "step": 1120 }, { "epoch": 1.8467874794069192, "grad_norm": 0.4728023309632516, "learning_rate": 6.359694738797286e-05, "loss": 0.4632, "step": 1121 }, { "epoch": 1.8484349258649093, "grad_norm": 0.5411728633616163, "learning_rate": 6.355977752068956e-05, "loss": 0.4519, "step": 1122 }, { "epoch": 1.8500823723228996, "grad_norm": 0.6135398321590909, "learning_rate": 6.352257647689735e-05, "loss": 0.463, "step": 1123 }, { "epoch": 1.8517298187808895, "grad_norm": 0.6754719735150364, "learning_rate": 6.34853443058241e-05, "loss": 0.4569, "step": 1124 }, { "epoch": 1.8533772652388798, "grad_norm": 0.6926199679186456, "learning_rate": 6.344808105673898e-05, "loss": 0.4627, "step": 1125 }, { "epoch": 1.8550247116968699, "grad_norm": 0.6276731177359077, "learning_rate": 6.341078677895216e-05, "loss": 0.459, "step": 1126 }, { "epoch": 1.85667215815486, "grad_norm": 0.4284831800150335, "learning_rate": 6.337346152181495e-05, "loss": 0.4688, "step": 1127 }, { "epoch": 1.85831960461285, "grad_norm": 0.2924524180129266, "learning_rate": 6.333610533471964e-05, "loss": 0.4589, "step": 1128 }, { "epoch": 1.8599670510708401, "grad_norm": 0.26893717454423377, "learning_rate": 6.329871826709943e-05, "loss": 0.4551, "step": 1129 }, { "epoch": 1.8616144975288305, "grad_norm": 0.3533982060132202, "learning_rate": 6.326130036842842e-05, "loss": 0.457, "step": 1130 }, { "epoch": 1.8632619439868203, "grad_norm": 0.49089774754889676, "learning_rate": 6.322385168822145e-05, "loss": 0.4694, "step": 1131 }, { "epoch": 1.8649093904448106, "grad_norm": 0.5562015446168748, "learning_rate": 6.318637227603417e-05, "loss": 0.4531, "step": 1132 }, { "epoch": 1.8665568369028005, "grad_norm": 0.5797398885484322, "learning_rate": 6.314886218146282e-05, "loss": 0.4566, "step": 1133 }, { "epoch": 1.8682042833607908, "grad_norm": 0.5011032267896284, "learning_rate": 6.31113214541443e-05, "loss": 0.4638, "step": 1134 }, { "epoch": 1.869851729818781, "grad_norm": 0.4223425798883273, "learning_rate": 6.3073750143756e-05, "loss": 0.4657, "step": 1135 }, { "epoch": 1.871499176276771, "grad_norm": 0.390392986142629, "learning_rate": 6.303614830001585e-05, "loss": 0.457, "step": 1136 }, { "epoch": 1.873146622734761, "grad_norm": 0.37734895421047265, "learning_rate": 6.299851597268208e-05, "loss": 0.459, "step": 1137 }, { "epoch": 1.8747940691927512, "grad_norm": 0.4812313532764299, "learning_rate": 6.296085321155335e-05, "loss": 0.4612, "step": 1138 }, { "epoch": 1.8764415156507415, "grad_norm": 0.5283895292613409, "learning_rate": 6.292316006646858e-05, "loss": 0.4615, "step": 1139 }, { "epoch": 1.8780889621087313, "grad_norm": 0.6325771868715735, "learning_rate": 6.288543658730684e-05, "loss": 0.4651, "step": 1140 }, { "epoch": 1.8797364085667216, "grad_norm": 0.6392316807813232, "learning_rate": 6.28476828239874e-05, "loss": 0.4633, "step": 1141 }, { "epoch": 1.8813838550247117, "grad_norm": 0.5650929497245265, "learning_rate": 6.280989882646957e-05, "loss": 0.4599, "step": 1142 }, { "epoch": 1.8830313014827018, "grad_norm": 0.5236986287366302, "learning_rate": 6.277208464475269e-05, "loss": 0.4521, "step": 1143 }, { "epoch": 1.884678747940692, "grad_norm": 0.437281332054022, "learning_rate": 6.273424032887603e-05, "loss": 0.4486, "step": 1144 }, { "epoch": 1.886326194398682, "grad_norm": 0.42416173489227416, "learning_rate": 6.269636592891877e-05, "loss": 0.4537, "step": 1145 }, { "epoch": 1.8879736408566723, "grad_norm": 0.4566638159803165, "learning_rate": 6.265846149499982e-05, "loss": 0.4645, "step": 1146 }, { "epoch": 1.8896210873146622, "grad_norm": 0.47334079033481824, "learning_rate": 6.262052707727791e-05, "loss": 0.4669, "step": 1147 }, { "epoch": 1.8912685337726525, "grad_norm": 0.4522167272084299, "learning_rate": 6.258256272595143e-05, "loss": 0.4521, "step": 1148 }, { "epoch": 1.8929159802306426, "grad_norm": 0.33683533410204286, "learning_rate": 6.254456849125836e-05, "loss": 0.4572, "step": 1149 }, { "epoch": 1.8945634266886326, "grad_norm": 0.3566153617862311, "learning_rate": 6.250654442347625e-05, "loss": 0.4557, "step": 1150 }, { "epoch": 1.8962108731466227, "grad_norm": 0.41430080817194254, "learning_rate": 6.246849057292209e-05, "loss": 0.4623, "step": 1151 }, { "epoch": 1.8978583196046128, "grad_norm": 0.44104647842716704, "learning_rate": 6.243040698995233e-05, "loss": 0.4651, "step": 1152 }, { "epoch": 1.8995057660626031, "grad_norm": 0.35786655490065455, "learning_rate": 6.239229372496274e-05, "loss": 0.4568, "step": 1153 }, { "epoch": 1.901153212520593, "grad_norm": 0.341077884418105, "learning_rate": 6.235415082838836e-05, "loss": 0.4583, "step": 1154 }, { "epoch": 1.9028006589785833, "grad_norm": 0.35230183830875683, "learning_rate": 6.231597835070346e-05, "loss": 0.46, "step": 1155 }, { "epoch": 1.9044481054365732, "grad_norm": 0.3896990323290054, "learning_rate": 6.227777634242145e-05, "loss": 0.4694, "step": 1156 }, { "epoch": 1.9060955518945635, "grad_norm": 0.41499281656093306, "learning_rate": 6.22395448540948e-05, "loss": 0.4631, "step": 1157 }, { "epoch": 1.9077429983525536, "grad_norm": 0.393953619385022, "learning_rate": 6.2201283936315e-05, "loss": 0.4577, "step": 1158 }, { "epoch": 1.9093904448105437, "grad_norm": 0.43780555008951155, "learning_rate": 6.21629936397125e-05, "loss": 0.4664, "step": 1159 }, { "epoch": 1.9110378912685337, "grad_norm": 0.4680869367663529, "learning_rate": 6.21246740149566e-05, "loss": 0.4508, "step": 1160 }, { "epoch": 1.9126853377265238, "grad_norm": 0.4421336714360952, "learning_rate": 6.208632511275544e-05, "loss": 0.4606, "step": 1161 }, { "epoch": 1.9143327841845141, "grad_norm": 0.43033279983531186, "learning_rate": 6.20479469838559e-05, "loss": 0.4609, "step": 1162 }, { "epoch": 1.915980230642504, "grad_norm": 0.3638575818256503, "learning_rate": 6.200953967904347e-05, "loss": 0.4611, "step": 1163 }, { "epoch": 1.9176276771004943, "grad_norm": 5.341060026003334, "learning_rate": 6.197110324914236e-05, "loss": 0.4687, "step": 1164 }, { "epoch": 1.9192751235584844, "grad_norm": 0.7153728213061545, "learning_rate": 6.193263774501523e-05, "loss": 0.4705, "step": 1165 }, { "epoch": 1.9209225700164745, "grad_norm": 1.2814143128215754, "learning_rate": 6.189414321756324e-05, "loss": 0.4728, "step": 1166 }, { "epoch": 1.9225700164744646, "grad_norm": 0.6011356822701441, "learning_rate": 6.185561971772598e-05, "loss": 0.4596, "step": 1167 }, { "epoch": 1.9242174629324547, "grad_norm": 1.0331251364642149, "learning_rate": 6.181706729648136e-05, "loss": 0.4706, "step": 1168 }, { "epoch": 1.925864909390445, "grad_norm": 0.9852269441191599, "learning_rate": 6.177848600484554e-05, "loss": 0.4606, "step": 1169 }, { "epoch": 1.9275123558484348, "grad_norm": 0.8164633757627832, "learning_rate": 6.173987589387293e-05, "loss": 0.4533, "step": 1170 }, { "epoch": 1.9291598023064251, "grad_norm": 0.771955297442939, "learning_rate": 6.170123701465604e-05, "loss": 0.4735, "step": 1171 }, { "epoch": 1.930807248764415, "grad_norm": 0.7248756826274092, "learning_rate": 6.166256941832549e-05, "loss": 0.4643, "step": 1172 }, { "epoch": 1.9324546952224053, "grad_norm": 2.1124669648891623, "learning_rate": 6.162387315604984e-05, "loss": 0.4668, "step": 1173 }, { "epoch": 1.9341021416803954, "grad_norm": 0.7202443974335938, "learning_rate": 6.15851482790356e-05, "loss": 0.4646, "step": 1174 }, { "epoch": 1.9357495881383855, "grad_norm": 1.2469345630457571, "learning_rate": 6.15463948385272e-05, "loss": 0.4719, "step": 1175 }, { "epoch": 1.9373970345963756, "grad_norm": 1.058847872369442, "learning_rate": 6.150761288580682e-05, "loss": 0.4684, "step": 1176 }, { "epoch": 1.9390444810543657, "grad_norm": 1.4521427120074863, "learning_rate": 6.14688024721944e-05, "loss": 0.4734, "step": 1177 }, { "epoch": 1.940691927512356, "grad_norm": 0.9122996248270591, "learning_rate": 6.142996364904746e-05, "loss": 0.4942, "step": 1178 }, { "epoch": 1.9423393739703458, "grad_norm": 1.7567313716721216, "learning_rate": 6.139109646776124e-05, "loss": 0.4971, "step": 1179 }, { "epoch": 1.9439868204283361, "grad_norm": 1.2202691720078427, "learning_rate": 6.135220097976842e-05, "loss": 0.4951, "step": 1180 }, { "epoch": 1.9456342668863262, "grad_norm": 1.1540239377335224, "learning_rate": 6.131327723653915e-05, "loss": 0.4738, "step": 1181 }, { "epoch": 1.9472817133443163, "grad_norm": 1.1023679744860668, "learning_rate": 6.1274325289581e-05, "loss": 0.4855, "step": 1182 }, { "epoch": 1.9489291598023064, "grad_norm": 0.9754759722593211, "learning_rate": 6.123534519043879e-05, "loss": 0.4796, "step": 1183 }, { "epoch": 1.9505766062602965, "grad_norm": 298.9386484907432, "learning_rate": 6.119633699069474e-05, "loss": 4.8964, "step": 1184 }, { "epoch": 1.9522240527182868, "grad_norm": 2.813924852454544, "learning_rate": 6.115730074196807e-05, "loss": 0.522, "step": 1185 }, { "epoch": 1.9538714991762767, "grad_norm": 1.6784851946582835, "learning_rate": 6.111823649591527e-05, "loss": 0.5066, "step": 1186 }, { "epoch": 1.955518945634267, "grad_norm": 65.88734278433262, "learning_rate": 6.107914430422978e-05, "loss": 0.8596, "step": 1187 }, { "epoch": 1.9571663920922568, "grad_norm": 4.818088849577162, "learning_rate": 6.104002421864206e-05, "loss": 0.6794, "step": 1188 }, { "epoch": 1.9588138385502472, "grad_norm": 23.452779607749623, "learning_rate": 6.1000876290919505e-05, "loss": 0.836, "step": 1189 }, { "epoch": 1.9604612850082372, "grad_norm": 2.83352131847011, "learning_rate": 6.096170057286631e-05, "loss": 0.6001, "step": 1190 }, { "epoch": 1.9621087314662273, "grad_norm": 9.61064275547527, "learning_rate": 6.092249711632347e-05, "loss": 0.5657, "step": 1191 }, { "epoch": 1.9637561779242174, "grad_norm": 1.4590261855883657, "learning_rate": 6.088326597316865e-05, "loss": 0.5592, "step": 1192 }, { "epoch": 1.9654036243822075, "grad_norm": 46.822757001886096, "learning_rate": 6.084400719531623e-05, "loss": 0.6291, "step": 1193 }, { "epoch": 1.9670510708401978, "grad_norm": 53.65097958781059, "learning_rate": 6.080472083471707e-05, "loss": 1.0753, "step": 1194 }, { "epoch": 1.9686985172981877, "grad_norm": 44.76486883499205, "learning_rate": 6.076540694335857e-05, "loss": 0.7804, "step": 1195 }, { "epoch": 1.970345963756178, "grad_norm": 2.2041652173473136, "learning_rate": 6.07260655732646e-05, "loss": 0.6637, "step": 1196 }, { "epoch": 1.971993410214168, "grad_norm": 2.6609036734434524, "learning_rate": 6.0686696776495295e-05, "loss": 0.6204, "step": 1197 }, { "epoch": 1.9736408566721582, "grad_norm": 4.836686723476092, "learning_rate": 6.0647300605147196e-05, "loss": 0.7609, "step": 1198 }, { "epoch": 1.9752883031301482, "grad_norm": 17.605101507194362, "learning_rate": 6.060787711135299e-05, "loss": 1.0992, "step": 1199 }, { "epoch": 1.9769357495881383, "grad_norm": 4.200370909802659, "learning_rate": 6.056842634728155e-05, "loss": 0.7512, "step": 1200 }, { "epoch": 1.9785831960461286, "grad_norm": 4.147788046562492, "learning_rate": 6.052894836513782e-05, "loss": 0.7148, "step": 1201 }, { "epoch": 1.9802306425041185, "grad_norm": 2.3346976481259905, "learning_rate": 6.048944321716276e-05, "loss": 0.6517, "step": 1202 }, { "epoch": 1.9818780889621088, "grad_norm": 1.8413025589482208, "learning_rate": 6.044991095563333e-05, "loss": 0.6547, "step": 1203 }, { "epoch": 1.9835255354200987, "grad_norm": 1.2635512709809267, "learning_rate": 6.0410351632862306e-05, "loss": 0.5809, "step": 1204 }, { "epoch": 1.985172981878089, "grad_norm": 1.6605974414262472, "learning_rate": 6.0370765301198295e-05, "loss": 0.5867, "step": 1205 }, { "epoch": 1.986820428336079, "grad_norm": 0.8618227675264495, "learning_rate": 6.033115201302565e-05, "loss": 0.553, "step": 1206 }, { "epoch": 1.9884678747940692, "grad_norm": 1.7932965503630711, "learning_rate": 6.029151182076438e-05, "loss": 0.5472, "step": 1207 }, { "epoch": 1.9901153212520593, "grad_norm": 1.0382639594063565, "learning_rate": 6.025184477687014e-05, "loss": 0.5267, "step": 1208 }, { "epoch": 1.9917627677100493, "grad_norm": 2.1602763186137914, "learning_rate": 6.021215093383405e-05, "loss": 0.5268, "step": 1209 }, { "epoch": 1.9934102141680397, "grad_norm": 1.8461546313003345, "learning_rate": 6.017243034418274e-05, "loss": 0.532, "step": 1210 }, { "epoch": 1.9950576606260295, "grad_norm": 1.0205980241691461, "learning_rate": 6.013268306047822e-05, "loss": 0.5178, "step": 1211 }, { "epoch": 1.9967051070840198, "grad_norm": 0.9665884599604444, "learning_rate": 6.009290913531785e-05, "loss": 0.5178, "step": 1212 }, { "epoch": 1.99835255354201, "grad_norm": 0.8378553853647274, "learning_rate": 6.005310862133419e-05, "loss": 0.5079, "step": 1213 }, { "epoch": 2.0, "grad_norm": 0.7930201671879882, "learning_rate": 6.001328157119504e-05, "loss": 0.4896, "step": 1214 }, { "epoch": 2.0016474464579903, "grad_norm": 0.6940880950067679, "learning_rate": 5.9973428037603276e-05, "loss": 0.4759, "step": 1215 }, { "epoch": 2.00329489291598, "grad_norm": 0.6339133067306166, "learning_rate": 5.993354807329683e-05, "loss": 0.4778, "step": 1216 }, { "epoch": 2.0049423393739705, "grad_norm": 0.6564248949380417, "learning_rate": 5.9893641731048635e-05, "loss": 0.4679, "step": 1217 }, { "epoch": 2.0065897858319603, "grad_norm": 0.6738264097059204, "learning_rate": 5.98537090636665e-05, "loss": 0.4712, "step": 1218 }, { "epoch": 2.0082372322899507, "grad_norm": 0.587118023561725, "learning_rate": 5.981375012399305e-05, "loss": 0.4725, "step": 1219 }, { "epoch": 2.0098846787479405, "grad_norm": 0.6372348773538791, "learning_rate": 5.977376496490574e-05, "loss": 0.464, "step": 1220 }, { "epoch": 2.011532125205931, "grad_norm": 0.48123883425705216, "learning_rate": 5.973375363931668e-05, "loss": 0.4597, "step": 1221 }, { "epoch": 2.013179571663921, "grad_norm": 0.42420620187620195, "learning_rate": 5.96937162001726e-05, "loss": 0.4541, "step": 1222 }, { "epoch": 2.014827018121911, "grad_norm": 0.5013054988817637, "learning_rate": 5.965365270045481e-05, "loss": 0.4614, "step": 1223 }, { "epoch": 2.0164744645799013, "grad_norm": 0.38691478262484424, "learning_rate": 5.961356319317907e-05, "loss": 0.4625, "step": 1224 }, { "epoch": 2.018121911037891, "grad_norm": 0.40173721483166186, "learning_rate": 5.9573447731395606e-05, "loss": 0.4563, "step": 1225 }, { "epoch": 2.0197693574958815, "grad_norm": 0.4109236197102767, "learning_rate": 5.953330636818895e-05, "loss": 0.4533, "step": 1226 }, { "epoch": 2.0214168039538714, "grad_norm": 0.3630304208837497, "learning_rate": 5.949313915667793e-05, "loss": 0.4494, "step": 1227 }, { "epoch": 2.0230642504118617, "grad_norm": 0.36175560217105784, "learning_rate": 5.945294615001555e-05, "loss": 0.4484, "step": 1228 }, { "epoch": 2.0247116968698515, "grad_norm": 0.3616299083939483, "learning_rate": 5.941272740138899e-05, "loss": 0.4462, "step": 1229 }, { "epoch": 2.026359143327842, "grad_norm": 0.3536106854220306, "learning_rate": 5.9372482964019465e-05, "loss": 0.4464, "step": 1230 }, { "epoch": 2.028006589785832, "grad_norm": 0.31191747001381825, "learning_rate": 5.933221289116219e-05, "loss": 0.4451, "step": 1231 }, { "epoch": 2.029654036243822, "grad_norm": 0.27914258380598583, "learning_rate": 5.929191723610631e-05, "loss": 0.4457, "step": 1232 }, { "epoch": 2.0313014827018123, "grad_norm": 0.2809009560765289, "learning_rate": 5.9251596052174806e-05, "loss": 0.4491, "step": 1233 }, { "epoch": 2.032948929159802, "grad_norm": 0.3712067047227602, "learning_rate": 5.921124939272447e-05, "loss": 0.4412, "step": 1234 }, { "epoch": 2.0345963756177925, "grad_norm": 0.2800440974117476, "learning_rate": 5.917087731114578e-05, "loss": 0.4486, "step": 1235 }, { "epoch": 2.0362438220757824, "grad_norm": 0.2686619022672477, "learning_rate": 5.9130479860862856e-05, "loss": 0.4471, "step": 1236 }, { "epoch": 2.0378912685337727, "grad_norm": 0.2746760509142176, "learning_rate": 5.909005709533342e-05, "loss": 0.4412, "step": 1237 }, { "epoch": 2.039538714991763, "grad_norm": 0.26621863224663056, "learning_rate": 5.904960906804864e-05, "loss": 0.4422, "step": 1238 }, { "epoch": 2.041186161449753, "grad_norm": 0.3015524784652707, "learning_rate": 5.900913583253315e-05, "loss": 0.4507, "step": 1239 }, { "epoch": 2.042833607907743, "grad_norm": 0.29687815884783914, "learning_rate": 5.896863744234496e-05, "loss": 0.4419, "step": 1240 }, { "epoch": 2.044481054365733, "grad_norm": 0.25482911812171233, "learning_rate": 5.89281139510753e-05, "loss": 0.4408, "step": 1241 }, { "epoch": 2.0461285008237233, "grad_norm": 0.34737193678858397, "learning_rate": 5.8887565412348696e-05, "loss": 0.4361, "step": 1242 }, { "epoch": 2.047775947281713, "grad_norm": 0.2795329053053698, "learning_rate": 5.884699187982275e-05, "loss": 0.4466, "step": 1243 }, { "epoch": 2.0494233937397035, "grad_norm": 0.2773081480658366, "learning_rate": 5.8806393407188186e-05, "loss": 0.4578, "step": 1244 }, { "epoch": 2.0510708401976934, "grad_norm": 0.2736033210840046, "learning_rate": 5.8765770048168714e-05, "loss": 0.4394, "step": 1245 }, { "epoch": 2.0527182866556837, "grad_norm": 0.21039221671477057, "learning_rate": 5.872512185652095e-05, "loss": 0.4442, "step": 1246 }, { "epoch": 2.054365733113674, "grad_norm": 0.2547796245419437, "learning_rate": 5.868444888603444e-05, "loss": 0.451, "step": 1247 }, { "epoch": 2.056013179571664, "grad_norm": 0.2867448307289201, "learning_rate": 5.864375119053144e-05, "loss": 0.4408, "step": 1248 }, { "epoch": 2.057660626029654, "grad_norm": 0.23168143678120395, "learning_rate": 5.8603028823866967e-05, "loss": 0.4415, "step": 1249 }, { "epoch": 2.059308072487644, "grad_norm": 0.27868976508655946, "learning_rate": 5.8562281839928694e-05, "loss": 0.4392, "step": 1250 }, { "epoch": 2.0609555189456343, "grad_norm": 0.36263917842588345, "learning_rate": 5.8521510292636845e-05, "loss": 0.4367, "step": 1251 }, { "epoch": 2.062602965403624, "grad_norm": 0.2879475572655201, "learning_rate": 5.848071423594416e-05, "loss": 0.4362, "step": 1252 }, { "epoch": 2.0642504118616145, "grad_norm": 0.23621287780274489, "learning_rate": 5.843989372383579e-05, "loss": 0.4481, "step": 1253 }, { "epoch": 2.065897858319605, "grad_norm": 0.251803613499904, "learning_rate": 5.8399048810329315e-05, "loss": 0.4409, "step": 1254 }, { "epoch": 2.0675453047775947, "grad_norm": 0.2513993040621663, "learning_rate": 5.835817954947452e-05, "loss": 0.4445, "step": 1255 }, { "epoch": 2.069192751235585, "grad_norm": 0.24220056092321607, "learning_rate": 5.831728599535345e-05, "loss": 0.4391, "step": 1256 }, { "epoch": 2.070840197693575, "grad_norm": 0.26325214752398585, "learning_rate": 5.82763682020803e-05, "loss": 0.4322, "step": 1257 }, { "epoch": 2.072487644151565, "grad_norm": 0.2565645703859962, "learning_rate": 5.823542622380134e-05, "loss": 0.4413, "step": 1258 }, { "epoch": 2.074135090609555, "grad_norm": 0.26050927510888156, "learning_rate": 5.819446011469483e-05, "loss": 0.4434, "step": 1259 }, { "epoch": 2.0757825370675453, "grad_norm": 0.2600194911456552, "learning_rate": 5.815346992897097e-05, "loss": 0.4261, "step": 1260 }, { "epoch": 2.077429983525535, "grad_norm": 0.23304751179234862, "learning_rate": 5.811245572087184e-05, "loss": 0.4404, "step": 1261 }, { "epoch": 2.0790774299835255, "grad_norm": 0.29406885132500654, "learning_rate": 5.807141754467127e-05, "loss": 0.4417, "step": 1262 }, { "epoch": 2.080724876441516, "grad_norm": 0.2748869593631865, "learning_rate": 5.803035545467483e-05, "loss": 0.4355, "step": 1263 }, { "epoch": 2.0823723228995057, "grad_norm": 0.2557565005846712, "learning_rate": 5.798926950521973e-05, "loss": 0.4409, "step": 1264 }, { "epoch": 2.084019769357496, "grad_norm": 0.331501315989901, "learning_rate": 5.794815975067476e-05, "loss": 0.4351, "step": 1265 }, { "epoch": 2.085667215815486, "grad_norm": 0.3481084805007761, "learning_rate": 5.790702624544022e-05, "loss": 0.444, "step": 1266 }, { "epoch": 2.087314662273476, "grad_norm": 0.3353088217634171, "learning_rate": 5.7865869043947796e-05, "loss": 0.4412, "step": 1267 }, { "epoch": 2.088962108731466, "grad_norm": 0.3848967780224068, "learning_rate": 5.782468820066056e-05, "loss": 0.4486, "step": 1268 }, { "epoch": 2.0906095551894563, "grad_norm": 0.37627854583315257, "learning_rate": 5.77834837700729e-05, "loss": 0.429, "step": 1269 }, { "epoch": 2.0922570016474467, "grad_norm": 0.2772731606825006, "learning_rate": 5.774225580671036e-05, "loss": 0.4358, "step": 1270 }, { "epoch": 2.0939044481054365, "grad_norm": 0.3192149338923696, "learning_rate": 5.770100436512966e-05, "loss": 0.433, "step": 1271 }, { "epoch": 2.095551894563427, "grad_norm": 0.3154089040081257, "learning_rate": 5.765972949991858e-05, "loss": 0.4448, "step": 1272 }, { "epoch": 2.0971993410214167, "grad_norm": 0.2600793160388885, "learning_rate": 5.761843126569589e-05, "loss": 0.4383, "step": 1273 }, { "epoch": 2.098846787479407, "grad_norm": 0.3242602874912255, "learning_rate": 5.757710971711129e-05, "loss": 0.432, "step": 1274 }, { "epoch": 2.100494233937397, "grad_norm": 0.380411012788977, "learning_rate": 5.753576490884534e-05, "loss": 0.4419, "step": 1275 }, { "epoch": 2.102141680395387, "grad_norm": 0.4113205525942596, "learning_rate": 5.7494396895609364e-05, "loss": 0.4338, "step": 1276 }, { "epoch": 2.1037891268533775, "grad_norm": 0.3978284289095878, "learning_rate": 5.74530057321454e-05, "loss": 0.4396, "step": 1277 }, { "epoch": 2.1054365733113674, "grad_norm": 0.31881490927173173, "learning_rate": 5.7411591473226127e-05, "loss": 0.4411, "step": 1278 }, { "epoch": 2.1070840197693577, "grad_norm": 0.3126116884419322, "learning_rate": 5.7370154173654774e-05, "loss": 0.4332, "step": 1279 }, { "epoch": 2.1087314662273475, "grad_norm": 0.2669803256819363, "learning_rate": 5.732869388826507e-05, "loss": 0.4342, "step": 1280 }, { "epoch": 2.110378912685338, "grad_norm": 0.272039212503499, "learning_rate": 5.728721067192114e-05, "loss": 0.4341, "step": 1281 }, { "epoch": 2.1120263591433277, "grad_norm": 0.35937474563784844, "learning_rate": 5.724570457951748e-05, "loss": 0.4375, "step": 1282 }, { "epoch": 2.113673805601318, "grad_norm": 0.37017779202200096, "learning_rate": 5.720417566597886e-05, "loss": 0.4406, "step": 1283 }, { "epoch": 2.115321252059308, "grad_norm": 0.34799698809475577, "learning_rate": 5.716262398626022e-05, "loss": 0.436, "step": 1284 }, { "epoch": 2.116968698517298, "grad_norm": 0.37449680782544453, "learning_rate": 5.7121049595346646e-05, "loss": 0.4366, "step": 1285 }, { "epoch": 2.1186161449752885, "grad_norm": 0.3319391679001124, "learning_rate": 5.707945254825328e-05, "loss": 0.4224, "step": 1286 }, { "epoch": 2.1202635914332784, "grad_norm": 0.2020844971100753, "learning_rate": 5.7037832900025225e-05, "loss": 0.4378, "step": 1287 }, { "epoch": 2.1219110378912687, "grad_norm": 0.25086755444631526, "learning_rate": 5.699619070573752e-05, "loss": 0.4475, "step": 1288 }, { "epoch": 2.1235584843492585, "grad_norm": 0.3252615892789254, "learning_rate": 5.695452602049503e-05, "loss": 0.4351, "step": 1289 }, { "epoch": 2.125205930807249, "grad_norm": 0.3536307699954434, "learning_rate": 5.6912838899432356e-05, "loss": 0.4326, "step": 1290 }, { "epoch": 2.1268533772652387, "grad_norm": 0.36810456782381856, "learning_rate": 5.687112939771382e-05, "loss": 0.4338, "step": 1291 }, { "epoch": 2.128500823723229, "grad_norm": 0.3565891788351271, "learning_rate": 5.682939757053335e-05, "loss": 0.4441, "step": 1292 }, { "epoch": 2.130148270181219, "grad_norm": 0.367518298796143, "learning_rate": 5.678764347311442e-05, "loss": 0.4378, "step": 1293 }, { "epoch": 2.131795716639209, "grad_norm": 0.2974713630367373, "learning_rate": 5.674586716070997e-05, "loss": 0.4269, "step": 1294 }, { "epoch": 2.1334431630971995, "grad_norm": 0.30685287257490124, "learning_rate": 5.670406868860234e-05, "loss": 0.4284, "step": 1295 }, { "epoch": 2.1350906095551894, "grad_norm": 0.23886377641350717, "learning_rate": 5.666224811210318e-05, "loss": 0.4365, "step": 1296 }, { "epoch": 2.1367380560131797, "grad_norm": 0.22160343968631033, "learning_rate": 5.6620405486553416e-05, "loss": 0.4275, "step": 1297 }, { "epoch": 2.1383855024711695, "grad_norm": 0.2145155749773439, "learning_rate": 5.6578540867323117e-05, "loss": 0.4346, "step": 1298 }, { "epoch": 2.14003294892916, "grad_norm": 0.25273609638862593, "learning_rate": 5.65366543098115e-05, "loss": 0.4444, "step": 1299 }, { "epoch": 2.1416803953871497, "grad_norm": 0.23262308614557764, "learning_rate": 5.649474586944678e-05, "loss": 0.434, "step": 1300 }, { "epoch": 2.14332784184514, "grad_norm": 0.24278394749194424, "learning_rate": 5.645281560168613e-05, "loss": 0.434, "step": 1301 }, { "epoch": 2.1449752883031303, "grad_norm": 0.2603764179660713, "learning_rate": 5.641086356201563e-05, "loss": 0.4384, "step": 1302 }, { "epoch": 2.14662273476112, "grad_norm": 0.28518659508471206, "learning_rate": 5.636888980595015e-05, "loss": 0.4337, "step": 1303 }, { "epoch": 2.1482701812191105, "grad_norm": 0.29853820105208584, "learning_rate": 5.6326894389033296e-05, "loss": 0.4273, "step": 1304 }, { "epoch": 2.1499176276771004, "grad_norm": 0.33802318002457116, "learning_rate": 5.628487736683736e-05, "loss": 0.4318, "step": 1305 }, { "epoch": 2.1515650741350907, "grad_norm": 0.2681343490367457, "learning_rate": 5.624283879496321e-05, "loss": 0.4276, "step": 1306 }, { "epoch": 2.1532125205930805, "grad_norm": 0.20261278557553228, "learning_rate": 5.620077872904022e-05, "loss": 0.4343, "step": 1307 }, { "epoch": 2.154859967051071, "grad_norm": 0.2397788169755858, "learning_rate": 5.615869722472621e-05, "loss": 0.4367, "step": 1308 }, { "epoch": 2.156507413509061, "grad_norm": 0.30583215888030735, "learning_rate": 5.611659433770738e-05, "loss": 0.4411, "step": 1309 }, { "epoch": 2.158154859967051, "grad_norm": 0.26913308543243747, "learning_rate": 5.607447012369825e-05, "loss": 0.4358, "step": 1310 }, { "epoch": 2.1598023064250413, "grad_norm": 0.2237427163521568, "learning_rate": 5.60323246384415e-05, "loss": 0.434, "step": 1311 }, { "epoch": 2.161449752883031, "grad_norm": 0.28141665779291514, "learning_rate": 5.5990157937708e-05, "loss": 0.4359, "step": 1312 }, { "epoch": 2.1630971993410215, "grad_norm": 0.23717100555684975, "learning_rate": 5.594797007729671e-05, "loss": 0.4284, "step": 1313 }, { "epoch": 2.1647446457990114, "grad_norm": 0.21789455721860423, "learning_rate": 5.590576111303453e-05, "loss": 0.4354, "step": 1314 }, { "epoch": 2.1663920922570017, "grad_norm": 0.2812299456346028, "learning_rate": 5.586353110077634e-05, "loss": 0.4395, "step": 1315 }, { "epoch": 2.168039538714992, "grad_norm": 0.29549216058037575, "learning_rate": 5.582128009640485e-05, "loss": 0.4336, "step": 1316 }, { "epoch": 2.169686985172982, "grad_norm": 0.266252691804676, "learning_rate": 5.577900815583057e-05, "loss": 0.4331, "step": 1317 }, { "epoch": 2.171334431630972, "grad_norm": 0.2643117477110311, "learning_rate": 5.573671533499169e-05, "loss": 0.4356, "step": 1318 }, { "epoch": 2.172981878088962, "grad_norm": 0.28571515280973336, "learning_rate": 5.569440168985402e-05, "loss": 0.4316, "step": 1319 }, { "epoch": 2.1746293245469523, "grad_norm": 0.23200148739948845, "learning_rate": 5.5652067276411e-05, "loss": 0.4352, "step": 1320 }, { "epoch": 2.176276771004942, "grad_norm": 0.31408658211511564, "learning_rate": 5.560971215068344e-05, "loss": 0.4377, "step": 1321 }, { "epoch": 2.1779242174629325, "grad_norm": 0.2840096105073355, "learning_rate": 5.5567336368719647e-05, "loss": 0.4387, "step": 1322 }, { "epoch": 2.1795716639209224, "grad_norm": 0.24503572385755099, "learning_rate": 5.552493998659524e-05, "loss": 0.4403, "step": 1323 }, { "epoch": 2.1812191103789127, "grad_norm": 0.2452039718607964, "learning_rate": 5.548252306041307e-05, "loss": 0.444, "step": 1324 }, { "epoch": 2.182866556836903, "grad_norm": 0.2730050572969265, "learning_rate": 5.544008564630321e-05, "loss": 0.4358, "step": 1325 }, { "epoch": 2.184514003294893, "grad_norm": 0.260619738941376, "learning_rate": 5.5397627800422825e-05, "loss": 0.4448, "step": 1326 }, { "epoch": 2.186161449752883, "grad_norm": 0.27135704990400167, "learning_rate": 5.535514957895612e-05, "loss": 0.4407, "step": 1327 }, { "epoch": 2.187808896210873, "grad_norm": 0.31166536877238465, "learning_rate": 5.531265103811427e-05, "loss": 0.4385, "step": 1328 }, { "epoch": 2.1894563426688634, "grad_norm": 0.32911425820761897, "learning_rate": 5.527013223413532e-05, "loss": 0.4443, "step": 1329 }, { "epoch": 2.191103789126853, "grad_norm": 0.3031418738966255, "learning_rate": 5.522759322328417e-05, "loss": 0.4328, "step": 1330 }, { "epoch": 2.1927512355848435, "grad_norm": 0.23365309544049181, "learning_rate": 5.518503406185239e-05, "loss": 0.4343, "step": 1331 }, { "epoch": 2.1943986820428334, "grad_norm": 0.31308656605983043, "learning_rate": 5.51424548061583e-05, "loss": 0.431, "step": 1332 }, { "epoch": 2.1960461285008237, "grad_norm": 0.31467901248065755, "learning_rate": 5.5099855512546746e-05, "loss": 0.4326, "step": 1333 }, { "epoch": 2.197693574958814, "grad_norm": 0.30076462541752197, "learning_rate": 5.5057236237389105e-05, "loss": 0.4371, "step": 1334 }, { "epoch": 2.199341021416804, "grad_norm": 0.3464867212520395, "learning_rate": 5.5014597037083216e-05, "loss": 0.435, "step": 1335 }, { "epoch": 2.200988467874794, "grad_norm": 0.3429303527511449, "learning_rate": 5.497193796805326e-05, "loss": 0.4357, "step": 1336 }, { "epoch": 2.202635914332784, "grad_norm": 0.2515926340489381, "learning_rate": 5.4929259086749744e-05, "loss": 0.4397, "step": 1337 }, { "epoch": 2.2042833607907744, "grad_norm": 0.250524515126624, "learning_rate": 5.4886560449649345e-05, "loss": 0.4387, "step": 1338 }, { "epoch": 2.2059308072487642, "grad_norm": 0.3387373409200534, "learning_rate": 5.484384211325491e-05, "loss": 0.4306, "step": 1339 }, { "epoch": 2.2075782537067545, "grad_norm": 0.3958989227036364, "learning_rate": 5.480110413409536e-05, "loss": 0.4277, "step": 1340 }, { "epoch": 2.209225700164745, "grad_norm": 0.40072929639792854, "learning_rate": 5.4758346568725595e-05, "loss": 0.4376, "step": 1341 }, { "epoch": 2.2108731466227347, "grad_norm": 0.33384381101846233, "learning_rate": 5.4715569473726436e-05, "loss": 0.4353, "step": 1342 }, { "epoch": 2.212520593080725, "grad_norm": 0.2349624370931048, "learning_rate": 5.4672772905704554e-05, "loss": 0.4322, "step": 1343 }, { "epoch": 2.214168039538715, "grad_norm": 0.2294925121349414, "learning_rate": 5.462995692129239e-05, "loss": 0.434, "step": 1344 }, { "epoch": 2.215815485996705, "grad_norm": 0.32645329916728366, "learning_rate": 5.458712157714807e-05, "loss": 0.446, "step": 1345 }, { "epoch": 2.217462932454695, "grad_norm": 0.3537012415063282, "learning_rate": 5.454426692995534e-05, "loss": 0.4367, "step": 1346 }, { "epoch": 2.2191103789126854, "grad_norm": 0.31739945992339713, "learning_rate": 5.450139303642349e-05, "loss": 0.4382, "step": 1347 }, { "epoch": 2.2207578253706757, "grad_norm": 0.34087393540545086, "learning_rate": 5.4458499953287285e-05, "loss": 0.4397, "step": 1348 }, { "epoch": 2.2224052718286655, "grad_norm": 0.34646277885205856, "learning_rate": 5.441558773730687e-05, "loss": 0.4305, "step": 1349 }, { "epoch": 2.224052718286656, "grad_norm": 0.26039432825118, "learning_rate": 5.4372656445267726e-05, "loss": 0.4434, "step": 1350 }, { "epoch": 2.2257001647446457, "grad_norm": 0.2602653575239823, "learning_rate": 5.4329706133980554e-05, "loss": 0.4345, "step": 1351 }, { "epoch": 2.227347611202636, "grad_norm": 0.2515463293555506, "learning_rate": 5.428673686028126e-05, "loss": 0.4412, "step": 1352 }, { "epoch": 2.228995057660626, "grad_norm": 0.24947675511900455, "learning_rate": 5.42437486810308e-05, "loss": 0.4409, "step": 1353 }, { "epoch": 2.230642504118616, "grad_norm": 0.2674996145529729, "learning_rate": 5.4200741653115186e-05, "loss": 0.439, "step": 1354 }, { "epoch": 2.232289950576606, "grad_norm": 0.2716303216359714, "learning_rate": 5.415771583344533e-05, "loss": 0.4374, "step": 1355 }, { "epoch": 2.2339373970345964, "grad_norm": 0.2893635652223762, "learning_rate": 5.411467127895705e-05, "loss": 0.4402, "step": 1356 }, { "epoch": 2.2355848434925867, "grad_norm": 0.23779340341714944, "learning_rate": 5.407160804661095e-05, "loss": 0.4409, "step": 1357 }, { "epoch": 2.2372322899505765, "grad_norm": 0.291042984515646, "learning_rate": 5.402852619339231e-05, "loss": 0.4359, "step": 1358 }, { "epoch": 2.238879736408567, "grad_norm": 0.3402556845638091, "learning_rate": 5.3985425776311126e-05, "loss": 0.4286, "step": 1359 }, { "epoch": 2.2405271828665567, "grad_norm": 0.30125947277438414, "learning_rate": 5.394230685240187e-05, "loss": 0.4384, "step": 1360 }, { "epoch": 2.242174629324547, "grad_norm": 0.2431734959767278, "learning_rate": 5.389916947872358e-05, "loss": 0.4309, "step": 1361 }, { "epoch": 2.243822075782537, "grad_norm": 0.2454204246347552, "learning_rate": 5.3856013712359665e-05, "loss": 0.4338, "step": 1362 }, { "epoch": 2.245469522240527, "grad_norm": 0.2935804495364915, "learning_rate": 5.3812839610417886e-05, "loss": 0.4293, "step": 1363 }, { "epoch": 2.247116968698517, "grad_norm": 0.3445670766267672, "learning_rate": 5.376964723003028e-05, "loss": 0.4373, "step": 1364 }, { "epoch": 2.2487644151565074, "grad_norm": 0.3591875590516158, "learning_rate": 5.372643662835303e-05, "loss": 0.4354, "step": 1365 }, { "epoch": 2.2504118616144977, "grad_norm": 0.3494594125361348, "learning_rate": 5.3683207862566495e-05, "loss": 0.4354, "step": 1366 }, { "epoch": 2.2520593080724876, "grad_norm": 0.32571318169557206, "learning_rate": 5.3639960989875e-05, "loss": 0.4304, "step": 1367 }, { "epoch": 2.253706754530478, "grad_norm": 0.26906493714831614, "learning_rate": 5.3596696067506897e-05, "loss": 0.4351, "step": 1368 }, { "epoch": 2.2553542009884677, "grad_norm": 0.22949306473034337, "learning_rate": 5.355341315271438e-05, "loss": 0.4294, "step": 1369 }, { "epoch": 2.257001647446458, "grad_norm": 0.25924835313087496, "learning_rate": 5.351011230277346e-05, "loss": 0.4318, "step": 1370 }, { "epoch": 2.258649093904448, "grad_norm": 0.3012359597175578, "learning_rate": 5.346679357498388e-05, "loss": 0.4305, "step": 1371 }, { "epoch": 2.260296540362438, "grad_norm": 0.31853080243247656, "learning_rate": 5.342345702666908e-05, "loss": 0.4379, "step": 1372 }, { "epoch": 2.2619439868204285, "grad_norm": 0.3935945514779735, "learning_rate": 5.3380102715176006e-05, "loss": 0.43, "step": 1373 }, { "epoch": 2.2635914332784184, "grad_norm": 0.35639328843634854, "learning_rate": 5.333673069787518e-05, "loss": 0.4353, "step": 1374 }, { "epoch": 2.2652388797364087, "grad_norm": 0.29256586128460527, "learning_rate": 5.3293341032160505e-05, "loss": 0.4337, "step": 1375 }, { "epoch": 2.2668863261943986, "grad_norm": 0.21922070092943874, "learning_rate": 5.324993377544928e-05, "loss": 0.4428, "step": 1376 }, { "epoch": 2.268533772652389, "grad_norm": 0.4226163514373856, "learning_rate": 5.320650898518205e-05, "loss": 0.4356, "step": 1377 }, { "epoch": 2.2701812191103787, "grad_norm": 0.5667294593509165, "learning_rate": 5.316306671882258e-05, "loss": 0.439, "step": 1378 }, { "epoch": 2.271828665568369, "grad_norm": 0.451809779901237, "learning_rate": 5.311960703385774e-05, "loss": 0.4341, "step": 1379 }, { "epoch": 2.2734761120263594, "grad_norm": 0.28949689193680334, "learning_rate": 5.307612998779748e-05, "loss": 0.4422, "step": 1380 }, { "epoch": 2.275123558484349, "grad_norm": 0.36198258510462755, "learning_rate": 5.30326356381747e-05, "loss": 0.4422, "step": 1381 }, { "epoch": 2.2767710049423395, "grad_norm": 0.34262289142011576, "learning_rate": 5.2989124042545194e-05, "loss": 0.4378, "step": 1382 }, { "epoch": 2.2784184514003294, "grad_norm": 0.2808363842647155, "learning_rate": 5.2945595258487625e-05, "loss": 0.4338, "step": 1383 }, { "epoch": 2.2800658978583197, "grad_norm": 0.32271047190939567, "learning_rate": 5.2902049343603343e-05, "loss": 0.4337, "step": 1384 }, { "epoch": 2.2817133443163096, "grad_norm": 0.3065856151604241, "learning_rate": 5.285848635551638e-05, "loss": 0.4406, "step": 1385 }, { "epoch": 2.2833607907743, "grad_norm": 0.24493119228171606, "learning_rate": 5.281490635187339e-05, "loss": 0.4457, "step": 1386 }, { "epoch": 2.28500823723229, "grad_norm": 0.2639159682768085, "learning_rate": 5.27713093903435e-05, "loss": 0.4315, "step": 1387 }, { "epoch": 2.28665568369028, "grad_norm": 0.2585952775722585, "learning_rate": 5.2727695528618334e-05, "loss": 0.4396, "step": 1388 }, { "epoch": 2.2883031301482704, "grad_norm": 0.31741854296571026, "learning_rate": 5.268406482441182e-05, "loss": 0.4319, "step": 1389 }, { "epoch": 2.2899505766062602, "grad_norm": 0.2978164741467532, "learning_rate": 5.2640417335460214e-05, "loss": 0.4338, "step": 1390 }, { "epoch": 2.2915980230642505, "grad_norm": 0.2455960682332144, "learning_rate": 5.2596753119521976e-05, "loss": 0.4337, "step": 1391 }, { "epoch": 2.2932454695222404, "grad_norm": 0.2609073680975944, "learning_rate": 5.255307223437767e-05, "loss": 0.4311, "step": 1392 }, { "epoch": 2.2948929159802307, "grad_norm": 0.36439122682576947, "learning_rate": 5.250937473782997e-05, "loss": 0.4318, "step": 1393 }, { "epoch": 2.2965403624382206, "grad_norm": 0.44133277892696104, "learning_rate": 5.246566068770349e-05, "loss": 0.4395, "step": 1394 }, { "epoch": 2.298187808896211, "grad_norm": 0.512015617932254, "learning_rate": 5.242193014184476e-05, "loss": 0.4357, "step": 1395 }, { "epoch": 2.2998352553542007, "grad_norm": 0.4545282773756923, "learning_rate": 5.237818315812218e-05, "loss": 0.4401, "step": 1396 }, { "epoch": 2.301482701812191, "grad_norm": 0.35928309816537446, "learning_rate": 5.23344197944258e-05, "loss": 0.4407, "step": 1397 }, { "epoch": 2.3031301482701814, "grad_norm": 0.2985417152491407, "learning_rate": 5.229064010866747e-05, "loss": 0.4347, "step": 1398 }, { "epoch": 2.3047775947281712, "grad_norm": 0.2703842610457379, "learning_rate": 5.2246844158780544e-05, "loss": 0.4353, "step": 1399 }, { "epoch": 2.3064250411861615, "grad_norm": 0.22096687072302218, "learning_rate": 5.2203032002719954e-05, "loss": 0.4457, "step": 1400 }, { "epoch": 2.3080724876441514, "grad_norm": 0.302417335988505, "learning_rate": 5.215920369846204e-05, "loss": 0.44, "step": 1401 }, { "epoch": 2.3097199341021417, "grad_norm": 0.3312130707945056, "learning_rate": 5.211535930400454e-05, "loss": 0.432, "step": 1402 }, { "epoch": 2.3113673805601316, "grad_norm": 0.30801299571074836, "learning_rate": 5.207149887736648e-05, "loss": 0.4378, "step": 1403 }, { "epoch": 2.313014827018122, "grad_norm": 0.27047798828545766, "learning_rate": 5.202762247658806e-05, "loss": 0.4324, "step": 1404 }, { "epoch": 2.314662273476112, "grad_norm": 0.3113696840517807, "learning_rate": 5.1983730159730685e-05, "loss": 0.4373, "step": 1405 }, { "epoch": 2.316309719934102, "grad_norm": 0.2861965094115557, "learning_rate": 5.193982198487678e-05, "loss": 0.4311, "step": 1406 }, { "epoch": 2.3179571663920924, "grad_norm": 0.2601424294814161, "learning_rate": 5.1895898010129755e-05, "loss": 0.439, "step": 1407 }, { "epoch": 2.3196046128500822, "grad_norm": 0.24940135276341513, "learning_rate": 5.185195829361394e-05, "loss": 0.447, "step": 1408 }, { "epoch": 2.3212520593080725, "grad_norm": 0.26946943026313325, "learning_rate": 5.1808002893474485e-05, "loss": 0.4312, "step": 1409 }, { "epoch": 2.3228995057660624, "grad_norm": 0.28211409841440505, "learning_rate": 5.176403186787732e-05, "loss": 0.4342, "step": 1410 }, { "epoch": 2.3245469522240527, "grad_norm": 0.2652518428690113, "learning_rate": 5.1720045275009015e-05, "loss": 0.4364, "step": 1411 }, { "epoch": 2.326194398682043, "grad_norm": 0.3279459765847013, "learning_rate": 5.167604317307677e-05, "loss": 0.4326, "step": 1412 }, { "epoch": 2.327841845140033, "grad_norm": 0.265839442275889, "learning_rate": 5.1632025620308294e-05, "loss": 0.4294, "step": 1413 }, { "epoch": 2.329489291598023, "grad_norm": 0.26838635306968317, "learning_rate": 5.158799267495173e-05, "loss": 0.4389, "step": 1414 }, { "epoch": 2.331136738056013, "grad_norm": 0.24811527582005824, "learning_rate": 5.154394439527563e-05, "loss": 0.4378, "step": 1415 }, { "epoch": 2.3327841845140034, "grad_norm": 0.33621435890916784, "learning_rate": 5.1499880839568795e-05, "loss": 0.4362, "step": 1416 }, { "epoch": 2.3344316309719932, "grad_norm": 0.3613327915015947, "learning_rate": 5.1455802066140264e-05, "loss": 0.4231, "step": 1417 }, { "epoch": 2.3360790774299836, "grad_norm": 0.2783035546994787, "learning_rate": 5.14117081333192e-05, "loss": 0.4312, "step": 1418 }, { "epoch": 2.337726523887974, "grad_norm": 0.2973623062056164, "learning_rate": 5.136759909945484e-05, "loss": 0.4362, "step": 1419 }, { "epoch": 2.3393739703459637, "grad_norm": 0.31261663340651696, "learning_rate": 5.132347502291639e-05, "loss": 0.4372, "step": 1420 }, { "epoch": 2.341021416803954, "grad_norm": 0.22570659157270717, "learning_rate": 5.127933596209297e-05, "loss": 0.435, "step": 1421 }, { "epoch": 2.342668863261944, "grad_norm": 0.2668147881410388, "learning_rate": 5.123518197539354e-05, "loss": 0.4311, "step": 1422 }, { "epoch": 2.344316309719934, "grad_norm": 0.2577384831204931, "learning_rate": 5.119101312124679e-05, "loss": 0.432, "step": 1423 }, { "epoch": 2.345963756177924, "grad_norm": 0.23912006144339928, "learning_rate": 5.11468294581011e-05, "loss": 0.4372, "step": 1424 }, { "epoch": 2.3476112026359144, "grad_norm": 0.23445888435374232, "learning_rate": 5.110263104442443e-05, "loss": 0.4374, "step": 1425 }, { "epoch": 2.3492586490939047, "grad_norm": 0.25651217415573635, "learning_rate": 5.105841793870427e-05, "loss": 0.4281, "step": 1426 }, { "epoch": 2.3509060955518946, "grad_norm": 0.25077960059021104, "learning_rate": 5.1014190199447585e-05, "loss": 0.4397, "step": 1427 }, { "epoch": 2.352553542009885, "grad_norm": 0.28500694207961863, "learning_rate": 5.0969947885180604e-05, "loss": 0.4387, "step": 1428 }, { "epoch": 2.3542009884678747, "grad_norm": 0.3184585293645126, "learning_rate": 5.092569105444898e-05, "loss": 0.4364, "step": 1429 }, { "epoch": 2.355848434925865, "grad_norm": 0.2866639874544935, "learning_rate": 5.088141976581746e-05, "loss": 0.434, "step": 1430 }, { "epoch": 2.357495881383855, "grad_norm": 0.23092652404887512, "learning_rate": 5.0837134077869976e-05, "loss": 0.4321, "step": 1431 }, { "epoch": 2.359143327841845, "grad_norm": 0.1925606934030895, "learning_rate": 5.079283404920952e-05, "loss": 0.434, "step": 1432 }, { "epoch": 2.360790774299835, "grad_norm": 0.1963924138858606, "learning_rate": 5.0748519738458044e-05, "loss": 0.4369, "step": 1433 }, { "epoch": 2.3624382207578254, "grad_norm": 0.27913392019905736, "learning_rate": 5.07041912042564e-05, "loss": 0.4322, "step": 1434 }, { "epoch": 2.3640856672158153, "grad_norm": 0.2840386551739174, "learning_rate": 5.065984850526427e-05, "loss": 0.4416, "step": 1435 }, { "epoch": 2.3657331136738056, "grad_norm": 0.26605530383315334, "learning_rate": 5.0615491700160055e-05, "loss": 0.4402, "step": 1436 }, { "epoch": 2.367380560131796, "grad_norm": 0.2612211636365365, "learning_rate": 5.057112084764087e-05, "loss": 0.4358, "step": 1437 }, { "epoch": 2.3690280065897857, "grad_norm": 0.24673095873616396, "learning_rate": 5.052673600642237e-05, "loss": 0.4319, "step": 1438 }, { "epoch": 2.370675453047776, "grad_norm": 0.28492559815699503, "learning_rate": 5.048233723523875e-05, "loss": 0.4309, "step": 1439 }, { "epoch": 2.372322899505766, "grad_norm": 0.2949604109592219, "learning_rate": 5.043792459284266e-05, "loss": 0.4416, "step": 1440 }, { "epoch": 2.3739703459637562, "grad_norm": 0.30613258658509046, "learning_rate": 5.0393498138005024e-05, "loss": 0.4325, "step": 1441 }, { "epoch": 2.375617792421746, "grad_norm": 0.2999449489410412, "learning_rate": 5.0349057929515136e-05, "loss": 0.4366, "step": 1442 }, { "epoch": 2.3772652388797364, "grad_norm": 0.2984221769652478, "learning_rate": 5.030460402618044e-05, "loss": 0.4383, "step": 1443 }, { "epoch": 2.3789126853377267, "grad_norm": 0.2642860422122808, "learning_rate": 5.026013648682651e-05, "loss": 0.4295, "step": 1444 }, { "epoch": 2.3805601317957166, "grad_norm": 0.22919968245123515, "learning_rate": 5.0215655370296965e-05, "loss": 0.4388, "step": 1445 }, { "epoch": 2.382207578253707, "grad_norm": 0.2690739638499515, "learning_rate": 5.017116073545341e-05, "loss": 0.4294, "step": 1446 }, { "epoch": 2.3838550247116967, "grad_norm": 0.3315781399943425, "learning_rate": 5.01266526411753e-05, "loss": 0.4395, "step": 1447 }, { "epoch": 2.385502471169687, "grad_norm": 0.35724117964354646, "learning_rate": 5.008213114635993e-05, "loss": 0.4308, "step": 1448 }, { "epoch": 2.387149917627677, "grad_norm": 0.38704671673808333, "learning_rate": 5.0037596309922334e-05, "loss": 0.4403, "step": 1449 }, { "epoch": 2.3887973640856672, "grad_norm": 0.3343266679531296, "learning_rate": 4.999304819079517e-05, "loss": 0.438, "step": 1450 }, { "epoch": 2.3904448105436575, "grad_norm": 0.23349617681116286, "learning_rate": 4.99484868479287e-05, "loss": 0.4333, "step": 1451 }, { "epoch": 2.3920922570016474, "grad_norm": 0.2627524413297394, "learning_rate": 4.9903912340290683e-05, "loss": 0.4457, "step": 1452 }, { "epoch": 2.3937397034596377, "grad_norm": 0.33687428991864726, "learning_rate": 4.985932472686627e-05, "loss": 0.4287, "step": 1453 }, { "epoch": 2.3953871499176276, "grad_norm": 0.29737455155643056, "learning_rate": 4.981472406665799e-05, "loss": 0.4412, "step": 1454 }, { "epoch": 2.397034596375618, "grad_norm": 0.34355698577258553, "learning_rate": 4.977011041868562e-05, "loss": 0.4401, "step": 1455 }, { "epoch": 2.3986820428336078, "grad_norm": 0.31343122533864276, "learning_rate": 4.972548384198613e-05, "loss": 0.4299, "step": 1456 }, { "epoch": 2.400329489291598, "grad_norm": 0.22685276352781913, "learning_rate": 4.96808443956136e-05, "loss": 0.4305, "step": 1457 }, { "epoch": 2.4019769357495884, "grad_norm": 0.2822452096853593, "learning_rate": 4.9636192138639116e-05, "loss": 0.4315, "step": 1458 }, { "epoch": 2.4036243822075782, "grad_norm": 0.31961269667357567, "learning_rate": 4.9591527130150754e-05, "loss": 0.4352, "step": 1459 }, { "epoch": 2.4052718286655685, "grad_norm": 0.3023467033254562, "learning_rate": 4.9546849429253426e-05, "loss": 0.4411, "step": 1460 }, { "epoch": 2.4069192751235584, "grad_norm": 0.23750215212088885, "learning_rate": 4.950215909506888e-05, "loss": 0.4305, "step": 1461 }, { "epoch": 2.4085667215815487, "grad_norm": 0.259362333176414, "learning_rate": 4.9457456186735554e-05, "loss": 0.4352, "step": 1462 }, { "epoch": 2.4102141680395386, "grad_norm": 0.28421337637248034, "learning_rate": 4.941274076340852e-05, "loss": 0.4363, "step": 1463 }, { "epoch": 2.411861614497529, "grad_norm": 0.2657977911459538, "learning_rate": 4.936801288425945e-05, "loss": 0.4333, "step": 1464 }, { "epoch": 2.4135090609555188, "grad_norm": 0.24959095533199735, "learning_rate": 4.9323272608476444e-05, "loss": 0.4381, "step": 1465 }, { "epoch": 2.415156507413509, "grad_norm": 0.24372292040153404, "learning_rate": 4.927851999526405e-05, "loss": 0.4422, "step": 1466 }, { "epoch": 2.416803953871499, "grad_norm": 0.2801503211870354, "learning_rate": 4.9233755103843115e-05, "loss": 0.4329, "step": 1467 }, { "epoch": 2.4184514003294892, "grad_norm": 0.2385670877938533, "learning_rate": 4.9188977993450754e-05, "loss": 0.4407, "step": 1468 }, { "epoch": 2.4200988467874796, "grad_norm": 0.23600366090557764, "learning_rate": 4.914418872334024e-05, "loss": 0.4444, "step": 1469 }, { "epoch": 2.4217462932454694, "grad_norm": 0.24982637085127848, "learning_rate": 4.9099387352780946e-05, "loss": 0.4329, "step": 1470 }, { "epoch": 2.4233937397034597, "grad_norm": 0.26940862348562017, "learning_rate": 4.905457394105824e-05, "loss": 0.4286, "step": 1471 }, { "epoch": 2.4250411861614496, "grad_norm": 0.22634566866359146, "learning_rate": 4.9009748547473434e-05, "loss": 0.4292, "step": 1472 }, { "epoch": 2.42668863261944, "grad_norm": 0.2729306795243545, "learning_rate": 4.89649112313437e-05, "loss": 0.4335, "step": 1473 }, { "epoch": 2.4283360790774298, "grad_norm": 0.2629349515111156, "learning_rate": 4.892006205200199e-05, "loss": 0.4228, "step": 1474 }, { "epoch": 2.42998352553542, "grad_norm": 0.22056611873524357, "learning_rate": 4.8875201068796936e-05, "loss": 0.4335, "step": 1475 }, { "epoch": 2.4316309719934104, "grad_norm": 0.21878553221895014, "learning_rate": 4.88303283410928e-05, "loss": 0.432, "step": 1476 }, { "epoch": 2.4332784184514002, "grad_norm": 0.26330833598062675, "learning_rate": 4.8785443928269404e-05, "loss": 0.4339, "step": 1477 }, { "epoch": 2.4349258649093906, "grad_norm": 0.30716421196915217, "learning_rate": 4.8740547889722e-05, "loss": 0.4334, "step": 1478 }, { "epoch": 2.4365733113673804, "grad_norm": 0.31907307645943694, "learning_rate": 4.869564028486124e-05, "loss": 0.4346, "step": 1479 }, { "epoch": 2.4382207578253707, "grad_norm": 0.21854476957066563, "learning_rate": 4.86507211731131e-05, "loss": 0.4393, "step": 1480 }, { "epoch": 2.4398682042833606, "grad_norm": 0.29550116502421286, "learning_rate": 4.860579061391875e-05, "loss": 0.4312, "step": 1481 }, { "epoch": 2.441515650741351, "grad_norm": 0.3104557214286182, "learning_rate": 4.8560848666734506e-05, "loss": 0.4359, "step": 1482 }, { "epoch": 2.443163097199341, "grad_norm": 0.25990327412651115, "learning_rate": 4.851589539103182e-05, "loss": 0.4303, "step": 1483 }, { "epoch": 2.444810543657331, "grad_norm": 0.2684482074383332, "learning_rate": 4.847093084629703e-05, "loss": 0.4287, "step": 1484 }, { "epoch": 2.4464579901153214, "grad_norm": 0.27338994056871546, "learning_rate": 4.8425955092031475e-05, "loss": 0.4316, "step": 1485 }, { "epoch": 2.4481054365733113, "grad_norm": 0.2560170839978574, "learning_rate": 4.83809681877513e-05, "loss": 0.4344, "step": 1486 }, { "epoch": 2.4497528830313016, "grad_norm": 0.25926103184365784, "learning_rate": 4.833597019298737e-05, "loss": 0.4303, "step": 1487 }, { "epoch": 2.4514003294892914, "grad_norm": 0.24036552683909995, "learning_rate": 4.8290961167285276e-05, "loss": 0.431, "step": 1488 }, { "epoch": 2.4530477759472817, "grad_norm": 0.24172760613818187, "learning_rate": 4.8245941170205176e-05, "loss": 0.4325, "step": 1489 }, { "epoch": 2.454695222405272, "grad_norm": 0.27068572071559466, "learning_rate": 4.8200910261321775e-05, "loss": 0.434, "step": 1490 }, { "epoch": 2.456342668863262, "grad_norm": 0.2950617977132491, "learning_rate": 4.815586850022418e-05, "loss": 0.437, "step": 1491 }, { "epoch": 2.4579901153212522, "grad_norm": 0.2479047791404462, "learning_rate": 4.811081594651587e-05, "loss": 0.4393, "step": 1492 }, { "epoch": 2.459637561779242, "grad_norm": 0.2441774129686774, "learning_rate": 4.806575265981463e-05, "loss": 0.4357, "step": 1493 }, { "epoch": 2.4612850082372324, "grad_norm": 0.21692100066690753, "learning_rate": 4.802067869975241e-05, "loss": 0.4287, "step": 1494 }, { "epoch": 2.4629324546952223, "grad_norm": 0.19169786688879756, "learning_rate": 4.797559412597533e-05, "loss": 0.4272, "step": 1495 }, { "epoch": 2.4645799011532126, "grad_norm": 0.31324845457989725, "learning_rate": 4.7930498998143516e-05, "loss": 0.4367, "step": 1496 }, { "epoch": 2.466227347611203, "grad_norm": 0.402181918365328, "learning_rate": 4.788539337593108e-05, "loss": 0.4413, "step": 1497 }, { "epoch": 2.4678747940691927, "grad_norm": 0.27177037638072565, "learning_rate": 4.784027731902601e-05, "loss": 0.4367, "step": 1498 }, { "epoch": 2.469522240527183, "grad_norm": 0.2651804497522646, "learning_rate": 4.77951508871301e-05, "loss": 0.4342, "step": 1499 }, { "epoch": 2.471169686985173, "grad_norm": 0.3315651212077936, "learning_rate": 4.775001413995889e-05, "loss": 0.4408, "step": 1500 }, { "epoch": 2.4728171334431632, "grad_norm": 0.36793620160672674, "learning_rate": 4.770486713724156e-05, "loss": 0.4291, "step": 1501 }, { "epoch": 2.474464579901153, "grad_norm": 0.3245379455349387, "learning_rate": 4.765970993872087e-05, "loss": 0.4252, "step": 1502 }, { "epoch": 2.4761120263591434, "grad_norm": 0.34066723280393196, "learning_rate": 4.761454260415304e-05, "loss": 0.4416, "step": 1503 }, { "epoch": 2.4777594728171333, "grad_norm": 0.38946369204419873, "learning_rate": 4.756936519330774e-05, "loss": 0.442, "step": 1504 }, { "epoch": 2.4794069192751236, "grad_norm": 0.3448499385228827, "learning_rate": 4.7524177765967956e-05, "loss": 0.4313, "step": 1505 }, { "epoch": 2.4810543657331134, "grad_norm": 0.31306641322118633, "learning_rate": 4.7478980381929944e-05, "loss": 0.4331, "step": 1506 }, { "epoch": 2.4827018121911038, "grad_norm": 0.2862948292029937, "learning_rate": 4.7433773101003115e-05, "loss": 0.4313, "step": 1507 }, { "epoch": 2.484349258649094, "grad_norm": 0.320128840398156, "learning_rate": 4.738855598300999e-05, "loss": 0.4263, "step": 1508 }, { "epoch": 2.485996705107084, "grad_norm": 0.4172470006361805, "learning_rate": 4.7343329087786106e-05, "loss": 0.4364, "step": 1509 }, { "epoch": 2.4876441515650742, "grad_norm": 0.32825735003851897, "learning_rate": 4.729809247517993e-05, "loss": 0.4346, "step": 1510 }, { "epoch": 2.489291598023064, "grad_norm": 0.22765149724769732, "learning_rate": 4.72528462050528e-05, "loss": 0.4424, "step": 1511 }, { "epoch": 2.4909390444810544, "grad_norm": 0.25728882254924607, "learning_rate": 4.7207590337278814e-05, "loss": 0.4382, "step": 1512 }, { "epoch": 2.4925864909390443, "grad_norm": 0.2665756005151125, "learning_rate": 4.71623249317448e-05, "loss": 0.4326, "step": 1513 }, { "epoch": 2.4942339373970346, "grad_norm": 0.3009391175179251, "learning_rate": 4.7117050048350185e-05, "loss": 0.4337, "step": 1514 }, { "epoch": 2.495881383855025, "grad_norm": 0.24315531665669946, "learning_rate": 4.707176574700694e-05, "loss": 0.4283, "step": 1515 }, { "epoch": 2.4975288303130148, "grad_norm": 0.23410453968734568, "learning_rate": 4.7026472087639515e-05, "loss": 0.4332, "step": 1516 }, { "epoch": 2.499176276771005, "grad_norm": 0.29000660888207297, "learning_rate": 4.6981169130184714e-05, "loss": 0.4387, "step": 1517 }, { "epoch": 2.500823723228995, "grad_norm": 0.3296764380601096, "learning_rate": 4.693585693459166e-05, "loss": 0.4314, "step": 1518 }, { "epoch": 2.5024711696869852, "grad_norm": 0.27723833512356477, "learning_rate": 4.689053556082174e-05, "loss": 0.4408, "step": 1519 }, { "epoch": 2.504118616144975, "grad_norm": 0.203820127384556, "learning_rate": 4.6845205068848414e-05, "loss": 0.4333, "step": 1520 }, { "epoch": 2.5057660626029654, "grad_norm": 0.22288911046539342, "learning_rate": 4.6799865518657244e-05, "loss": 0.4353, "step": 1521 }, { "epoch": 2.5074135090609557, "grad_norm": 0.19388881360215587, "learning_rate": 4.675451697024581e-05, "loss": 0.4367, "step": 1522 }, { "epoch": 2.5090609555189456, "grad_norm": 0.25814726993708453, "learning_rate": 4.670915948362353e-05, "loss": 0.4284, "step": 1523 }, { "epoch": 2.510708401976936, "grad_norm": 0.2676346448838469, "learning_rate": 4.6663793118811705e-05, "loss": 0.4294, "step": 1524 }, { "epoch": 2.5123558484349258, "grad_norm": 0.2519885904265151, "learning_rate": 4.661841793584337e-05, "loss": 0.4325, "step": 1525 }, { "epoch": 2.514003294892916, "grad_norm": 0.22977477064241503, "learning_rate": 4.657303399476322e-05, "loss": 0.4296, "step": 1526 }, { "epoch": 2.515650741350906, "grad_norm": 0.32844150907235786, "learning_rate": 4.652764135562754e-05, "loss": 0.4272, "step": 1527 }, { "epoch": 2.5172981878088962, "grad_norm": 0.43476239848254716, "learning_rate": 4.648224007850414e-05, "loss": 0.4318, "step": 1528 }, { "epoch": 2.5189456342668866, "grad_norm": 0.43540162173512686, "learning_rate": 4.643683022347225e-05, "loss": 0.4341, "step": 1529 }, { "epoch": 2.5205930807248764, "grad_norm": 0.2749829637500747, "learning_rate": 4.6391411850622444e-05, "loss": 0.428, "step": 1530 }, { "epoch": 2.5222405271828663, "grad_norm": 0.39360935009073444, "learning_rate": 4.63459850200566e-05, "loss": 0.4325, "step": 1531 }, { "epoch": 2.5238879736408566, "grad_norm": 0.32298555070808604, "learning_rate": 4.6300549791887744e-05, "loss": 0.421, "step": 1532 }, { "epoch": 2.525535420098847, "grad_norm": 0.2693197913031904, "learning_rate": 4.625510622624002e-05, "loss": 0.4316, "step": 1533 }, { "epoch": 2.5271828665568368, "grad_norm": 0.2507492730808459, "learning_rate": 4.6209654383248656e-05, "loss": 0.437, "step": 1534 }, { "epoch": 2.528830313014827, "grad_norm": 0.22123257424546375, "learning_rate": 4.616419432305976e-05, "loss": 0.4342, "step": 1535 }, { "epoch": 2.5304777594728174, "grad_norm": 0.27002838496923504, "learning_rate": 4.6118726105830384e-05, "loss": 0.4303, "step": 1536 }, { "epoch": 2.5321252059308073, "grad_norm": 0.23334484600402836, "learning_rate": 4.607324979172832e-05, "loss": 0.4334, "step": 1537 }, { "epoch": 2.533772652388797, "grad_norm": 0.25544942868856596, "learning_rate": 4.602776544093209e-05, "loss": 0.4288, "step": 1538 }, { "epoch": 2.5354200988467874, "grad_norm": 0.29731531128630073, "learning_rate": 4.598227311363088e-05, "loss": 0.4307, "step": 1539 }, { "epoch": 2.5370675453047777, "grad_norm": 0.2344667280475107, "learning_rate": 4.5936772870024376e-05, "loss": 0.4311, "step": 1540 }, { "epoch": 2.5387149917627676, "grad_norm": 0.22561521673371213, "learning_rate": 4.589126477032281e-05, "loss": 0.4302, "step": 1541 }, { "epoch": 2.540362438220758, "grad_norm": 0.27300514459569747, "learning_rate": 4.584574887474674e-05, "loss": 0.4369, "step": 1542 }, { "epoch": 2.5420098846787478, "grad_norm": 0.3728410255341461, "learning_rate": 4.580022524352707e-05, "loss": 0.4381, "step": 1543 }, { "epoch": 2.543657331136738, "grad_norm": 0.35993591705567146, "learning_rate": 4.5754693936904964e-05, "loss": 0.4398, "step": 1544 }, { "epoch": 2.545304777594728, "grad_norm": 0.3241941779706033, "learning_rate": 4.5709155015131704e-05, "loss": 0.4403, "step": 1545 }, { "epoch": 2.5469522240527183, "grad_norm": 0.2914758840387498, "learning_rate": 4.566360853846868e-05, "loss": 0.4412, "step": 1546 }, { "epoch": 2.5485996705107086, "grad_norm": 0.264033123260727, "learning_rate": 4.561805456718723e-05, "loss": 0.4253, "step": 1547 }, { "epoch": 2.5502471169686984, "grad_norm": 0.25696131502314556, "learning_rate": 4.5572493161568694e-05, "loss": 0.4325, "step": 1548 }, { "epoch": 2.5518945634266887, "grad_norm": 0.3262289293169251, "learning_rate": 4.552692438190417e-05, "loss": 0.4348, "step": 1549 }, { "epoch": 2.5535420098846786, "grad_norm": 0.26638801817392815, "learning_rate": 4.548134828849454e-05, "loss": 0.434, "step": 1550 }, { "epoch": 2.555189456342669, "grad_norm": 0.249505698648917, "learning_rate": 4.543576494165039e-05, "loss": 0.439, "step": 1551 }, { "epoch": 2.556836902800659, "grad_norm": 0.3293432150859243, "learning_rate": 4.5390174401691863e-05, "loss": 0.4293, "step": 1552 }, { "epoch": 2.558484349258649, "grad_norm": 0.3931440341513524, "learning_rate": 4.5344576728948654e-05, "loss": 0.4429, "step": 1553 }, { "epoch": 2.5601317957166394, "grad_norm": 0.2833281400350413, "learning_rate": 4.529897198375988e-05, "loss": 0.4321, "step": 1554 }, { "epoch": 2.5617792421746293, "grad_norm": 0.22173275318458835, "learning_rate": 4.525336022647401e-05, "loss": 0.4295, "step": 1555 }, { "epoch": 2.5634266886326196, "grad_norm": 0.3161253338340574, "learning_rate": 4.520774151744882e-05, "loss": 0.4416, "step": 1556 }, { "epoch": 2.5650741350906094, "grad_norm": 0.28004425716291015, "learning_rate": 4.5162115917051246e-05, "loss": 0.434, "step": 1557 }, { "epoch": 2.5667215815485998, "grad_norm": 0.18425953593011507, "learning_rate": 4.511648348565739e-05, "loss": 0.4382, "step": 1558 }, { "epoch": 2.5683690280065896, "grad_norm": 0.2482859147037315, "learning_rate": 4.5070844283652356e-05, "loss": 0.4325, "step": 1559 }, { "epoch": 2.57001647446458, "grad_norm": 0.22769155670686736, "learning_rate": 4.5025198371430214e-05, "loss": 0.4313, "step": 1560 }, { "epoch": 2.5716639209225702, "grad_norm": 0.23157720098845505, "learning_rate": 4.497954580939393e-05, "loss": 0.4312, "step": 1561 }, { "epoch": 2.57331136738056, "grad_norm": 0.23729166660166212, "learning_rate": 4.493388665795523e-05, "loss": 0.429, "step": 1562 }, { "epoch": 2.5749588138385504, "grad_norm": 0.2731200812644811, "learning_rate": 4.4888220977534646e-05, "loss": 0.4395, "step": 1563 }, { "epoch": 2.5766062602965403, "grad_norm": 0.27847144807420143, "learning_rate": 4.484254882856123e-05, "loss": 0.4265, "step": 1564 }, { "epoch": 2.5782537067545306, "grad_norm": 0.21185757895381704, "learning_rate": 4.479687027147271e-05, "loss": 0.4384, "step": 1565 }, { "epoch": 2.5799011532125204, "grad_norm": 0.3017813937273821, "learning_rate": 4.475118536671521e-05, "loss": 0.4327, "step": 1566 }, { "epoch": 2.5815485996705108, "grad_norm": 0.34053562273310484, "learning_rate": 4.4705494174743274e-05, "loss": 0.4356, "step": 1567 }, { "epoch": 2.583196046128501, "grad_norm": 0.2913158756654447, "learning_rate": 4.465979675601981e-05, "loss": 0.4396, "step": 1568 }, { "epoch": 2.584843492586491, "grad_norm": 0.27339322703094937, "learning_rate": 4.4614093171015896e-05, "loss": 0.4356, "step": 1569 }, { "epoch": 2.586490939044481, "grad_norm": 0.2620358261849029, "learning_rate": 4.456838348021084e-05, "loss": 0.4298, "step": 1570 }, { "epoch": 2.588138385502471, "grad_norm": 0.3248818488757718, "learning_rate": 4.4522667744091965e-05, "loss": 0.4398, "step": 1571 }, { "epoch": 2.5897858319604614, "grad_norm": 0.31463656271153173, "learning_rate": 4.447694602315463e-05, "loss": 0.434, "step": 1572 }, { "epoch": 2.5914332784184513, "grad_norm": 0.3025748202707065, "learning_rate": 4.443121837790212e-05, "loss": 0.4362, "step": 1573 }, { "epoch": 2.5930807248764416, "grad_norm": 0.3760806559354288, "learning_rate": 4.438548486884554e-05, "loss": 0.4299, "step": 1574 }, { "epoch": 2.594728171334432, "grad_norm": 0.37115918488606936, "learning_rate": 4.4339745556503765e-05, "loss": 0.4363, "step": 1575 }, { "epoch": 2.5963756177924218, "grad_norm": 0.28007953266525676, "learning_rate": 4.429400050140335e-05, "loss": 0.4428, "step": 1576 }, { "epoch": 2.5980230642504116, "grad_norm": 0.24357672354730606, "learning_rate": 4.424824976407843e-05, "loss": 0.4379, "step": 1577 }, { "epoch": 2.599670510708402, "grad_norm": 0.4152695206537926, "learning_rate": 4.420249340507068e-05, "loss": 0.4332, "step": 1578 }, { "epoch": 2.6013179571663922, "grad_norm": 0.45627846519093196, "learning_rate": 4.4156731484929224e-05, "loss": 0.4313, "step": 1579 }, { "epoch": 2.602965403624382, "grad_norm": 0.27686156590661154, "learning_rate": 4.411096406421052e-05, "loss": 0.4335, "step": 1580 }, { "epoch": 2.6046128500823724, "grad_norm": 0.4540758897988175, "learning_rate": 4.4065191203478295e-05, "loss": 0.426, "step": 1581 }, { "epoch": 2.6062602965403623, "grad_norm": 0.4673283290753148, "learning_rate": 4.401941296330353e-05, "loss": 0.433, "step": 1582 }, { "epoch": 2.6079077429983526, "grad_norm": 0.2574714821390449, "learning_rate": 4.397362940426427e-05, "loss": 0.4329, "step": 1583 }, { "epoch": 2.6095551894563425, "grad_norm": 0.4257304424927718, "learning_rate": 4.392784058694561e-05, "loss": 0.4372, "step": 1584 }, { "epoch": 2.6112026359143328, "grad_norm": 0.4281890064672699, "learning_rate": 4.388204657193963e-05, "loss": 0.4319, "step": 1585 }, { "epoch": 2.612850082372323, "grad_norm": 0.2381060048245174, "learning_rate": 4.383624741984525e-05, "loss": 0.4298, "step": 1586 }, { "epoch": 2.614497528830313, "grad_norm": 0.4718705414787536, "learning_rate": 4.379044319126822e-05, "loss": 0.4278, "step": 1587 }, { "epoch": 2.6161449752883033, "grad_norm": 0.46854520538475675, "learning_rate": 4.3744633946821e-05, "loss": 0.4315, "step": 1588 }, { "epoch": 2.617792421746293, "grad_norm": 0.2919193303563457, "learning_rate": 4.369881974712266e-05, "loss": 0.4362, "step": 1589 }, { "epoch": 2.6194398682042834, "grad_norm": 0.40423244945468273, "learning_rate": 4.365300065279887e-05, "loss": 0.4337, "step": 1590 }, { "epoch": 2.6210873146622733, "grad_norm": 0.4123835438872673, "learning_rate": 4.360717672448174e-05, "loss": 0.4274, "step": 1591 }, { "epoch": 2.6227347611202636, "grad_norm": 0.33954427721475633, "learning_rate": 4.3561348022809815e-05, "loss": 0.4405, "step": 1592 }, { "epoch": 2.624382207578254, "grad_norm": 0.3935316082059271, "learning_rate": 4.351551460842792e-05, "loss": 0.4331, "step": 1593 }, { "epoch": 2.6260296540362438, "grad_norm": 0.3476184404569561, "learning_rate": 4.346967654198711e-05, "loss": 0.4365, "step": 1594 }, { "epoch": 2.627677100494234, "grad_norm": 0.33849711686793676, "learning_rate": 4.342383388414465e-05, "loss": 0.4326, "step": 1595 }, { "epoch": 2.629324546952224, "grad_norm": 0.3612987385143651, "learning_rate": 4.3377986695563824e-05, "loss": 0.438, "step": 1596 }, { "epoch": 2.6309719934102143, "grad_norm": 0.30678530363475986, "learning_rate": 4.333213503691396e-05, "loss": 0.4413, "step": 1597 }, { "epoch": 2.632619439868204, "grad_norm": 0.30587999251180403, "learning_rate": 4.328627896887026e-05, "loss": 0.4327, "step": 1598 }, { "epoch": 2.6342668863261944, "grad_norm": 0.2816223811083709, "learning_rate": 4.3240418552113784e-05, "loss": 0.433, "step": 1599 }, { "epoch": 2.6359143327841847, "grad_norm": 0.2141359347490018, "learning_rate": 4.319455384733133e-05, "loss": 0.4268, "step": 1600 }, { "epoch": 2.6375617792421746, "grad_norm": 0.22162428802080505, "learning_rate": 4.314868491521539e-05, "loss": 0.43, "step": 1601 }, { "epoch": 2.6392092257001645, "grad_norm": 0.3150984334637902, "learning_rate": 4.3102811816464037e-05, "loss": 0.4363, "step": 1602 }, { "epoch": 2.640856672158155, "grad_norm": 0.3002813917951298, "learning_rate": 4.305693461178085e-05, "loss": 0.4295, "step": 1603 }, { "epoch": 2.642504118616145, "grad_norm": 0.27594468169857705, "learning_rate": 4.3011053361874874e-05, "loss": 0.4281, "step": 1604 }, { "epoch": 2.644151565074135, "grad_norm": 0.25201934650978963, "learning_rate": 4.296516812746047e-05, "loss": 0.4361, "step": 1605 }, { "epoch": 2.6457990115321253, "grad_norm": 0.26009867438086265, "learning_rate": 4.291927896925729e-05, "loss": 0.4344, "step": 1606 }, { "epoch": 2.6474464579901156, "grad_norm": 0.219440106175037, "learning_rate": 4.2873385947990175e-05, "loss": 0.4386, "step": 1607 }, { "epoch": 2.6490939044481054, "grad_norm": 0.23991622005268218, "learning_rate": 4.282748912438908e-05, "loss": 0.4355, "step": 1608 }, { "epoch": 2.6507413509060953, "grad_norm": 0.25707700049624854, "learning_rate": 4.278158855918899e-05, "loss": 0.4308, "step": 1609 }, { "epoch": 2.6523887973640856, "grad_norm": 0.23354864201189762, "learning_rate": 4.273568431312985e-05, "loss": 0.4439, "step": 1610 }, { "epoch": 2.654036243822076, "grad_norm": 0.2789980168764591, "learning_rate": 4.2689776446956444e-05, "loss": 0.4402, "step": 1611 }, { "epoch": 2.655683690280066, "grad_norm": 0.2820970671094961, "learning_rate": 4.2643865021418384e-05, "loss": 0.4367, "step": 1612 }, { "epoch": 2.657331136738056, "grad_norm": 0.2983824459890313, "learning_rate": 4.2597950097269975e-05, "loss": 0.4275, "step": 1613 }, { "epoch": 2.658978583196046, "grad_norm": 0.25131800599844945, "learning_rate": 4.255203173527016e-05, "loss": 0.4281, "step": 1614 }, { "epoch": 2.6606260296540363, "grad_norm": 0.22011860564074862, "learning_rate": 4.250610999618241e-05, "loss": 0.4307, "step": 1615 }, { "epoch": 2.662273476112026, "grad_norm": 0.2547757865985801, "learning_rate": 4.2460184940774714e-05, "loss": 0.4414, "step": 1616 }, { "epoch": 2.6639209225700164, "grad_norm": 0.35731741546215656, "learning_rate": 4.2414256629819376e-05, "loss": 0.4314, "step": 1617 }, { "epoch": 2.6655683690280068, "grad_norm": 0.3873077034582667, "learning_rate": 4.236832512409306e-05, "loss": 0.4334, "step": 1618 }, { "epoch": 2.6672158154859966, "grad_norm": 0.3396824627029171, "learning_rate": 4.2322390484376664e-05, "loss": 0.4305, "step": 1619 }, { "epoch": 2.668863261943987, "grad_norm": 0.23620156858941085, "learning_rate": 4.227645277145518e-05, "loss": 0.4337, "step": 1620 }, { "epoch": 2.670510708401977, "grad_norm": 0.2883461642088114, "learning_rate": 4.223051204611775e-05, "loss": 0.4247, "step": 1621 }, { "epoch": 2.672158154859967, "grad_norm": 0.4401208834563027, "learning_rate": 4.218456836915741e-05, "loss": 0.4431, "step": 1622 }, { "epoch": 2.673805601317957, "grad_norm": 0.3880136537089656, "learning_rate": 4.213862180137116e-05, "loss": 0.4281, "step": 1623 }, { "epoch": 2.6754530477759473, "grad_norm": 0.32388918421206164, "learning_rate": 4.209267240355982e-05, "loss": 0.4338, "step": 1624 }, { "epoch": 2.6771004942339376, "grad_norm": 0.23003936530249808, "learning_rate": 4.204672023652793e-05, "loss": 0.4306, "step": 1625 }, { "epoch": 2.6787479406919275, "grad_norm": 0.30362205532523673, "learning_rate": 4.200076536108373e-05, "loss": 0.4313, "step": 1626 }, { "epoch": 2.6803953871499178, "grad_norm": 0.28508878573655855, "learning_rate": 4.195480783803901e-05, "loss": 0.4324, "step": 1627 }, { "epoch": 2.6820428336079076, "grad_norm": 0.24386601355381354, "learning_rate": 4.1908847728209074e-05, "loss": 0.4275, "step": 1628 }, { "epoch": 2.683690280065898, "grad_norm": 0.225854222258109, "learning_rate": 4.1862885092412664e-05, "loss": 0.4325, "step": 1629 }, { "epoch": 2.685337726523888, "grad_norm": 0.3123077047761177, "learning_rate": 4.181691999147185e-05, "loss": 0.4279, "step": 1630 }, { "epoch": 2.686985172981878, "grad_norm": 0.3260787343420072, "learning_rate": 4.1770952486211974e-05, "loss": 0.4266, "step": 1631 }, { "epoch": 2.6886326194398684, "grad_norm": 0.2618306095519384, "learning_rate": 4.172498263746154e-05, "loss": 0.4378, "step": 1632 }, { "epoch": 2.6902800658978583, "grad_norm": 0.24412817919974486, "learning_rate": 4.167901050605218e-05, "loss": 0.4349, "step": 1633 }, { "epoch": 2.6919275123558486, "grad_norm": 0.2894451726123454, "learning_rate": 4.163303615281853e-05, "loss": 0.438, "step": 1634 }, { "epoch": 2.6935749588138385, "grad_norm": 0.24468968003664415, "learning_rate": 4.158705963859816e-05, "loss": 0.4319, "step": 1635 }, { "epoch": 2.6952224052718288, "grad_norm": 0.26761100246858677, "learning_rate": 4.154108102423152e-05, "loss": 0.4356, "step": 1636 }, { "epoch": 2.6968698517298186, "grad_norm": 0.21714725160916726, "learning_rate": 4.149510037056182e-05, "loss": 0.4235, "step": 1637 }, { "epoch": 2.698517298187809, "grad_norm": 0.25051226055671144, "learning_rate": 4.1449117738434984e-05, "loss": 0.4294, "step": 1638 }, { "epoch": 2.7001647446457993, "grad_norm": 0.2144686564113709, "learning_rate": 4.140313318869955e-05, "loss": 0.4321, "step": 1639 }, { "epoch": 2.701812191103789, "grad_norm": 0.2042294683044893, "learning_rate": 4.135714678220656e-05, "loss": 0.4297, "step": 1640 }, { "epoch": 2.703459637561779, "grad_norm": 0.21873890237637286, "learning_rate": 4.131115857980959e-05, "loss": 0.4325, "step": 1641 }, { "epoch": 2.7051070840197693, "grad_norm": 0.21195801476392476, "learning_rate": 4.126516864236451e-05, "loss": 0.4326, "step": 1642 }, { "epoch": 2.7067545304777596, "grad_norm": 0.251049291061298, "learning_rate": 4.121917703072954e-05, "loss": 0.4345, "step": 1643 }, { "epoch": 2.7084019769357495, "grad_norm": 0.2192126666682618, "learning_rate": 4.1173183805765096e-05, "loss": 0.4284, "step": 1644 }, { "epoch": 2.7100494233937398, "grad_norm": 0.21381677790100057, "learning_rate": 4.112718902833371e-05, "loss": 0.433, "step": 1645 }, { "epoch": 2.71169686985173, "grad_norm": 0.23916483390663434, "learning_rate": 4.108119275930002e-05, "loss": 0.4234, "step": 1646 }, { "epoch": 2.71334431630972, "grad_norm": 0.17411372064521882, "learning_rate": 4.1035195059530576e-05, "loss": 0.4256, "step": 1647 }, { "epoch": 2.71499176276771, "grad_norm": 0.25481145601906147, "learning_rate": 4.098919598989389e-05, "loss": 0.4289, "step": 1648 }, { "epoch": 2.7166392092257, "grad_norm": 0.22541134127359258, "learning_rate": 4.0943195611260206e-05, "loss": 0.4302, "step": 1649 }, { "epoch": 2.7182866556836904, "grad_norm": 0.2206779728440063, "learning_rate": 4.089719398450156e-05, "loss": 0.4338, "step": 1650 }, { "epoch": 2.7199341021416803, "grad_norm": 0.24719686865107138, "learning_rate": 4.085119117049164e-05, "loss": 0.4302, "step": 1651 }, { "epoch": 2.7215815485996706, "grad_norm": 0.1995645022871235, "learning_rate": 4.080518723010565e-05, "loss": 0.4395, "step": 1652 }, { "epoch": 2.7232289950576605, "grad_norm": 0.2313373218192144, "learning_rate": 4.075918222422037e-05, "loss": 0.435, "step": 1653 }, { "epoch": 2.724876441515651, "grad_norm": 0.2577942905744408, "learning_rate": 4.071317621371388e-05, "loss": 0.4325, "step": 1654 }, { "epoch": 2.7265238879736406, "grad_norm": 0.21210261895028037, "learning_rate": 4.066716925946571e-05, "loss": 0.4262, "step": 1655 }, { "epoch": 2.728171334431631, "grad_norm": 0.1977984551558657, "learning_rate": 4.062116142235655e-05, "loss": 0.4368, "step": 1656 }, { "epoch": 2.7298187808896213, "grad_norm": 0.25147176967404045, "learning_rate": 4.0575152763268286e-05, "loss": 0.4354, "step": 1657 }, { "epoch": 2.731466227347611, "grad_norm": 0.21502246744199574, "learning_rate": 4.0529143343083915e-05, "loss": 0.4241, "step": 1658 }, { "epoch": 2.7331136738056014, "grad_norm": 0.191262692790558, "learning_rate": 4.04831332226874e-05, "loss": 0.4301, "step": 1659 }, { "epoch": 2.7347611202635913, "grad_norm": 0.20429205299930017, "learning_rate": 4.043712246296366e-05, "loss": 0.4331, "step": 1660 }, { "epoch": 2.7364085667215816, "grad_norm": 0.24703098447806116, "learning_rate": 4.039111112479846e-05, "loss": 0.441, "step": 1661 }, { "epoch": 2.7380560131795715, "grad_norm": 0.2550394717762531, "learning_rate": 4.034509926907829e-05, "loss": 0.4265, "step": 1662 }, { "epoch": 2.739703459637562, "grad_norm": 0.16045834390936833, "learning_rate": 4.029908695669039e-05, "loss": 0.436, "step": 1663 }, { "epoch": 2.741350906095552, "grad_norm": 0.22967062842505442, "learning_rate": 4.025307424852254e-05, "loss": 0.4304, "step": 1664 }, { "epoch": 2.742998352553542, "grad_norm": 0.21712859953832547, "learning_rate": 4.0207061205463105e-05, "loss": 0.438, "step": 1665 }, { "epoch": 2.7446457990115323, "grad_norm": 0.20289415631489208, "learning_rate": 4.016104788840084e-05, "loss": 0.4293, "step": 1666 }, { "epoch": 2.746293245469522, "grad_norm": 0.17856965564901783, "learning_rate": 4.011503435822491e-05, "loss": 0.4325, "step": 1667 }, { "epoch": 2.7479406919275124, "grad_norm": 0.18498151690942172, "learning_rate": 4.0069020675824705e-05, "loss": 0.421, "step": 1668 }, { "epoch": 2.7495881383855023, "grad_norm": 0.2066810305069699, "learning_rate": 4.0023006902089875e-05, "loss": 0.4305, "step": 1669 }, { "epoch": 2.7512355848434926, "grad_norm": 0.2039664082353199, "learning_rate": 3.9976993097910145e-05, "loss": 0.4316, "step": 1670 }, { "epoch": 2.752883031301483, "grad_norm": 0.21116174078113958, "learning_rate": 3.99309793241753e-05, "loss": 0.4301, "step": 1671 }, { "epoch": 2.754530477759473, "grad_norm": 0.24630457803151723, "learning_rate": 3.9884965641775105e-05, "loss": 0.4398, "step": 1672 }, { "epoch": 2.7561779242174627, "grad_norm": 0.26957469322712874, "learning_rate": 3.9838952111599164e-05, "loss": 0.4338, "step": 1673 }, { "epoch": 2.757825370675453, "grad_norm": 0.2074628866516185, "learning_rate": 3.97929387945369e-05, "loss": 0.4304, "step": 1674 }, { "epoch": 2.7594728171334433, "grad_norm": 0.25359926104564595, "learning_rate": 3.9746925751477465e-05, "loss": 0.434, "step": 1675 }, { "epoch": 2.761120263591433, "grad_norm": 0.2700929364421924, "learning_rate": 3.970091304330963e-05, "loss": 0.433, "step": 1676 }, { "epoch": 2.7627677100494235, "grad_norm": 0.20561091432663758, "learning_rate": 3.9654900730921724e-05, "loss": 0.4449, "step": 1677 }, { "epoch": 2.7644151565074138, "grad_norm": 0.22799178045501683, "learning_rate": 3.9608888875201556e-05, "loss": 0.4378, "step": 1678 }, { "epoch": 2.7660626029654036, "grad_norm": 0.22333778725304027, "learning_rate": 3.956287753703635e-05, "loss": 0.4368, "step": 1679 }, { "epoch": 2.7677100494233935, "grad_norm": 0.1871805175204774, "learning_rate": 3.951686677731261e-05, "loss": 0.4249, "step": 1680 }, { "epoch": 2.769357495881384, "grad_norm": 0.20464131070149003, "learning_rate": 3.94708566569161e-05, "loss": 0.4299, "step": 1681 }, { "epoch": 2.771004942339374, "grad_norm": 0.16516512923120236, "learning_rate": 3.942484723673172e-05, "loss": 0.433, "step": 1682 }, { "epoch": 2.772652388797364, "grad_norm": 0.19929965633569605, "learning_rate": 3.9378838577643456e-05, "loss": 0.4306, "step": 1683 }, { "epoch": 2.7742998352553543, "grad_norm": 0.19701216502004362, "learning_rate": 3.93328307405343e-05, "loss": 0.4261, "step": 1684 }, { "epoch": 2.775947281713344, "grad_norm": 0.22623603791272098, "learning_rate": 3.928682378628613e-05, "loss": 0.4424, "step": 1685 }, { "epoch": 2.7775947281713345, "grad_norm": 0.265849659919266, "learning_rate": 3.9240817775779653e-05, "loss": 0.4343, "step": 1686 }, { "epoch": 2.7792421746293243, "grad_norm": 0.17590424113526684, "learning_rate": 3.919481276989436e-05, "loss": 0.4277, "step": 1687 }, { "epoch": 2.7808896210873146, "grad_norm": 0.19096588435872144, "learning_rate": 3.914880882950837e-05, "loss": 0.427, "step": 1688 }, { "epoch": 2.782537067545305, "grad_norm": 0.20241514969676921, "learning_rate": 3.9102806015498444e-05, "loss": 0.4327, "step": 1689 }, { "epoch": 2.784184514003295, "grad_norm": 0.22313461110292143, "learning_rate": 3.905680438873981e-05, "loss": 0.4321, "step": 1690 }, { "epoch": 2.785831960461285, "grad_norm": 0.19117195825770006, "learning_rate": 3.9010804010106126e-05, "loss": 0.436, "step": 1691 }, { "epoch": 2.787479406919275, "grad_norm": 0.1923120172044873, "learning_rate": 3.896480494046943e-05, "loss": 0.4319, "step": 1692 }, { "epoch": 2.7891268533772653, "grad_norm": 0.17721778502710162, "learning_rate": 3.891880724069999e-05, "loss": 0.4313, "step": 1693 }, { "epoch": 2.790774299835255, "grad_norm": 0.17920061299522458, "learning_rate": 3.8872810971666296e-05, "loss": 0.4328, "step": 1694 }, { "epoch": 2.7924217462932455, "grad_norm": 0.21287822494713815, "learning_rate": 3.882681619423492e-05, "loss": 0.4304, "step": 1695 }, { "epoch": 2.7940691927512358, "grad_norm": 0.2018498266602819, "learning_rate": 3.878082296927047e-05, "loss": 0.4256, "step": 1696 }, { "epoch": 2.7957166392092256, "grad_norm": 0.2915715500043882, "learning_rate": 3.87348313576355e-05, "loss": 0.4272, "step": 1697 }, { "epoch": 2.797364085667216, "grad_norm": 0.2700379554653843, "learning_rate": 3.868884142019042e-05, "loss": 0.4407, "step": 1698 }, { "epoch": 2.799011532125206, "grad_norm": 0.25833348192043876, "learning_rate": 3.864285321779345e-05, "loss": 0.4313, "step": 1699 }, { "epoch": 2.800658978583196, "grad_norm": 0.18513434939525214, "learning_rate": 3.8596866811300466e-05, "loss": 0.4261, "step": 1700 }, { "epoch": 2.802306425041186, "grad_norm": 0.26644048362408823, "learning_rate": 3.855088226156503e-05, "loss": 0.4309, "step": 1701 }, { "epoch": 2.8039538714991763, "grad_norm": 0.36314312551796574, "learning_rate": 3.8504899629438195e-05, "loss": 0.4313, "step": 1702 }, { "epoch": 2.8056013179571666, "grad_norm": 0.3299335107650698, "learning_rate": 3.845891897576849e-05, "loss": 0.4394, "step": 1703 }, { "epoch": 2.8072487644151565, "grad_norm": 0.2350239314104466, "learning_rate": 3.841294036140185e-05, "loss": 0.4301, "step": 1704 }, { "epoch": 2.808896210873147, "grad_norm": 0.261156386759192, "learning_rate": 3.8366963847181475e-05, "loss": 0.4335, "step": 1705 }, { "epoch": 2.8105436573311366, "grad_norm": 0.3286860525889842, "learning_rate": 3.832098949394783e-05, "loss": 0.4285, "step": 1706 }, { "epoch": 2.812191103789127, "grad_norm": 0.2944591906116293, "learning_rate": 3.827501736253847e-05, "loss": 0.4333, "step": 1707 }, { "epoch": 2.813838550247117, "grad_norm": 0.2312049071006984, "learning_rate": 3.822904751378803e-05, "loss": 0.4343, "step": 1708 }, { "epoch": 2.815485996705107, "grad_norm": 0.21822501120224114, "learning_rate": 3.818308000852816e-05, "loss": 0.4355, "step": 1709 }, { "epoch": 2.8171334431630974, "grad_norm": 0.25551066022285274, "learning_rate": 3.813711490758734e-05, "loss": 0.4303, "step": 1710 }, { "epoch": 2.8187808896210873, "grad_norm": 0.2398041847394207, "learning_rate": 3.809115227179094e-05, "loss": 0.4268, "step": 1711 }, { "epoch": 2.820428336079077, "grad_norm": 0.2513795379689032, "learning_rate": 3.8045192161961e-05, "loss": 0.4312, "step": 1712 }, { "epoch": 2.8220757825370675, "grad_norm": 0.19640041046330683, "learning_rate": 3.799923463891628e-05, "loss": 0.4398, "step": 1713 }, { "epoch": 2.823723228995058, "grad_norm": 0.2623592677649679, "learning_rate": 3.795327976347208e-05, "loss": 0.4291, "step": 1714 }, { "epoch": 2.8253706754530477, "grad_norm": 0.2111915292698934, "learning_rate": 3.790732759644019e-05, "loss": 0.4339, "step": 1715 }, { "epoch": 2.827018121911038, "grad_norm": 0.20908007113946278, "learning_rate": 3.786137819862885e-05, "loss": 0.4278, "step": 1716 }, { "epoch": 2.8286655683690283, "grad_norm": 0.26711523086011585, "learning_rate": 3.7815431630842596e-05, "loss": 0.4298, "step": 1717 }, { "epoch": 2.830313014827018, "grad_norm": 0.30968909575432607, "learning_rate": 3.7769487953882267e-05, "loss": 0.434, "step": 1718 }, { "epoch": 2.831960461285008, "grad_norm": 0.2726814587277003, "learning_rate": 3.7723547228544825e-05, "loss": 0.4417, "step": 1719 }, { "epoch": 2.8336079077429983, "grad_norm": 0.19128325305831254, "learning_rate": 3.767760951562335e-05, "loss": 0.4285, "step": 1720 }, { "epoch": 2.8352553542009886, "grad_norm": 0.251264122299503, "learning_rate": 3.763167487590695e-05, "loss": 0.435, "step": 1721 }, { "epoch": 2.8369028006589785, "grad_norm": 0.28636951633427116, "learning_rate": 3.758574337018063e-05, "loss": 0.4253, "step": 1722 }, { "epoch": 2.838550247116969, "grad_norm": 0.20700033704659102, "learning_rate": 3.7539815059225306e-05, "loss": 0.4269, "step": 1723 }, { "epoch": 2.8401976935749587, "grad_norm": 0.23054244622887318, "learning_rate": 3.7493890003817594e-05, "loss": 0.4315, "step": 1724 }, { "epoch": 2.841845140032949, "grad_norm": 0.2027954055724543, "learning_rate": 3.744796826472985e-05, "loss": 0.431, "step": 1725 }, { "epoch": 2.843492586490939, "grad_norm": 0.20086885288465134, "learning_rate": 3.740204990273004e-05, "loss": 0.4316, "step": 1726 }, { "epoch": 2.845140032948929, "grad_norm": 0.18717280370776698, "learning_rate": 3.735613497858162e-05, "loss": 0.4382, "step": 1727 }, { "epoch": 2.8467874794069195, "grad_norm": 0.18666283662826866, "learning_rate": 3.731022355304357e-05, "loss": 0.4292, "step": 1728 }, { "epoch": 2.8484349258649093, "grad_norm": 0.1716345154779152, "learning_rate": 3.726431568687016e-05, "loss": 0.4357, "step": 1729 }, { "epoch": 2.8500823723228996, "grad_norm": 0.1939388946831365, "learning_rate": 3.7218411440811016e-05, "loss": 0.4245, "step": 1730 }, { "epoch": 2.8517298187808895, "grad_norm": 0.2059812929619565, "learning_rate": 3.717251087561093e-05, "loss": 0.4383, "step": 1731 }, { "epoch": 2.85337726523888, "grad_norm": 0.26646003430007587, "learning_rate": 3.712661405200983e-05, "loss": 0.436, "step": 1732 }, { "epoch": 2.8550247116968697, "grad_norm": 0.17969648455160023, "learning_rate": 3.708072103074272e-05, "loss": 0.4265, "step": 1733 }, { "epoch": 2.85667215815486, "grad_norm": 0.20676334188926787, "learning_rate": 3.703483187253954e-05, "loss": 0.4334, "step": 1734 }, { "epoch": 2.8583196046128503, "grad_norm": 0.20478214969329203, "learning_rate": 3.698894663812514e-05, "loss": 0.4251, "step": 1735 }, { "epoch": 2.85996705107084, "grad_norm": 0.2165527761598463, "learning_rate": 3.694306538821916e-05, "loss": 0.4303, "step": 1736 }, { "epoch": 2.8616144975288305, "grad_norm": 0.17142444482593164, "learning_rate": 3.689718818353598e-05, "loss": 0.4275, "step": 1737 }, { "epoch": 2.8632619439868203, "grad_norm": 0.21631869726727596, "learning_rate": 3.685131508478462e-05, "loss": 0.4318, "step": 1738 }, { "epoch": 2.8649093904448106, "grad_norm": 0.20036086871528552, "learning_rate": 3.680544615266868e-05, "loss": 0.4342, "step": 1739 }, { "epoch": 2.8665568369028005, "grad_norm": 0.1763341514534332, "learning_rate": 3.675958144788623e-05, "loss": 0.4307, "step": 1740 }, { "epoch": 2.868204283360791, "grad_norm": 0.18588492501253773, "learning_rate": 3.671372103112975e-05, "loss": 0.4279, "step": 1741 }, { "epoch": 2.869851729818781, "grad_norm": 0.1799405546657709, "learning_rate": 3.6667864963086046e-05, "loss": 0.439, "step": 1742 }, { "epoch": 2.871499176276771, "grad_norm": 0.1973434975458422, "learning_rate": 3.662201330443618e-05, "loss": 0.4409, "step": 1743 }, { "epoch": 2.873146622734761, "grad_norm": 0.19039408804617708, "learning_rate": 3.657616611585536e-05, "loss": 0.4422, "step": 1744 }, { "epoch": 2.874794069192751, "grad_norm": 0.1869581807706466, "learning_rate": 3.653032345801291e-05, "loss": 0.433, "step": 1745 }, { "epoch": 2.8764415156507415, "grad_norm": 0.18294642400000252, "learning_rate": 3.64844853915721e-05, "loss": 0.4336, "step": 1746 }, { "epoch": 2.8780889621087313, "grad_norm": 0.18437380332996967, "learning_rate": 3.64386519771902e-05, "loss": 0.4257, "step": 1747 }, { "epoch": 2.8797364085667216, "grad_norm": 0.20522119242126707, "learning_rate": 3.639282327551827e-05, "loss": 0.4429, "step": 1748 }, { "epoch": 2.881383855024712, "grad_norm": 0.21781956743386324, "learning_rate": 3.6346999347201136e-05, "loss": 0.4373, "step": 1749 }, { "epoch": 2.883031301482702, "grad_norm": 0.21058289078625106, "learning_rate": 3.6301180252877345e-05, "loss": 0.429, "step": 1750 }, { "epoch": 2.8846787479406917, "grad_norm": 0.2367029073042238, "learning_rate": 3.6255366053179004e-05, "loss": 0.4291, "step": 1751 }, { "epoch": 2.886326194398682, "grad_norm": 0.17863236491457793, "learning_rate": 3.6209556808731784e-05, "loss": 0.4306, "step": 1752 }, { "epoch": 2.8879736408566723, "grad_norm": 0.18172313562621983, "learning_rate": 3.616375258015476e-05, "loss": 0.433, "step": 1753 }, { "epoch": 2.889621087314662, "grad_norm": 0.17011977175219378, "learning_rate": 3.611795342806038e-05, "loss": 0.4329, "step": 1754 }, { "epoch": 2.8912685337726525, "grad_norm": 0.1910068806600811, "learning_rate": 3.60721594130544e-05, "loss": 0.4287, "step": 1755 }, { "epoch": 2.892915980230643, "grad_norm": 0.20178810231475008, "learning_rate": 3.6026370595735744e-05, "loss": 0.4312, "step": 1756 }, { "epoch": 2.8945634266886326, "grad_norm": 0.16859712406745847, "learning_rate": 3.598058703669648e-05, "loss": 0.4292, "step": 1757 }, { "epoch": 2.8962108731466225, "grad_norm": 0.20223655947900995, "learning_rate": 3.593480879652171e-05, "loss": 0.429, "step": 1758 }, { "epoch": 2.897858319604613, "grad_norm": 0.2526490197594635, "learning_rate": 3.5889035935789495e-05, "loss": 0.4369, "step": 1759 }, { "epoch": 2.899505766062603, "grad_norm": 0.18566804148865199, "learning_rate": 3.584326851507079e-05, "loss": 0.4312, "step": 1760 }, { "epoch": 2.901153212520593, "grad_norm": 0.22010980845858777, "learning_rate": 3.5797506594929325e-05, "loss": 0.4333, "step": 1761 }, { "epoch": 2.9028006589785833, "grad_norm": 0.17978040303285858, "learning_rate": 3.575175023592159e-05, "loss": 0.4383, "step": 1762 }, { "epoch": 2.904448105436573, "grad_norm": 0.20178681928012349, "learning_rate": 3.570599949859666e-05, "loss": 0.4421, "step": 1763 }, { "epoch": 2.9060955518945635, "grad_norm": 0.20251290720786513, "learning_rate": 3.566025444349624e-05, "loss": 0.4336, "step": 1764 }, { "epoch": 2.9077429983525533, "grad_norm": 0.19377126067437572, "learning_rate": 3.561451513115447e-05, "loss": 0.4261, "step": 1765 }, { "epoch": 2.9093904448105437, "grad_norm": 0.16543150727424819, "learning_rate": 3.5568781622097885e-05, "loss": 0.4323, "step": 1766 }, { "epoch": 2.911037891268534, "grad_norm": 0.16886602041600157, "learning_rate": 3.552305397684538e-05, "loss": 0.4315, "step": 1767 }, { "epoch": 2.912685337726524, "grad_norm": 0.21969143330936278, "learning_rate": 3.547733225590805e-05, "loss": 0.4362, "step": 1768 }, { "epoch": 2.914332784184514, "grad_norm": 0.20445597514816427, "learning_rate": 3.543161651978918e-05, "loss": 0.4294, "step": 1769 }, { "epoch": 2.915980230642504, "grad_norm": 0.14627504398672006, "learning_rate": 3.538590682898412e-05, "loss": 0.427, "step": 1770 }, { "epoch": 2.9176276771004943, "grad_norm": 0.1815070628372929, "learning_rate": 3.53402032439802e-05, "loss": 0.4244, "step": 1771 }, { "epoch": 2.919275123558484, "grad_norm": 0.21293234246183926, "learning_rate": 3.529450582525673e-05, "loss": 0.4267, "step": 1772 }, { "epoch": 2.9209225700164745, "grad_norm": 0.2067105492752039, "learning_rate": 3.5248814633284805e-05, "loss": 0.4322, "step": 1773 }, { "epoch": 2.922570016474465, "grad_norm": 0.18476424909723702, "learning_rate": 3.52031297285273e-05, "loss": 0.4274, "step": 1774 }, { "epoch": 2.9242174629324547, "grad_norm": 0.15743085306817872, "learning_rate": 3.515745117143877e-05, "loss": 0.4409, "step": 1775 }, { "epoch": 2.925864909390445, "grad_norm": 0.2158417474929842, "learning_rate": 3.511177902246537e-05, "loss": 0.4319, "step": 1776 }, { "epoch": 2.927512355848435, "grad_norm": 0.22634803539423703, "learning_rate": 3.5066113342044774e-05, "loss": 0.4385, "step": 1777 }, { "epoch": 2.929159802306425, "grad_norm": 0.19895137344105251, "learning_rate": 3.5020454190606085e-05, "loss": 0.4344, "step": 1778 }, { "epoch": 2.930807248764415, "grad_norm": 0.1650525714911717, "learning_rate": 3.49748016285698e-05, "loss": 0.4334, "step": 1779 }, { "epoch": 2.9324546952224053, "grad_norm": 0.20697484560524668, "learning_rate": 3.492915571634766e-05, "loss": 0.4326, "step": 1780 }, { "epoch": 2.9341021416803956, "grad_norm": 0.20578066182199095, "learning_rate": 3.488351651434262e-05, "loss": 0.4394, "step": 1781 }, { "epoch": 2.9357495881383855, "grad_norm": 0.18296165917763263, "learning_rate": 3.483788408294876e-05, "loss": 0.4344, "step": 1782 }, { "epoch": 2.9373970345963754, "grad_norm": 0.19590169974967228, "learning_rate": 3.4792258482551195e-05, "loss": 0.4289, "step": 1783 }, { "epoch": 2.9390444810543657, "grad_norm": 0.1995861388646, "learning_rate": 3.4746639773526e-05, "loss": 0.4396, "step": 1784 }, { "epoch": 2.940691927512356, "grad_norm": 0.16247194269265586, "learning_rate": 3.470102801624014e-05, "loss": 0.4308, "step": 1785 }, { "epoch": 2.942339373970346, "grad_norm": 0.23345370918171887, "learning_rate": 3.465542327105136e-05, "loss": 0.4257, "step": 1786 }, { "epoch": 2.943986820428336, "grad_norm": 0.17565611683288643, "learning_rate": 3.460982559830815e-05, "loss": 0.4331, "step": 1787 }, { "epoch": 2.9456342668863265, "grad_norm": 0.19819330932823603, "learning_rate": 3.456423505834962e-05, "loss": 0.4273, "step": 1788 }, { "epoch": 2.9472817133443163, "grad_norm": 0.15799755882787, "learning_rate": 3.451865171150547e-05, "loss": 0.4307, "step": 1789 }, { "epoch": 2.948929159802306, "grad_norm": 0.21171692005585524, "learning_rate": 3.4473075618095844e-05, "loss": 0.4364, "step": 1790 }, { "epoch": 2.9505766062602965, "grad_norm": 0.15388811715298015, "learning_rate": 3.442750683843132e-05, "loss": 0.4287, "step": 1791 }, { "epoch": 2.952224052718287, "grad_norm": 0.19303111585265673, "learning_rate": 3.4381945432812775e-05, "loss": 0.4302, "step": 1792 }, { "epoch": 2.9538714991762767, "grad_norm": 0.15467477932313695, "learning_rate": 3.433639146153134e-05, "loss": 0.4274, "step": 1793 }, { "epoch": 2.955518945634267, "grad_norm": 0.17399690113728325, "learning_rate": 3.429084498486831e-05, "loss": 0.428, "step": 1794 }, { "epoch": 2.957166392092257, "grad_norm": 0.20220760617431358, "learning_rate": 3.424530606309504e-05, "loss": 0.4334, "step": 1795 }, { "epoch": 2.958813838550247, "grad_norm": 0.19981860437260185, "learning_rate": 3.4199774756472934e-05, "loss": 0.4301, "step": 1796 }, { "epoch": 2.960461285008237, "grad_norm": 0.16460390838837488, "learning_rate": 3.415425112525327e-05, "loss": 0.4219, "step": 1797 }, { "epoch": 2.9621087314662273, "grad_norm": 0.19387878315333518, "learning_rate": 3.410873522967721e-05, "loss": 0.4298, "step": 1798 }, { "epoch": 2.9637561779242176, "grad_norm": 0.1826646056783254, "learning_rate": 3.406322712997563e-05, "loss": 0.4194, "step": 1799 }, { "epoch": 2.9654036243822075, "grad_norm": 0.1726369066758185, "learning_rate": 3.401772688636913e-05, "loss": 0.4316, "step": 1800 }, { "epoch": 2.967051070840198, "grad_norm": 0.1689170572694944, "learning_rate": 3.397223455906792e-05, "loss": 0.4288, "step": 1801 }, { "epoch": 2.9686985172981877, "grad_norm": 0.2088279976264514, "learning_rate": 3.392675020827169e-05, "loss": 0.4388, "step": 1802 }, { "epoch": 2.970345963756178, "grad_norm": 0.20860608463775895, "learning_rate": 3.388127389416963e-05, "loss": 0.4317, "step": 1803 }, { "epoch": 2.971993410214168, "grad_norm": 0.18031643172061643, "learning_rate": 3.383580567694025e-05, "loss": 0.433, "step": 1804 }, { "epoch": 2.973640856672158, "grad_norm": 0.19130591751194648, "learning_rate": 3.379034561675136e-05, "loss": 0.4372, "step": 1805 }, { "epoch": 2.9752883031301485, "grad_norm": 0.19138049961761866, "learning_rate": 3.3744893773759986e-05, "loss": 0.4367, "step": 1806 }, { "epoch": 2.9769357495881383, "grad_norm": 0.19848342298019522, "learning_rate": 3.369945020811227e-05, "loss": 0.4283, "step": 1807 }, { "epoch": 2.9785831960461286, "grad_norm": 0.156175495552269, "learning_rate": 3.365401497994341e-05, "loss": 0.4323, "step": 1808 }, { "epoch": 2.9802306425041185, "grad_norm": 0.17265986130694339, "learning_rate": 3.360858814937756e-05, "loss": 0.4345, "step": 1809 }, { "epoch": 2.981878088962109, "grad_norm": 0.17219371921003074, "learning_rate": 3.356316977652776e-05, "loss": 0.4325, "step": 1810 }, { "epoch": 2.9835255354200987, "grad_norm": 0.2186568252098148, "learning_rate": 3.351775992149588e-05, "loss": 0.4289, "step": 1811 }, { "epoch": 2.985172981878089, "grad_norm": 0.15642529536503375, "learning_rate": 3.3472358644372475e-05, "loss": 0.4357, "step": 1812 }, { "epoch": 2.9868204283360793, "grad_norm": 0.18985116647950995, "learning_rate": 3.34269660052368e-05, "loss": 0.4267, "step": 1813 }, { "epoch": 2.988467874794069, "grad_norm": 0.210754892723987, "learning_rate": 3.3381582064156644e-05, "loss": 0.432, "step": 1814 }, { "epoch": 2.990115321252059, "grad_norm": 0.19605873480041255, "learning_rate": 3.333620688118831e-05, "loss": 0.4257, "step": 1815 }, { "epoch": 2.9917627677100493, "grad_norm": 0.21601134981101971, "learning_rate": 3.3290840516376485e-05, "loss": 0.4305, "step": 1816 }, { "epoch": 2.9934102141680397, "grad_norm": 0.17179687866575832, "learning_rate": 3.3245483029754206e-05, "loss": 0.4251, "step": 1817 }, { "epoch": 2.9950576606260295, "grad_norm": 0.1892060703810964, "learning_rate": 3.320013448134276e-05, "loss": 0.4338, "step": 1818 }, { "epoch": 2.99670510708402, "grad_norm": 0.17416307861709238, "learning_rate": 3.315479493115159e-05, "loss": 0.4299, "step": 1819 }, { "epoch": 2.99835255354201, "grad_norm": 0.1698205945905789, "learning_rate": 3.310946443917827e-05, "loss": 0.4288, "step": 1820 }, { "epoch": 3.0, "grad_norm": 0.20033488597647114, "learning_rate": 3.306414306540834e-05, "loss": 0.4159, "step": 1821 }, { "epoch": 3.0016474464579903, "grad_norm": 0.19340975893604564, "learning_rate": 3.30188308698153e-05, "loss": 0.4075, "step": 1822 }, { "epoch": 3.00329489291598, "grad_norm": 0.2022227608064696, "learning_rate": 3.2973527912360505e-05, "loss": 0.4068, "step": 1823 }, { "epoch": 3.0049423393739705, "grad_norm": 0.22469487821120326, "learning_rate": 3.292823425299307e-05, "loss": 0.404, "step": 1824 }, { "epoch": 3.0065897858319603, "grad_norm": 0.21427905561848276, "learning_rate": 3.288294995164983e-05, "loss": 0.4094, "step": 1825 }, { "epoch": 3.0082372322899507, "grad_norm": 0.1994674454958403, "learning_rate": 3.283767506825521e-05, "loss": 0.4127, "step": 1826 }, { "epoch": 3.0098846787479405, "grad_norm": 0.21589451645191587, "learning_rate": 3.279240966272119e-05, "loss": 0.4043, "step": 1827 }, { "epoch": 3.011532125205931, "grad_norm": 0.18037088227490225, "learning_rate": 3.274715379494722e-05, "loss": 0.4043, "step": 1828 }, { "epoch": 3.013179571663921, "grad_norm": 0.24364377521082184, "learning_rate": 3.270190752482008e-05, "loss": 0.4099, "step": 1829 }, { "epoch": 3.014827018121911, "grad_norm": 0.2047849149624791, "learning_rate": 3.26566709122139e-05, "loss": 0.4038, "step": 1830 }, { "epoch": 3.0164744645799013, "grad_norm": 0.2267825907675871, "learning_rate": 3.2611444016990015e-05, "loss": 0.4013, "step": 1831 }, { "epoch": 3.018121911037891, "grad_norm": 0.20064751809124687, "learning_rate": 3.256622689899689e-05, "loss": 0.409, "step": 1832 }, { "epoch": 3.0197693574958815, "grad_norm": 0.2223838023463452, "learning_rate": 3.252101961807007e-05, "loss": 0.4038, "step": 1833 }, { "epoch": 3.0214168039538714, "grad_norm": 0.19871147573984285, "learning_rate": 3.247582223403205e-05, "loss": 0.4029, "step": 1834 }, { "epoch": 3.0230642504118617, "grad_norm": 0.1802972281637381, "learning_rate": 3.243063480669228e-05, "loss": 0.4071, "step": 1835 }, { "epoch": 3.0247116968698515, "grad_norm": 0.21653587187263224, "learning_rate": 3.2385457395846976e-05, "loss": 0.4024, "step": 1836 }, { "epoch": 3.026359143327842, "grad_norm": 0.15960498893530237, "learning_rate": 3.2340290061279146e-05, "loss": 0.3985, "step": 1837 }, { "epoch": 3.028006589785832, "grad_norm": 0.21777319010783436, "learning_rate": 3.2295132862758455e-05, "loss": 0.4047, "step": 1838 }, { "epoch": 3.029654036243822, "grad_norm": 0.20331284202643718, "learning_rate": 3.2249985860041114e-05, "loss": 0.4089, "step": 1839 }, { "epoch": 3.0313014827018123, "grad_norm": 0.23052181332701496, "learning_rate": 3.220484911286991e-05, "loss": 0.4025, "step": 1840 }, { "epoch": 3.032948929159802, "grad_norm": 0.21359673282348296, "learning_rate": 3.2159722680974005e-05, "loss": 0.4026, "step": 1841 }, { "epoch": 3.0345963756177925, "grad_norm": 0.2099600616879522, "learning_rate": 3.2114606624068934e-05, "loss": 0.4021, "step": 1842 }, { "epoch": 3.0362438220757824, "grad_norm": 0.18581181132974084, "learning_rate": 3.20695010018565e-05, "loss": 0.4055, "step": 1843 }, { "epoch": 3.0378912685337727, "grad_norm": 0.1851309622426212, "learning_rate": 3.2024405874024674e-05, "loss": 0.4032, "step": 1844 }, { "epoch": 3.039538714991763, "grad_norm": 0.23011445511097048, "learning_rate": 3.19793213002476e-05, "loss": 0.4063, "step": 1845 }, { "epoch": 3.041186161449753, "grad_norm": 0.17549719880077663, "learning_rate": 3.1934247340185384e-05, "loss": 0.4104, "step": 1846 }, { "epoch": 3.042833607907743, "grad_norm": 0.2100310155661038, "learning_rate": 3.188918405348415e-05, "loss": 0.4003, "step": 1847 }, { "epoch": 3.044481054365733, "grad_norm": 0.1859707177844569, "learning_rate": 3.184413149977584e-05, "loss": 0.4123, "step": 1848 }, { "epoch": 3.0461285008237233, "grad_norm": 0.1680826434213319, "learning_rate": 3.179908973867824e-05, "loss": 0.399, "step": 1849 }, { "epoch": 3.047775947281713, "grad_norm": 0.19305108125462495, "learning_rate": 3.175405882979483e-05, "loss": 0.4066, "step": 1850 }, { "epoch": 3.0494233937397035, "grad_norm": 0.16722105217873973, "learning_rate": 3.170903883271473e-05, "loss": 0.4014, "step": 1851 }, { "epoch": 3.0510708401976934, "grad_norm": 0.16524700108460635, "learning_rate": 3.166402980701264e-05, "loss": 0.4071, "step": 1852 }, { "epoch": 3.0527182866556837, "grad_norm": 0.18152279758502918, "learning_rate": 3.1619031812248714e-05, "loss": 0.4018, "step": 1853 }, { "epoch": 3.054365733113674, "grad_norm": 0.1852007229418414, "learning_rate": 3.157404490796853e-05, "loss": 0.3955, "step": 1854 }, { "epoch": 3.056013179571664, "grad_norm": 0.1708361811087, "learning_rate": 3.152906915370298e-05, "loss": 0.4027, "step": 1855 }, { "epoch": 3.057660626029654, "grad_norm": 0.16765094205462747, "learning_rate": 3.1484104608968195e-05, "loss": 0.4073, "step": 1856 }, { "epoch": 3.059308072487644, "grad_norm": 0.1849115946163363, "learning_rate": 3.14391513332655e-05, "loss": 0.4, "step": 1857 }, { "epoch": 3.0609555189456343, "grad_norm": 0.17439958487531051, "learning_rate": 3.1394209386081266e-05, "loss": 0.4, "step": 1858 }, { "epoch": 3.062602965403624, "grad_norm": 0.17637387693781878, "learning_rate": 3.1349278826886914e-05, "loss": 0.4061, "step": 1859 }, { "epoch": 3.0642504118616145, "grad_norm": 0.16770357530920743, "learning_rate": 3.130435971513877e-05, "loss": 0.3986, "step": 1860 }, { "epoch": 3.065897858319605, "grad_norm": 0.17505057249452796, "learning_rate": 3.1259452110278014e-05, "loss": 0.4085, "step": 1861 }, { "epoch": 3.0675453047775947, "grad_norm": 0.20478342665429491, "learning_rate": 3.1214556071730616e-05, "loss": 0.4074, "step": 1862 }, { "epoch": 3.069192751235585, "grad_norm": 0.1751068905807977, "learning_rate": 3.116967165890721e-05, "loss": 0.4092, "step": 1863 }, { "epoch": 3.070840197693575, "grad_norm": 0.19224872335141127, "learning_rate": 3.1124798931203084e-05, "loss": 0.4081, "step": 1864 }, { "epoch": 3.072487644151565, "grad_norm": 0.21113784516624884, "learning_rate": 3.107993794799802e-05, "loss": 0.4084, "step": 1865 }, { "epoch": 3.074135090609555, "grad_norm": 0.18610576113331595, "learning_rate": 3.103508876865631e-05, "loss": 0.4086, "step": 1866 }, { "epoch": 3.0757825370675453, "grad_norm": 0.19038271430985174, "learning_rate": 3.0990251452526586e-05, "loss": 0.4059, "step": 1867 }, { "epoch": 3.077429983525535, "grad_norm": 0.15347728627523996, "learning_rate": 3.094542605894177e-05, "loss": 0.3979, "step": 1868 }, { "epoch": 3.0790774299835255, "grad_norm": 0.20422243862922923, "learning_rate": 3.090061264721907e-05, "loss": 0.4072, "step": 1869 }, { "epoch": 3.080724876441516, "grad_norm": 0.16558704216984094, "learning_rate": 3.0855811276659765e-05, "loss": 0.4034, "step": 1870 }, { "epoch": 3.0823723228995057, "grad_norm": 0.2106327620514912, "learning_rate": 3.081102200654925e-05, "loss": 0.4052, "step": 1871 }, { "epoch": 3.084019769357496, "grad_norm": 0.19095047063575846, "learning_rate": 3.07662448961569e-05, "loss": 0.4044, "step": 1872 }, { "epoch": 3.085667215815486, "grad_norm": 0.2523460495225735, "learning_rate": 3.0721480004735964e-05, "loss": 0.4094, "step": 1873 }, { "epoch": 3.087314662273476, "grad_norm": 0.17619050790218893, "learning_rate": 3.067672739152357e-05, "loss": 0.4042, "step": 1874 }, { "epoch": 3.088962108731466, "grad_norm": 0.2458130462108697, "learning_rate": 3.063198711574056e-05, "loss": 0.4082, "step": 1875 }, { "epoch": 3.0906095551894563, "grad_norm": 0.13385089512840367, "learning_rate": 3.058725923659149e-05, "loss": 0.4052, "step": 1876 }, { "epoch": 3.0922570016474467, "grad_norm": 0.1935920407796571, "learning_rate": 3.054254381326446e-05, "loss": 0.4066, "step": 1877 }, { "epoch": 3.0939044481054365, "grad_norm": 0.15703838449318516, "learning_rate": 3.0497840904931126e-05, "loss": 0.4031, "step": 1878 }, { "epoch": 3.095551894563427, "grad_norm": 0.16695787554666414, "learning_rate": 3.0453150570746587e-05, "loss": 0.4013, "step": 1879 }, { "epoch": 3.0971993410214167, "grad_norm": 0.15078922161510955, "learning_rate": 3.0408472869849256e-05, "loss": 0.404, "step": 1880 }, { "epoch": 3.098846787479407, "grad_norm": 0.20339433162567305, "learning_rate": 3.0363807861360897e-05, "loss": 0.4007, "step": 1881 }, { "epoch": 3.100494233937397, "grad_norm": 0.1478644908834625, "learning_rate": 3.0319155604386407e-05, "loss": 0.4092, "step": 1882 }, { "epoch": 3.102141680395387, "grad_norm": 0.19183771881335462, "learning_rate": 3.0274516158013874e-05, "loss": 0.403, "step": 1883 }, { "epoch": 3.1037891268533775, "grad_norm": 0.19250671527822263, "learning_rate": 3.0229889581314385e-05, "loss": 0.4062, "step": 1884 }, { "epoch": 3.1054365733113674, "grad_norm": 0.144906984203384, "learning_rate": 3.0185275933342012e-05, "loss": 0.4056, "step": 1885 }, { "epoch": 3.1070840197693577, "grad_norm": 0.197557323421146, "learning_rate": 3.0140675273133745e-05, "loss": 0.4119, "step": 1886 }, { "epoch": 3.1087314662273475, "grad_norm": 0.17107967275721975, "learning_rate": 3.0096087659709327e-05, "loss": 0.4039, "step": 1887 }, { "epoch": 3.110378912685338, "grad_norm": 0.200520037743449, "learning_rate": 3.0051513152071305e-05, "loss": 0.4061, "step": 1888 }, { "epoch": 3.1120263591433277, "grad_norm": 0.20233305435916332, "learning_rate": 3.0006951809204844e-05, "loss": 0.4075, "step": 1889 }, { "epoch": 3.113673805601318, "grad_norm": 0.16958813340598408, "learning_rate": 2.996240369007768e-05, "loss": 0.4077, "step": 1890 }, { "epoch": 3.115321252059308, "grad_norm": 0.23123909382130725, "learning_rate": 2.9917868853640085e-05, "loss": 0.4091, "step": 1891 }, { "epoch": 3.116968698517298, "grad_norm": 0.1866303943833339, "learning_rate": 2.9873347358824714e-05, "loss": 0.4115, "step": 1892 }, { "epoch": 3.1186161449752885, "grad_norm": 0.1577787710366342, "learning_rate": 2.982883926454661e-05, "loss": 0.4055, "step": 1893 }, { "epoch": 3.1202635914332784, "grad_norm": 0.18161995082640794, "learning_rate": 2.978434462970305e-05, "loss": 0.4044, "step": 1894 }, { "epoch": 3.1219110378912687, "grad_norm": 0.17713254890671898, "learning_rate": 2.9739863513173507e-05, "loss": 0.402, "step": 1895 }, { "epoch": 3.1235584843492585, "grad_norm": 0.15689044757512208, "learning_rate": 2.9695395973819576e-05, "loss": 0.4022, "step": 1896 }, { "epoch": 3.125205930807249, "grad_norm": 0.16866655623947988, "learning_rate": 2.965094207048487e-05, "loss": 0.4012, "step": 1897 }, { "epoch": 3.1268533772652387, "grad_norm": 0.1696830984260373, "learning_rate": 2.9606501861994986e-05, "loss": 0.4019, "step": 1898 }, { "epoch": 3.128500823723229, "grad_norm": 0.16376785406058725, "learning_rate": 2.9562075407157355e-05, "loss": 0.3979, "step": 1899 }, { "epoch": 3.130148270181219, "grad_norm": 0.1604449804417908, "learning_rate": 2.951766276476125e-05, "loss": 0.4034, "step": 1900 }, { "epoch": 3.131795716639209, "grad_norm": 0.17148986733187246, "learning_rate": 2.9473263993577643e-05, "loss": 0.4058, "step": 1901 }, { "epoch": 3.1334431630971995, "grad_norm": 0.1670183680418931, "learning_rate": 2.9428879152359148e-05, "loss": 0.401, "step": 1902 }, { "epoch": 3.1350906095551894, "grad_norm": 0.15396687600363532, "learning_rate": 2.938450829983996e-05, "loss": 0.4086, "step": 1903 }, { "epoch": 3.1367380560131797, "grad_norm": 0.1596345843603042, "learning_rate": 2.9340151494735746e-05, "loss": 0.3995, "step": 1904 }, { "epoch": 3.1383855024711695, "grad_norm": 0.15201637632168732, "learning_rate": 2.929580879574361e-05, "loss": 0.4109, "step": 1905 }, { "epoch": 3.14003294892916, "grad_norm": 0.16024733974251237, "learning_rate": 2.9251480261541966e-05, "loss": 0.408, "step": 1906 }, { "epoch": 3.1416803953871497, "grad_norm": 0.15188391161689127, "learning_rate": 2.920716595079048e-05, "loss": 0.4175, "step": 1907 }, { "epoch": 3.14332784184514, "grad_norm": 0.15983782036109828, "learning_rate": 2.9162865922130027e-05, "loss": 0.4117, "step": 1908 }, { "epoch": 3.1449752883031303, "grad_norm": 0.15365593639258654, "learning_rate": 2.9118580234182548e-05, "loss": 0.4007, "step": 1909 }, { "epoch": 3.14662273476112, "grad_norm": 0.1550462120421241, "learning_rate": 2.9074308945551033e-05, "loss": 0.407, "step": 1910 }, { "epoch": 3.1482701812191105, "grad_norm": 0.15985218041465385, "learning_rate": 2.90300521148194e-05, "loss": 0.4014, "step": 1911 }, { "epoch": 3.1499176276771004, "grad_norm": 0.17424924389696322, "learning_rate": 2.8985809800552432e-05, "loss": 0.4029, "step": 1912 }, { "epoch": 3.1515650741350907, "grad_norm": 0.16526519163584835, "learning_rate": 2.894158206129573e-05, "loss": 0.403, "step": 1913 }, { "epoch": 3.1532125205930805, "grad_norm": 0.1798119280165234, "learning_rate": 2.8897368955575572e-05, "loss": 0.4027, "step": 1914 }, { "epoch": 3.154859967051071, "grad_norm": 0.15961927127474068, "learning_rate": 2.885317054189891e-05, "loss": 0.4018, "step": 1915 }, { "epoch": 3.156507413509061, "grad_norm": 0.15173794612998212, "learning_rate": 2.8808986878753216e-05, "loss": 0.3966, "step": 1916 }, { "epoch": 3.158154859967051, "grad_norm": 0.16239559902759998, "learning_rate": 2.8764818024606472e-05, "loss": 0.4071, "step": 1917 }, { "epoch": 3.1598023064250413, "grad_norm": 0.16636069884383778, "learning_rate": 2.8720664037907044e-05, "loss": 0.4066, "step": 1918 }, { "epoch": 3.161449752883031, "grad_norm": 0.15419986996992976, "learning_rate": 2.8676524977083623e-05, "loss": 0.4013, "step": 1919 }, { "epoch": 3.1630971993410215, "grad_norm": 0.16598127759686795, "learning_rate": 2.863240090054518e-05, "loss": 0.4055, "step": 1920 }, { "epoch": 3.1647446457990114, "grad_norm": 0.16045742700490728, "learning_rate": 2.8588291866680812e-05, "loss": 0.4029, "step": 1921 }, { "epoch": 3.1663920922570017, "grad_norm": 0.1483797120739778, "learning_rate": 2.854419793385975e-05, "loss": 0.3987, "step": 1922 }, { "epoch": 3.168039538714992, "grad_norm": 0.14465732996680028, "learning_rate": 2.850011916043122e-05, "loss": 0.3998, "step": 1923 }, { "epoch": 3.169686985172982, "grad_norm": 0.16743861917684935, "learning_rate": 2.8456055604724374e-05, "loss": 0.4115, "step": 1924 }, { "epoch": 3.171334431630972, "grad_norm": 0.1402389999166826, "learning_rate": 2.8412007325048274e-05, "loss": 0.4057, "step": 1925 }, { "epoch": 3.172981878088962, "grad_norm": 0.1358753532469293, "learning_rate": 2.8367974379691713e-05, "loss": 0.3949, "step": 1926 }, { "epoch": 3.1746293245469523, "grad_norm": 0.1498459343969232, "learning_rate": 2.8323956826923238e-05, "loss": 0.4022, "step": 1927 }, { "epoch": 3.176276771004942, "grad_norm": 0.14104545482365796, "learning_rate": 2.8279954724990998e-05, "loss": 0.4052, "step": 1928 }, { "epoch": 3.1779242174629325, "grad_norm": 0.1579555561887373, "learning_rate": 2.823596813212269e-05, "loss": 0.4047, "step": 1929 }, { "epoch": 3.1795716639209224, "grad_norm": 0.146849009508038, "learning_rate": 2.8191997106525525e-05, "loss": 0.4131, "step": 1930 }, { "epoch": 3.1812191103789127, "grad_norm": 0.16071916317856338, "learning_rate": 2.814804170638607e-05, "loss": 0.4077, "step": 1931 }, { "epoch": 3.182866556836903, "grad_norm": 0.15104018151145468, "learning_rate": 2.810410198987026e-05, "loss": 0.409, "step": 1932 }, { "epoch": 3.184514003294893, "grad_norm": 0.143017715318178, "learning_rate": 2.806017801512323e-05, "loss": 0.4088, "step": 1933 }, { "epoch": 3.186161449752883, "grad_norm": 0.15649812334688284, "learning_rate": 2.8016269840269328e-05, "loss": 0.4051, "step": 1934 }, { "epoch": 3.187808896210873, "grad_norm": 0.14271669231247017, "learning_rate": 2.797237752341195e-05, "loss": 0.4095, "step": 1935 }, { "epoch": 3.1894563426688634, "grad_norm": 0.15855170569389818, "learning_rate": 2.7928501122633537e-05, "loss": 0.413, "step": 1936 }, { "epoch": 3.191103789126853, "grad_norm": 0.14931330168453444, "learning_rate": 2.788464069599547e-05, "loss": 0.4119, "step": 1937 }, { "epoch": 3.1927512355848435, "grad_norm": 0.133762078016491, "learning_rate": 2.784079630153796e-05, "loss": 0.4054, "step": 1938 }, { "epoch": 3.1943986820428334, "grad_norm": 0.15738634965600537, "learning_rate": 2.7796967997280056e-05, "loss": 0.3992, "step": 1939 }, { "epoch": 3.1960461285008237, "grad_norm": 0.14066522087011532, "learning_rate": 2.7753155841219466e-05, "loss": 0.4062, "step": 1940 }, { "epoch": 3.197693574958814, "grad_norm": 0.1505658716023028, "learning_rate": 2.7709359891332544e-05, "loss": 0.4077, "step": 1941 }, { "epoch": 3.199341021416804, "grad_norm": 0.1428940476350134, "learning_rate": 2.766558020557421e-05, "loss": 0.4147, "step": 1942 }, { "epoch": 3.200988467874794, "grad_norm": 0.1328825334876301, "learning_rate": 2.762181684187784e-05, "loss": 0.4039, "step": 1943 }, { "epoch": 3.202635914332784, "grad_norm": 0.15542097113766032, "learning_rate": 2.757806985815524e-05, "loss": 0.4103, "step": 1944 }, { "epoch": 3.2042833607907744, "grad_norm": 0.14531559463411242, "learning_rate": 2.7534339312296526e-05, "loss": 0.4065, "step": 1945 }, { "epoch": 3.2059308072487642, "grad_norm": 0.15914523193874885, "learning_rate": 2.749062526217004e-05, "loss": 0.4039, "step": 1946 }, { "epoch": 3.2075782537067545, "grad_norm": 0.14520060612757796, "learning_rate": 2.744692776562234e-05, "loss": 0.406, "step": 1947 }, { "epoch": 3.209225700164745, "grad_norm": 0.14633802419063913, "learning_rate": 2.740324688047804e-05, "loss": 0.3996, "step": 1948 }, { "epoch": 3.2108731466227347, "grad_norm": 0.16227029690217712, "learning_rate": 2.7359582664539796e-05, "loss": 0.4143, "step": 1949 }, { "epoch": 3.212520593080725, "grad_norm": 0.14209288805985612, "learning_rate": 2.7315935175588185e-05, "loss": 0.4037, "step": 1950 }, { "epoch": 3.214168039538715, "grad_norm": 0.15743655001742196, "learning_rate": 2.727230447138168e-05, "loss": 0.4064, "step": 1951 }, { "epoch": 3.215815485996705, "grad_norm": 0.16405916203563714, "learning_rate": 2.722869060965651e-05, "loss": 0.409, "step": 1952 }, { "epoch": 3.217462932454695, "grad_norm": 0.16408417796700756, "learning_rate": 2.7185093648126622e-05, "loss": 0.4042, "step": 1953 }, { "epoch": 3.2191103789126854, "grad_norm": 0.18054869661044337, "learning_rate": 2.7141513644483636e-05, "loss": 0.4044, "step": 1954 }, { "epoch": 3.2207578253706757, "grad_norm": 0.1706352467575024, "learning_rate": 2.709795065639667e-05, "loss": 0.4102, "step": 1955 }, { "epoch": 3.2224052718286655, "grad_norm": 0.18880444734822027, "learning_rate": 2.7054404741512382e-05, "loss": 0.4001, "step": 1956 }, { "epoch": 3.224052718286656, "grad_norm": 0.2071325798087886, "learning_rate": 2.701087595745481e-05, "loss": 0.4075, "step": 1957 }, { "epoch": 3.2257001647446457, "grad_norm": 0.17635515862100556, "learning_rate": 2.6967364361825313e-05, "loss": 0.4065, "step": 1958 }, { "epoch": 3.227347611202636, "grad_norm": 0.1909904138943248, "learning_rate": 2.6923870012202536e-05, "loss": 0.4009, "step": 1959 }, { "epoch": 3.228995057660626, "grad_norm": 0.17597935206538085, "learning_rate": 2.688039296614227e-05, "loss": 0.4123, "step": 1960 }, { "epoch": 3.230642504118616, "grad_norm": 0.15173900187475542, "learning_rate": 2.6836933281177436e-05, "loss": 0.3984, "step": 1961 }, { "epoch": 3.232289950576606, "grad_norm": 0.17748139662810689, "learning_rate": 2.6793491014817966e-05, "loss": 0.4075, "step": 1962 }, { "epoch": 3.2339373970345964, "grad_norm": 0.14886346825492158, "learning_rate": 2.6750066224550732e-05, "loss": 0.4051, "step": 1963 }, { "epoch": 3.2355848434925867, "grad_norm": 0.16481323259130198, "learning_rate": 2.6706658967839505e-05, "loss": 0.4081, "step": 1964 }, { "epoch": 3.2372322899505765, "grad_norm": 0.184236609111782, "learning_rate": 2.666326930212483e-05, "loss": 0.4067, "step": 1965 }, { "epoch": 3.238879736408567, "grad_norm": 0.15789173340555404, "learning_rate": 2.6619897284824004e-05, "loss": 0.4016, "step": 1966 }, { "epoch": 3.2405271828665567, "grad_norm": 0.1504221316283248, "learning_rate": 2.657654297333093e-05, "loss": 0.4067, "step": 1967 }, { "epoch": 3.242174629324547, "grad_norm": 0.17340685512984605, "learning_rate": 2.653320642501612e-05, "loss": 0.4055, "step": 1968 }, { "epoch": 3.243822075782537, "grad_norm": 0.1418466407935099, "learning_rate": 2.6489887697226554e-05, "loss": 0.4084, "step": 1969 }, { "epoch": 3.245469522240527, "grad_norm": 0.14709562651777963, "learning_rate": 2.6446586847285628e-05, "loss": 0.4045, "step": 1970 }, { "epoch": 3.247116968698517, "grad_norm": 0.14358852233570973, "learning_rate": 2.6403303932493113e-05, "loss": 0.4088, "step": 1971 }, { "epoch": 3.2487644151565074, "grad_norm": 0.15421562633430874, "learning_rate": 2.6360039010125e-05, "loss": 0.4026, "step": 1972 }, { "epoch": 3.2504118616144977, "grad_norm": 0.146565996812908, "learning_rate": 2.631679213743352e-05, "loss": 0.4053, "step": 1973 }, { "epoch": 3.2520593080724876, "grad_norm": 0.13148234450398222, "learning_rate": 2.627356337164698e-05, "loss": 0.3994, "step": 1974 }, { "epoch": 3.253706754530478, "grad_norm": 0.14155667495347612, "learning_rate": 2.6230352769969738e-05, "loss": 0.4095, "step": 1975 }, { "epoch": 3.2553542009884677, "grad_norm": 0.17373359304055513, "learning_rate": 2.6187160389582124e-05, "loss": 0.3967, "step": 1976 }, { "epoch": 3.257001647446458, "grad_norm": 0.15136787985965153, "learning_rate": 2.6143986287640345e-05, "loss": 0.4126, "step": 1977 }, { "epoch": 3.258649093904448, "grad_norm": 0.1509179231360935, "learning_rate": 2.6100830521276434e-05, "loss": 0.4092, "step": 1978 }, { "epoch": 3.260296540362438, "grad_norm": 0.152589690226744, "learning_rate": 2.6057693147598144e-05, "loss": 0.4023, "step": 1979 }, { "epoch": 3.2619439868204285, "grad_norm": 0.15368865822303726, "learning_rate": 2.6014574223688888e-05, "loss": 0.4044, "step": 1980 }, { "epoch": 3.2635914332784184, "grad_norm": 0.1457359489364247, "learning_rate": 2.5971473806607693e-05, "loss": 0.4009, "step": 1981 }, { "epoch": 3.2652388797364087, "grad_norm": 0.17662130990515282, "learning_rate": 2.5928391953389063e-05, "loss": 0.4046, "step": 1982 }, { "epoch": 3.2668863261943986, "grad_norm": 0.1475770709603076, "learning_rate": 2.5885328721042954e-05, "loss": 0.4041, "step": 1983 }, { "epoch": 3.268533772652389, "grad_norm": 0.14432307873382225, "learning_rate": 2.5842284166554678e-05, "loss": 0.4049, "step": 1984 }, { "epoch": 3.2701812191103787, "grad_norm": 0.15071516992620168, "learning_rate": 2.5799258346884827e-05, "loss": 0.4052, "step": 1985 }, { "epoch": 3.271828665568369, "grad_norm": 0.15267234477118047, "learning_rate": 2.5756251318969208e-05, "loss": 0.4139, "step": 1986 }, { "epoch": 3.2734761120263594, "grad_norm": 0.14687684709218157, "learning_rate": 2.5713263139718744e-05, "loss": 0.3985, "step": 1987 }, { "epoch": 3.275123558484349, "grad_norm": 0.13346539682515116, "learning_rate": 2.567029386601945e-05, "loss": 0.4097, "step": 1988 }, { "epoch": 3.2767710049423395, "grad_norm": 0.15160068842961952, "learning_rate": 2.5627343554732288e-05, "loss": 0.4013, "step": 1989 }, { "epoch": 3.2784184514003294, "grad_norm": 0.13715455596624473, "learning_rate": 2.5584412262693138e-05, "loss": 0.4003, "step": 1990 }, { "epoch": 3.2800658978583197, "grad_norm": 0.13237648993154863, "learning_rate": 2.554150004671273e-05, "loss": 0.4125, "step": 1991 }, { "epoch": 3.2817133443163096, "grad_norm": 0.14447483882112958, "learning_rate": 2.5498606963576515e-05, "loss": 0.4071, "step": 1992 }, { "epoch": 3.2833607907743, "grad_norm": 0.13115470488939096, "learning_rate": 2.5455733070044676e-05, "loss": 0.4079, "step": 1993 }, { "epoch": 3.28500823723229, "grad_norm": 0.1475782243556833, "learning_rate": 2.5412878422851934e-05, "loss": 0.4115, "step": 1994 }, { "epoch": 3.28665568369028, "grad_norm": 0.14894631344480785, "learning_rate": 2.5370043078707616e-05, "loss": 0.4066, "step": 1995 }, { "epoch": 3.2883031301482704, "grad_norm": 0.14485508449443296, "learning_rate": 2.5327227094295452e-05, "loss": 0.4102, "step": 1996 }, { "epoch": 3.2899505766062602, "grad_norm": 0.16127934114150702, "learning_rate": 2.528443052627357e-05, "loss": 0.4074, "step": 1997 }, { "epoch": 3.2915980230642505, "grad_norm": 0.14901206432474298, "learning_rate": 2.5241653431274422e-05, "loss": 0.4104, "step": 1998 }, { "epoch": 3.2932454695222404, "grad_norm": 0.16508330308182093, "learning_rate": 2.5198895865904655e-05, "loss": 0.403, "step": 1999 }, { "epoch": 3.2948929159802307, "grad_norm": 0.15147770267095717, "learning_rate": 2.51561578867451e-05, "loss": 0.4092, "step": 2000 }, { "epoch": 3.2965403624382206, "grad_norm": 0.13281296497300382, "learning_rate": 2.5113439550350665e-05, "loss": 0.4008, "step": 2001 }, { "epoch": 3.298187808896211, "grad_norm": 0.1466355981622626, "learning_rate": 2.5070740913250266e-05, "loss": 0.4084, "step": 2002 }, { "epoch": 3.2998352553542007, "grad_norm": 0.1674347246402341, "learning_rate": 2.5028062031946743e-05, "loss": 0.4018, "step": 2003 }, { "epoch": 3.301482701812191, "grad_norm": 0.12786947058691614, "learning_rate": 2.4985402962916787e-05, "loss": 0.408, "step": 2004 }, { "epoch": 3.3031301482701814, "grad_norm": 0.16019508597329984, "learning_rate": 2.4942763762610908e-05, "loss": 0.3975, "step": 2005 }, { "epoch": 3.3047775947281712, "grad_norm": 0.1290292889542399, "learning_rate": 2.4900144487453268e-05, "loss": 0.4025, "step": 2006 }, { "epoch": 3.3064250411861615, "grad_norm": 0.15097256312246368, "learning_rate": 2.4857545193841715e-05, "loss": 0.409, "step": 2007 }, { "epoch": 3.3080724876441514, "grad_norm": 0.12946362438572043, "learning_rate": 2.481496593814762e-05, "loss": 0.4022, "step": 2008 }, { "epoch": 3.3097199341021417, "grad_norm": 0.16384098848432613, "learning_rate": 2.477240677671585e-05, "loss": 0.4009, "step": 2009 }, { "epoch": 3.3113673805601316, "grad_norm": 0.1380480096932566, "learning_rate": 2.472986776586469e-05, "loss": 0.4034, "step": 2010 }, { "epoch": 3.313014827018122, "grad_norm": 0.14319490559528195, "learning_rate": 2.4687348961885747e-05, "loss": 0.4053, "step": 2011 }, { "epoch": 3.314662273476112, "grad_norm": 0.16034943762378667, "learning_rate": 2.4644850421043895e-05, "loss": 0.4073, "step": 2012 }, { "epoch": 3.316309719934102, "grad_norm": 0.1463968781804813, "learning_rate": 2.4602372199577195e-05, "loss": 0.3974, "step": 2013 }, { "epoch": 3.3179571663920924, "grad_norm": 0.14414231473106878, "learning_rate": 2.4559914353696808e-05, "loss": 0.4088, "step": 2014 }, { "epoch": 3.3196046128500822, "grad_norm": 0.15799426008171313, "learning_rate": 2.451747693958695e-05, "loss": 0.4105, "step": 2015 }, { "epoch": 3.3212520593080725, "grad_norm": 0.13007863033321587, "learning_rate": 2.447506001340478e-05, "loss": 0.4032, "step": 2016 }, { "epoch": 3.3228995057660624, "grad_norm": 0.14318370797210805, "learning_rate": 2.4432663631280367e-05, "loss": 0.4095, "step": 2017 }, { "epoch": 3.3245469522240527, "grad_norm": 0.1360854597687515, "learning_rate": 2.4390287849316576e-05, "loss": 0.4048, "step": 2018 }, { "epoch": 3.326194398682043, "grad_norm": 0.13704859619727905, "learning_rate": 2.4347932723589027e-05, "loss": 0.4118, "step": 2019 }, { "epoch": 3.327841845140033, "grad_norm": 0.1567576529322276, "learning_rate": 2.4305598310145994e-05, "loss": 0.401, "step": 2020 }, { "epoch": 3.329489291598023, "grad_norm": 0.1460189590791655, "learning_rate": 2.426328466500833e-05, "loss": 0.409, "step": 2021 }, { "epoch": 3.331136738056013, "grad_norm": 0.15292771611366746, "learning_rate": 2.4220991844169454e-05, "loss": 0.4011, "step": 2022 }, { "epoch": 3.3327841845140034, "grad_norm": 0.1661773395371531, "learning_rate": 2.4178719903595163e-05, "loss": 0.4032, "step": 2023 }, { "epoch": 3.3344316309719932, "grad_norm": 0.1395964399993469, "learning_rate": 2.413646889922368e-05, "loss": 0.4121, "step": 2024 }, { "epoch": 3.3360790774299836, "grad_norm": 0.15195641237169924, "learning_rate": 2.4094238886965497e-05, "loss": 0.4094, "step": 2025 }, { "epoch": 3.337726523887974, "grad_norm": 0.13537331081253692, "learning_rate": 2.4052029922703312e-05, "loss": 0.4044, "step": 2026 }, { "epoch": 3.3393739703459637, "grad_norm": 0.13036623982299764, "learning_rate": 2.4009842062292012e-05, "loss": 0.4071, "step": 2027 }, { "epoch": 3.341021416803954, "grad_norm": 0.14922213327578737, "learning_rate": 2.396767536155851e-05, "loss": 0.4035, "step": 2028 }, { "epoch": 3.342668863261944, "grad_norm": 0.12448501374339693, "learning_rate": 2.3925529876301767e-05, "loss": 0.4057, "step": 2029 }, { "epoch": 3.344316309719934, "grad_norm": 0.14437023790759737, "learning_rate": 2.388340566229263e-05, "loss": 0.4092, "step": 2030 }, { "epoch": 3.345963756177924, "grad_norm": 0.1396672167820971, "learning_rate": 2.3841302775273807e-05, "loss": 0.4029, "step": 2031 }, { "epoch": 3.3476112026359144, "grad_norm": 0.13090459316231062, "learning_rate": 2.3799221270959807e-05, "loss": 0.4017, "step": 2032 }, { "epoch": 3.3492586490939047, "grad_norm": 0.15525376556873036, "learning_rate": 2.3757161205036807e-05, "loss": 0.4086, "step": 2033 }, { "epoch": 3.3509060955518946, "grad_norm": 0.16233761333176902, "learning_rate": 2.3715122633162654e-05, "loss": 0.406, "step": 2034 }, { "epoch": 3.352553542009885, "grad_norm": 0.14322545898660236, "learning_rate": 2.3673105610966714e-05, "loss": 0.4046, "step": 2035 }, { "epoch": 3.3542009884678747, "grad_norm": 0.18380866056597267, "learning_rate": 2.363111019404987e-05, "loss": 0.4108, "step": 2036 }, { "epoch": 3.355848434925865, "grad_norm": 0.14100650827484135, "learning_rate": 2.358913643798439e-05, "loss": 0.4065, "step": 2037 }, { "epoch": 3.357495881383855, "grad_norm": 0.1502374516665071, "learning_rate": 2.3547184398313886e-05, "loss": 0.4012, "step": 2038 }, { "epoch": 3.359143327841845, "grad_norm": 0.18374051285230253, "learning_rate": 2.3505254130553247e-05, "loss": 0.4038, "step": 2039 }, { "epoch": 3.360790774299835, "grad_norm": 0.12586994898230502, "learning_rate": 2.3463345690188516e-05, "loss": 0.4058, "step": 2040 }, { "epoch": 3.3624382207578254, "grad_norm": 0.15404491193348646, "learning_rate": 2.34214591326769e-05, "loss": 0.4098, "step": 2041 }, { "epoch": 3.3640856672158153, "grad_norm": 0.13581282099250114, "learning_rate": 2.337959451344661e-05, "loss": 0.4079, "step": 2042 }, { "epoch": 3.3657331136738056, "grad_norm": 0.16632269263409524, "learning_rate": 2.3337751887896837e-05, "loss": 0.4089, "step": 2043 }, { "epoch": 3.367380560131796, "grad_norm": 0.14572134803864495, "learning_rate": 2.3295931311397677e-05, "loss": 0.4005, "step": 2044 }, { "epoch": 3.3690280065897857, "grad_norm": 0.20075655475296939, "learning_rate": 2.3254132839290045e-05, "loss": 0.4058, "step": 2045 }, { "epoch": 3.370675453047776, "grad_norm": 0.16834605721754245, "learning_rate": 2.321235652688558e-05, "loss": 0.4079, "step": 2046 }, { "epoch": 3.372322899505766, "grad_norm": 0.19086779013118274, "learning_rate": 2.3170602429466657e-05, "loss": 0.4133, "step": 2047 }, { "epoch": 3.3739703459637562, "grad_norm": 0.19186729602195107, "learning_rate": 2.3128870602286195e-05, "loss": 0.4053, "step": 2048 }, { "epoch": 3.375617792421746, "grad_norm": 0.16716276017754794, "learning_rate": 2.3087161100567664e-05, "loss": 0.4047, "step": 2049 }, { "epoch": 3.3772652388797364, "grad_norm": 0.18928960183217397, "learning_rate": 2.3045473979504998e-05, "loss": 0.4033, "step": 2050 }, { "epoch": 3.3789126853377267, "grad_norm": 0.1552861836377082, "learning_rate": 2.3003809294262488e-05, "loss": 0.4004, "step": 2051 }, { "epoch": 3.3805601317957166, "grad_norm": 0.18829949898656054, "learning_rate": 2.2962167099974785e-05, "loss": 0.4049, "step": 2052 }, { "epoch": 3.382207578253707, "grad_norm": 0.17781356178766344, "learning_rate": 2.292054745174674e-05, "loss": 0.4096, "step": 2053 }, { "epoch": 3.3838550247116967, "grad_norm": 0.15476467391368423, "learning_rate": 2.2878950404653374e-05, "loss": 0.4072, "step": 2054 }, { "epoch": 3.385502471169687, "grad_norm": 0.13436472636003718, "learning_rate": 2.2837376013739806e-05, "loss": 0.4053, "step": 2055 }, { "epoch": 3.387149917627677, "grad_norm": 0.1669328336624347, "learning_rate": 2.279582433402115e-05, "loss": 0.4019, "step": 2056 }, { "epoch": 3.3887973640856672, "grad_norm": 0.12604311140654212, "learning_rate": 2.2754295420482527e-05, "loss": 0.4081, "step": 2057 }, { "epoch": 3.3904448105436575, "grad_norm": 0.1651952014870887, "learning_rate": 2.2712789328078876e-05, "loss": 0.4034, "step": 2058 }, { "epoch": 3.3920922570016474, "grad_norm": 0.14318209197065454, "learning_rate": 2.267130611173496e-05, "loss": 0.4046, "step": 2059 }, { "epoch": 3.3937397034596377, "grad_norm": 0.15146844818408653, "learning_rate": 2.2629845826345253e-05, "loss": 0.407, "step": 2060 }, { "epoch": 3.3953871499176276, "grad_norm": 0.13464744625204345, "learning_rate": 2.2588408526773883e-05, "loss": 0.4113, "step": 2061 }, { "epoch": 3.397034596375618, "grad_norm": 0.12981950394534736, "learning_rate": 2.254699426785461e-05, "loss": 0.4082, "step": 2062 }, { "epoch": 3.3986820428336078, "grad_norm": 0.1479726776405486, "learning_rate": 2.250560310439065e-05, "loss": 0.4098, "step": 2063 }, { "epoch": 3.400329489291598, "grad_norm": 0.17296845503595565, "learning_rate": 2.2464235091154684e-05, "loss": 0.4116, "step": 2064 }, { "epoch": 3.4019769357495884, "grad_norm": 0.13732750605684946, "learning_rate": 2.242289028288873e-05, "loss": 0.4077, "step": 2065 }, { "epoch": 3.4036243822075782, "grad_norm": 0.17925028411959948, "learning_rate": 2.2381568734304126e-05, "loss": 0.4019, "step": 2066 }, { "epoch": 3.4052718286655685, "grad_norm": 0.14580267478305192, "learning_rate": 2.2340270500081437e-05, "loss": 0.407, "step": 2067 }, { "epoch": 3.4069192751235584, "grad_norm": 0.1824372665684568, "learning_rate": 2.2298995634870358e-05, "loss": 0.4009, "step": 2068 }, { "epoch": 3.4085667215815487, "grad_norm": 0.14360254561058874, "learning_rate": 2.225774419328966e-05, "loss": 0.402, "step": 2069 }, { "epoch": 3.4102141680395386, "grad_norm": 0.18524073002106184, "learning_rate": 2.2216516229927128e-05, "loss": 0.4102, "step": 2070 }, { "epoch": 3.411861614497529, "grad_norm": 0.1357978888858332, "learning_rate": 2.2175311799339444e-05, "loss": 0.4081, "step": 2071 }, { "epoch": 3.4135090609555188, "grad_norm": 0.14126982229235627, "learning_rate": 2.2134130956052228e-05, "loss": 0.408, "step": 2072 }, { "epoch": 3.415156507413509, "grad_norm": 0.13721062081500968, "learning_rate": 2.2092973754559805e-05, "loss": 0.4044, "step": 2073 }, { "epoch": 3.416803953871499, "grad_norm": 0.159537906386488, "learning_rate": 2.2051840249325257e-05, "loss": 0.3994, "step": 2074 }, { "epoch": 3.4184514003294892, "grad_norm": 0.14772719689758368, "learning_rate": 2.2010730494780274e-05, "loss": 0.403, "step": 2075 }, { "epoch": 3.4200988467874796, "grad_norm": 0.17640666272854122, "learning_rate": 2.196964454532518e-05, "loss": 0.4057, "step": 2076 }, { "epoch": 3.4217462932454694, "grad_norm": 0.1437561936943307, "learning_rate": 2.1928582455328744e-05, "loss": 0.406, "step": 2077 }, { "epoch": 3.4233937397034597, "grad_norm": 0.15938271093988501, "learning_rate": 2.1887544279128175e-05, "loss": 0.409, "step": 2078 }, { "epoch": 3.4250411861614496, "grad_norm": 0.15150215243696802, "learning_rate": 2.1846530071029043e-05, "loss": 0.4101, "step": 2079 }, { "epoch": 3.42668863261944, "grad_norm": 0.13794408552559845, "learning_rate": 2.180553988530518e-05, "loss": 0.4051, "step": 2080 }, { "epoch": 3.4283360790774298, "grad_norm": 0.13124235726675074, "learning_rate": 2.1764573776198676e-05, "loss": 0.4123, "step": 2081 }, { "epoch": 3.42998352553542, "grad_norm": 0.1425779553983553, "learning_rate": 2.172363179791972e-05, "loss": 0.4094, "step": 2082 }, { "epoch": 3.4316309719934104, "grad_norm": 0.1534097321646452, "learning_rate": 2.168271400464658e-05, "loss": 0.4196, "step": 2083 }, { "epoch": 3.4332784184514002, "grad_norm": 0.13466254206438208, "learning_rate": 2.1641820450525513e-05, "loss": 0.4024, "step": 2084 }, { "epoch": 3.4349258649093906, "grad_norm": 0.1715640865029632, "learning_rate": 2.1600951189670702e-05, "loss": 0.4038, "step": 2085 }, { "epoch": 3.4365733113673804, "grad_norm": 0.15335280152338607, "learning_rate": 2.1560106276164213e-05, "loss": 0.4056, "step": 2086 }, { "epoch": 3.4382207578253707, "grad_norm": 0.137501265079205, "learning_rate": 2.1519285764055863e-05, "loss": 0.411, "step": 2087 }, { "epoch": 3.4398682042833606, "grad_norm": 0.1345383466776521, "learning_rate": 2.147848970736318e-05, "loss": 0.4084, "step": 2088 }, { "epoch": 3.441515650741351, "grad_norm": 0.1364130816725877, "learning_rate": 2.1437718160071333e-05, "loss": 0.4069, "step": 2089 }, { "epoch": 3.443163097199341, "grad_norm": 0.12616244823723416, "learning_rate": 2.139697117613304e-05, "loss": 0.4096, "step": 2090 }, { "epoch": 3.444810543657331, "grad_norm": 0.13767634168286366, "learning_rate": 2.135624880946858e-05, "loss": 0.4109, "step": 2091 }, { "epoch": 3.4464579901153214, "grad_norm": 0.1273394438986473, "learning_rate": 2.1315551113965585e-05, "loss": 0.3988, "step": 2092 }, { "epoch": 3.4481054365733113, "grad_norm": 0.15025578172089066, "learning_rate": 2.1274878143479065e-05, "loss": 0.4049, "step": 2093 }, { "epoch": 3.4497528830313016, "grad_norm": 0.13481293237968706, "learning_rate": 2.1234229951831316e-05, "loss": 0.4011, "step": 2094 }, { "epoch": 3.4514003294892914, "grad_norm": 0.14896206306249066, "learning_rate": 2.1193606592811827e-05, "loss": 0.3983, "step": 2095 }, { "epoch": 3.4530477759472817, "grad_norm": 0.14327525559122845, "learning_rate": 2.1153008120177266e-05, "loss": 0.412, "step": 2096 }, { "epoch": 3.454695222405272, "grad_norm": 0.13744417863614605, "learning_rate": 2.1112434587651328e-05, "loss": 0.4058, "step": 2097 }, { "epoch": 3.456342668863262, "grad_norm": 0.1305451374730545, "learning_rate": 2.1071886048924713e-05, "loss": 0.407, "step": 2098 }, { "epoch": 3.4579901153212522, "grad_norm": 0.14623308625968573, "learning_rate": 2.1031362557655067e-05, "loss": 0.4095, "step": 2099 }, { "epoch": 3.459637561779242, "grad_norm": 0.14211441087107626, "learning_rate": 2.0990864167466854e-05, "loss": 0.4045, "step": 2100 }, { "epoch": 3.4612850082372324, "grad_norm": 0.1385289471018426, "learning_rate": 2.095039093195138e-05, "loss": 0.4062, "step": 2101 }, { "epoch": 3.4629324546952223, "grad_norm": 0.13473893285428126, "learning_rate": 2.0909942904666603e-05, "loss": 0.4079, "step": 2102 }, { "epoch": 3.4645799011532126, "grad_norm": 0.139037152971837, "learning_rate": 2.086952013913716e-05, "loss": 0.4056, "step": 2103 }, { "epoch": 3.466227347611203, "grad_norm": 0.11580243376193965, "learning_rate": 2.0829122688854247e-05, "loss": 0.4072, "step": 2104 }, { "epoch": 3.4678747940691927, "grad_norm": 0.1269484927360608, "learning_rate": 2.078875060727554e-05, "loss": 0.4036, "step": 2105 }, { "epoch": 3.469522240527183, "grad_norm": 0.13676208746693277, "learning_rate": 2.0748403947825207e-05, "loss": 0.4015, "step": 2106 }, { "epoch": 3.471169686985173, "grad_norm": 0.14665977722247753, "learning_rate": 2.0708082763893712e-05, "loss": 0.4106, "step": 2107 }, { "epoch": 3.4728171334431632, "grad_norm": 0.1371101755827189, "learning_rate": 2.0667787108837835e-05, "loss": 0.4048, "step": 2108 }, { "epoch": 3.474464579901153, "grad_norm": 0.1284996446809871, "learning_rate": 2.0627517035980542e-05, "loss": 0.4064, "step": 2109 }, { "epoch": 3.4761120263591434, "grad_norm": 0.1450655697052185, "learning_rate": 2.058727259861102e-05, "loss": 0.4037, "step": 2110 }, { "epoch": 3.4777594728171333, "grad_norm": 0.1313526268259749, "learning_rate": 2.054705384998446e-05, "loss": 0.4057, "step": 2111 }, { "epoch": 3.4794069192751236, "grad_norm": 0.14239200513714287, "learning_rate": 2.0506860843322088e-05, "loss": 0.4029, "step": 2112 }, { "epoch": 3.4810543657331134, "grad_norm": 0.14372668069036348, "learning_rate": 2.0466693631811067e-05, "loss": 0.408, "step": 2113 }, { "epoch": 3.4827018121911038, "grad_norm": 0.14073419962315736, "learning_rate": 2.0426552268604397e-05, "loss": 0.4021, "step": 2114 }, { "epoch": 3.484349258649094, "grad_norm": 0.14912022992087776, "learning_rate": 2.0386436806820936e-05, "loss": 0.4004, "step": 2115 }, { "epoch": 3.485996705107084, "grad_norm": 0.13389728281759491, "learning_rate": 2.034634729954521e-05, "loss": 0.4, "step": 2116 }, { "epoch": 3.4876441515650742, "grad_norm": 0.15182351968220295, "learning_rate": 2.030628379982742e-05, "loss": 0.4014, "step": 2117 }, { "epoch": 3.489291598023064, "grad_norm": 0.11806777166121267, "learning_rate": 2.026624636068334e-05, "loss": 0.4116, "step": 2118 }, { "epoch": 3.4909390444810544, "grad_norm": 0.15156528608850975, "learning_rate": 2.0226235035094264e-05, "loss": 0.4026, "step": 2119 }, { "epoch": 3.4925864909390443, "grad_norm": 0.12226637633244589, "learning_rate": 2.0186249876006963e-05, "loss": 0.4035, "step": 2120 }, { "epoch": 3.4942339373970346, "grad_norm": 0.14024046174652252, "learning_rate": 2.014629093633353e-05, "loss": 0.4089, "step": 2121 }, { "epoch": 3.495881383855025, "grad_norm": 0.12920106201723192, "learning_rate": 2.010635826895139e-05, "loss": 0.4068, "step": 2122 }, { "epoch": 3.4975288303130148, "grad_norm": 0.13446699567391507, "learning_rate": 2.0066451926703187e-05, "loss": 0.4052, "step": 2123 }, { "epoch": 3.499176276771005, "grad_norm": 0.1235474102592683, "learning_rate": 2.0026571962396737e-05, "loss": 0.3999, "step": 2124 }, { "epoch": 3.500823723228995, "grad_norm": 0.14742982717907077, "learning_rate": 1.9986718428804973e-05, "loss": 0.3989, "step": 2125 }, { "epoch": 3.5024711696869852, "grad_norm": 0.12434423893265224, "learning_rate": 1.994689137866582e-05, "loss": 0.411, "step": 2126 }, { "epoch": 3.504118616144975, "grad_norm": 0.1427073713750506, "learning_rate": 1.990709086468217e-05, "loss": 0.4116, "step": 2127 }, { "epoch": 3.5057660626029654, "grad_norm": 0.12428321032105012, "learning_rate": 1.9867316939521795e-05, "loss": 0.4054, "step": 2128 }, { "epoch": 3.5074135090609557, "grad_norm": 0.1451147586905053, "learning_rate": 1.9827569655817274e-05, "loss": 0.4033, "step": 2129 }, { "epoch": 3.5090609555189456, "grad_norm": 0.12387726712721778, "learning_rate": 1.9787849066165973e-05, "loss": 0.4135, "step": 2130 }, { "epoch": 3.510708401976936, "grad_norm": 0.15917094910862098, "learning_rate": 1.974815522312989e-05, "loss": 0.4126, "step": 2131 }, { "epoch": 3.5123558484349258, "grad_norm": 0.12331360880324381, "learning_rate": 1.9708488179235638e-05, "loss": 0.4118, "step": 2132 }, { "epoch": 3.514003294892916, "grad_norm": 0.13403033154997007, "learning_rate": 1.966884798697438e-05, "loss": 0.4045, "step": 2133 }, { "epoch": 3.515650741350906, "grad_norm": 0.13482858951421325, "learning_rate": 1.962923469880172e-05, "loss": 0.4091, "step": 2134 }, { "epoch": 3.5172981878088962, "grad_norm": 0.1271818182020153, "learning_rate": 1.9589648367137707e-05, "loss": 0.4023, "step": 2135 }, { "epoch": 3.5189456342668866, "grad_norm": 0.14055335544289485, "learning_rate": 1.9550089044366682e-05, "loss": 0.4082, "step": 2136 }, { "epoch": 3.5205930807248764, "grad_norm": 0.13231731474731837, "learning_rate": 1.951055678283725e-05, "loss": 0.4016, "step": 2137 }, { "epoch": 3.5222405271828663, "grad_norm": 0.13143067004822978, "learning_rate": 1.9471051634862212e-05, "loss": 0.4003, "step": 2138 }, { "epoch": 3.5238879736408566, "grad_norm": 0.1304751084919994, "learning_rate": 1.9431573652718468e-05, "loss": 0.4087, "step": 2139 }, { "epoch": 3.525535420098847, "grad_norm": 0.1514009480986289, "learning_rate": 1.939212288864703e-05, "loss": 0.4031, "step": 2140 }, { "epoch": 3.5271828665568368, "grad_norm": 0.14108006196467393, "learning_rate": 1.935269939485282e-05, "loss": 0.3985, "step": 2141 }, { "epoch": 3.528830313014827, "grad_norm": 0.14303590154645973, "learning_rate": 1.931330322350472e-05, "loss": 0.4024, "step": 2142 }, { "epoch": 3.5304777594728174, "grad_norm": 0.16388307544638017, "learning_rate": 1.9273934426735417e-05, "loss": 0.4019, "step": 2143 }, { "epoch": 3.5321252059308073, "grad_norm": 0.13176375206123123, "learning_rate": 1.9234593056641437e-05, "loss": 0.4087, "step": 2144 }, { "epoch": 3.533772652388797, "grad_norm": 0.17097670990830557, "learning_rate": 1.9195279165282953e-05, "loss": 0.4047, "step": 2145 }, { "epoch": 3.5354200988467874, "grad_norm": 0.1348047879237059, "learning_rate": 1.9155992804683793e-05, "loss": 0.3988, "step": 2146 }, { "epoch": 3.5370675453047777, "grad_norm": 0.19207086692471947, "learning_rate": 1.9116734026831364e-05, "loss": 0.4037, "step": 2147 }, { "epoch": 3.5387149917627676, "grad_norm": 0.15796373270674527, "learning_rate": 1.9077502883676545e-05, "loss": 0.4049, "step": 2148 }, { "epoch": 3.540362438220758, "grad_norm": 0.1911782619536139, "learning_rate": 1.90382994271337e-05, "loss": 0.4041, "step": 2149 }, { "epoch": 3.5420098846787478, "grad_norm": 0.17501710139892576, "learning_rate": 1.8999123709080508e-05, "loss": 0.4097, "step": 2150 }, { "epoch": 3.543657331136738, "grad_norm": 0.16625877286804022, "learning_rate": 1.8959975781357953e-05, "loss": 0.4072, "step": 2151 }, { "epoch": 3.545304777594728, "grad_norm": 0.18206137493626745, "learning_rate": 1.8920855695770245e-05, "loss": 0.4, "step": 2152 }, { "epoch": 3.5469522240527183, "grad_norm": 0.1685106381082805, "learning_rate": 1.8881763504084748e-05, "loss": 0.4108, "step": 2153 }, { "epoch": 3.5485996705107086, "grad_norm": 0.1653333442113096, "learning_rate": 1.884269925803194e-05, "loss": 0.3998, "step": 2154 }, { "epoch": 3.5502471169686984, "grad_norm": 0.16991955657624197, "learning_rate": 1.8803663009305287e-05, "loss": 0.404, "step": 2155 }, { "epoch": 3.5518945634266887, "grad_norm": 0.1690077385492284, "learning_rate": 1.8764654809561217e-05, "loss": 0.4042, "step": 2156 }, { "epoch": 3.5535420098846786, "grad_norm": 0.14374292384784124, "learning_rate": 1.872567471041904e-05, "loss": 0.4064, "step": 2157 }, { "epoch": 3.555189456342669, "grad_norm": 0.15667033101783356, "learning_rate": 1.8686722763460864e-05, "loss": 0.4083, "step": 2158 }, { "epoch": 3.556836902800659, "grad_norm": 0.1294914962171891, "learning_rate": 1.8647799020231597e-05, "loss": 0.4025, "step": 2159 }, { "epoch": 3.558484349258649, "grad_norm": 0.14081982276392235, "learning_rate": 1.8608903532238772e-05, "loss": 0.4047, "step": 2160 }, { "epoch": 3.5601317957166394, "grad_norm": 0.11836215441941227, "learning_rate": 1.857003635095255e-05, "loss": 0.4087, "step": 2161 }, { "epoch": 3.5617792421746293, "grad_norm": 0.12487295610318239, "learning_rate": 1.8531197527805635e-05, "loss": 0.4071, "step": 2162 }, { "epoch": 3.5634266886326196, "grad_norm": 0.136055071387399, "learning_rate": 1.849238711419318e-05, "loss": 0.4037, "step": 2163 }, { "epoch": 3.5650741350906094, "grad_norm": 0.12615614813430973, "learning_rate": 1.8453605161472804e-05, "loss": 0.4067, "step": 2164 }, { "epoch": 3.5667215815485998, "grad_norm": 0.14069482562078256, "learning_rate": 1.8414851720964414e-05, "loss": 0.4109, "step": 2165 }, { "epoch": 3.5683690280065896, "grad_norm": 0.1492699913309673, "learning_rate": 1.837612684395019e-05, "loss": 0.4038, "step": 2166 }, { "epoch": 3.57001647446458, "grad_norm": 0.14708006765296427, "learning_rate": 1.8337430581674537e-05, "loss": 0.3995, "step": 2167 }, { "epoch": 3.5716639209225702, "grad_norm": 0.15332604977350256, "learning_rate": 1.8298762985343962e-05, "loss": 0.4066, "step": 2168 }, { "epoch": 3.57331136738056, "grad_norm": 0.15513975342837885, "learning_rate": 1.826012410612708e-05, "loss": 0.4033, "step": 2169 }, { "epoch": 3.5749588138385504, "grad_norm": 0.1480684887576593, "learning_rate": 1.8221513995154473e-05, "loss": 0.4071, "step": 2170 }, { "epoch": 3.5766062602965403, "grad_norm": 0.15158733498978197, "learning_rate": 1.8182932703518667e-05, "loss": 0.4066, "step": 2171 }, { "epoch": 3.5782537067545306, "grad_norm": 0.17330426036691707, "learning_rate": 1.8144380282274042e-05, "loss": 0.409, "step": 2172 }, { "epoch": 3.5799011532125204, "grad_norm": 0.13356826689397205, "learning_rate": 1.810585678243677e-05, "loss": 0.398, "step": 2173 }, { "epoch": 3.5815485996705108, "grad_norm": 0.14569923460380552, "learning_rate": 1.806736225498479e-05, "loss": 0.4085, "step": 2174 }, { "epoch": 3.583196046128501, "grad_norm": 0.12984606607919463, "learning_rate": 1.802889675085766e-05, "loss": 0.4032, "step": 2175 }, { "epoch": 3.584843492586491, "grad_norm": 0.13559428073652363, "learning_rate": 1.7990460320956546e-05, "loss": 0.4058, "step": 2176 }, { "epoch": 3.586490939044481, "grad_norm": 0.14168942798425052, "learning_rate": 1.795205301614412e-05, "loss": 0.4068, "step": 2177 }, { "epoch": 3.588138385502471, "grad_norm": 0.13429885216424303, "learning_rate": 1.791367488724457e-05, "loss": 0.4086, "step": 2178 }, { "epoch": 3.5897858319604614, "grad_norm": 0.13164037669664155, "learning_rate": 1.7875325985043413e-05, "loss": 0.4069, "step": 2179 }, { "epoch": 3.5914332784184513, "grad_norm": 0.13782017564960222, "learning_rate": 1.7837006360287526e-05, "loss": 0.4044, "step": 2180 }, { "epoch": 3.5930807248764416, "grad_norm": 0.13052978038894203, "learning_rate": 1.779871606368503e-05, "loss": 0.4081, "step": 2181 }, { "epoch": 3.594728171334432, "grad_norm": 0.12200510150655994, "learning_rate": 1.776045514590522e-05, "loss": 0.404, "step": 2182 }, { "epoch": 3.5963756177924218, "grad_norm": 0.13821431440527196, "learning_rate": 1.7722223657578573e-05, "loss": 0.4033, "step": 2183 }, { "epoch": 3.5980230642504116, "grad_norm": 0.11907316884616267, "learning_rate": 1.7684021649296554e-05, "loss": 0.4074, "step": 2184 }, { "epoch": 3.599670510708402, "grad_norm": 0.15499498539163414, "learning_rate": 1.764584917161166e-05, "loss": 0.4017, "step": 2185 }, { "epoch": 3.6013179571663922, "grad_norm": 0.12224660326505968, "learning_rate": 1.7607706275037287e-05, "loss": 0.4091, "step": 2186 }, { "epoch": 3.602965403624382, "grad_norm": 0.15615924485794644, "learning_rate": 1.756959301004768e-05, "loss": 0.4021, "step": 2187 }, { "epoch": 3.6046128500823724, "grad_norm": 0.1238310645779833, "learning_rate": 1.753150942707793e-05, "loss": 0.3981, "step": 2188 }, { "epoch": 3.6062602965403623, "grad_norm": 0.1387314316275769, "learning_rate": 1.749345557652378e-05, "loss": 0.3972, "step": 2189 }, { "epoch": 3.6079077429983526, "grad_norm": 0.14916293236000658, "learning_rate": 1.7455431508741666e-05, "loss": 0.4011, "step": 2190 }, { "epoch": 3.6095551894563425, "grad_norm": 0.12913520940112275, "learning_rate": 1.74174372740486e-05, "loss": 0.3978, "step": 2191 }, { "epoch": 3.6112026359143328, "grad_norm": 0.14700827010959144, "learning_rate": 1.7379472922722102e-05, "loss": 0.4055, "step": 2192 }, { "epoch": 3.612850082372323, "grad_norm": 0.15655625492543387, "learning_rate": 1.73415385050002e-05, "loss": 0.4029, "step": 2193 }, { "epoch": 3.614497528830313, "grad_norm": 0.1505121241187283, "learning_rate": 1.7303634071081258e-05, "loss": 0.4038, "step": 2194 }, { "epoch": 3.6161449752883033, "grad_norm": 0.13248239394051597, "learning_rate": 1.726575967112398e-05, "loss": 0.4051, "step": 2195 }, { "epoch": 3.617792421746293, "grad_norm": 0.15527323329972514, "learning_rate": 1.7227915355247328e-05, "loss": 0.4058, "step": 2196 }, { "epoch": 3.6194398682042834, "grad_norm": 0.12806665805174355, "learning_rate": 1.7190101173530436e-05, "loss": 0.4051, "step": 2197 }, { "epoch": 3.6210873146622733, "grad_norm": 0.1870004895286202, "learning_rate": 1.7152317176012615e-05, "loss": 0.4129, "step": 2198 }, { "epoch": 3.6227347611202636, "grad_norm": 0.12783363493963, "learning_rate": 1.7114563412693174e-05, "loss": 0.4052, "step": 2199 }, { "epoch": 3.624382207578254, "grad_norm": 0.1862979565038448, "learning_rate": 1.7076839933531442e-05, "loss": 0.3998, "step": 2200 }, { "epoch": 3.6260296540362438, "grad_norm": 0.12436574873496077, "learning_rate": 1.703914678844666e-05, "loss": 0.4088, "step": 2201 }, { "epoch": 3.627677100494234, "grad_norm": 0.1711781585751942, "learning_rate": 1.700148402731793e-05, "loss": 0.4079, "step": 2202 }, { "epoch": 3.629324546952224, "grad_norm": 0.16015140240128142, "learning_rate": 1.6963851699984177e-05, "loss": 0.4071, "step": 2203 }, { "epoch": 3.6309719934102143, "grad_norm": 0.14847755785179034, "learning_rate": 1.6926249856244012e-05, "loss": 0.4089, "step": 2204 }, { "epoch": 3.632619439868204, "grad_norm": 0.15654332977521176, "learning_rate": 1.688867854585573e-05, "loss": 0.4035, "step": 2205 }, { "epoch": 3.6342668863261944, "grad_norm": 0.1407444084806613, "learning_rate": 1.685113781853721e-05, "loss": 0.4091, "step": 2206 }, { "epoch": 3.6359143327841847, "grad_norm": 0.1252577478555867, "learning_rate": 1.681362772396585e-05, "loss": 0.4062, "step": 2207 }, { "epoch": 3.6375617792421746, "grad_norm": 0.1335575720143777, "learning_rate": 1.6776148311778562e-05, "loss": 0.4051, "step": 2208 }, { "epoch": 3.6392092257001645, "grad_norm": 0.12437661029159836, "learning_rate": 1.67386996315716e-05, "loss": 0.406, "step": 2209 }, { "epoch": 3.640856672158155, "grad_norm": 0.14185941987138587, "learning_rate": 1.670128173290059e-05, "loss": 0.401, "step": 2210 }, { "epoch": 3.642504118616145, "grad_norm": 0.1277769250120039, "learning_rate": 1.6663894665280373e-05, "loss": 0.4064, "step": 2211 }, { "epoch": 3.644151565074135, "grad_norm": 0.1399432835991279, "learning_rate": 1.6626538478185062e-05, "loss": 0.4025, "step": 2212 }, { "epoch": 3.6457990115321253, "grad_norm": 0.1263695850261397, "learning_rate": 1.658921322104786e-05, "loss": 0.4094, "step": 2213 }, { "epoch": 3.6474464579901156, "grad_norm": 0.12763463386504934, "learning_rate": 1.6551918943261046e-05, "loss": 0.4032, "step": 2214 }, { "epoch": 3.6490939044481054, "grad_norm": 0.12535486936866508, "learning_rate": 1.651465569417591e-05, "loss": 0.4067, "step": 2215 }, { "epoch": 3.6507413509060953, "grad_norm": 0.126830620219855, "learning_rate": 1.6477423523102665e-05, "loss": 0.4086, "step": 2216 }, { "epoch": 3.6523887973640856, "grad_norm": 0.1375137599100663, "learning_rate": 1.6440222479310445e-05, "loss": 0.4071, "step": 2217 }, { "epoch": 3.654036243822076, "grad_norm": 0.11234888551098558, "learning_rate": 1.6403052612027147e-05, "loss": 0.4074, "step": 2218 }, { "epoch": 3.655683690280066, "grad_norm": 0.14112125675100098, "learning_rate": 1.6365913970439428e-05, "loss": 0.4147, "step": 2219 }, { "epoch": 3.657331136738056, "grad_norm": 0.12928688733861593, "learning_rate": 1.632880660369263e-05, "loss": 0.4091, "step": 2220 }, { "epoch": 3.658978583196046, "grad_norm": 0.12907173567152772, "learning_rate": 1.629173056089068e-05, "loss": 0.4013, "step": 2221 }, { "epoch": 3.6606260296540363, "grad_norm": 0.13694224036525074, "learning_rate": 1.625468589109611e-05, "loss": 0.4013, "step": 2222 }, { "epoch": 3.662273476112026, "grad_norm": 0.13667741895837476, "learning_rate": 1.6217672643329894e-05, "loss": 0.4052, "step": 2223 }, { "epoch": 3.6639209225700164, "grad_norm": 0.12594613713316696, "learning_rate": 1.618069086657143e-05, "loss": 0.4092, "step": 2224 }, { "epoch": 3.6655683690280068, "grad_norm": 0.1313793207677932, "learning_rate": 1.614374060975848e-05, "loss": 0.3997, "step": 2225 }, { "epoch": 3.6672158154859966, "grad_norm": 0.13146432640195357, "learning_rate": 1.6106821921787075e-05, "loss": 0.4128, "step": 2226 }, { "epoch": 3.668863261943987, "grad_norm": 0.1268334922826249, "learning_rate": 1.6069934851511515e-05, "loss": 0.4145, "step": 2227 }, { "epoch": 3.670510708401977, "grad_norm": 0.12712646443214135, "learning_rate": 1.6033079447744225e-05, "loss": 0.4022, "step": 2228 }, { "epoch": 3.672158154859967, "grad_norm": 0.11495774430765393, "learning_rate": 1.599625575925573e-05, "loss": 0.4063, "step": 2229 }, { "epoch": 3.673805601317957, "grad_norm": 0.12124327310673037, "learning_rate": 1.5959463834774592e-05, "loss": 0.3999, "step": 2230 }, { "epoch": 3.6754530477759473, "grad_norm": 0.13431771941761364, "learning_rate": 1.5922703722987324e-05, "loss": 0.4038, "step": 2231 }, { "epoch": 3.6771004942339376, "grad_norm": 0.12032843455212781, "learning_rate": 1.588597547253839e-05, "loss": 0.3987, "step": 2232 }, { "epoch": 3.6787479406919275, "grad_norm": 0.15654588889975188, "learning_rate": 1.5849279132030034e-05, "loss": 0.4032, "step": 2233 }, { "epoch": 3.6803953871499178, "grad_norm": 0.10795221306655806, "learning_rate": 1.5812614750022304e-05, "loss": 0.4017, "step": 2234 }, { "epoch": 3.6820428336079076, "grad_norm": 0.15092269915681122, "learning_rate": 1.5775982375032955e-05, "loss": 0.4098, "step": 2235 }, { "epoch": 3.683690280065898, "grad_norm": 0.1279930298014422, "learning_rate": 1.573938205553736e-05, "loss": 0.4001, "step": 2236 }, { "epoch": 3.685337726523888, "grad_norm": 0.13204004704207378, "learning_rate": 1.5702813839968537e-05, "loss": 0.4107, "step": 2237 }, { "epoch": 3.686985172981878, "grad_norm": 0.12363913141157479, "learning_rate": 1.5666277776716958e-05, "loss": 0.4023, "step": 2238 }, { "epoch": 3.6886326194398684, "grad_norm": 0.11775651754086554, "learning_rate": 1.5629773914130574e-05, "loss": 0.4035, "step": 2239 }, { "epoch": 3.6902800658978583, "grad_norm": 0.13075105138522253, "learning_rate": 1.559330230051472e-05, "loss": 0.41, "step": 2240 }, { "epoch": 3.6919275123558486, "grad_norm": 0.13822270509093879, "learning_rate": 1.555686298413205e-05, "loss": 0.404, "step": 2241 }, { "epoch": 3.6935749588138385, "grad_norm": 0.13060783699766582, "learning_rate": 1.552045601320251e-05, "loss": 0.4092, "step": 2242 }, { "epoch": 3.6952224052718288, "grad_norm": 0.12111326777324555, "learning_rate": 1.5484081435903213e-05, "loss": 0.41, "step": 2243 }, { "epoch": 3.6968698517298186, "grad_norm": 0.12798910050620432, "learning_rate": 1.5447739300368408e-05, "loss": 0.4093, "step": 2244 }, { "epoch": 3.698517298187809, "grad_norm": 0.1303043377997899, "learning_rate": 1.5411429654689406e-05, "loss": 0.4086, "step": 2245 }, { "epoch": 3.7001647446457993, "grad_norm": 0.12802779353243596, "learning_rate": 1.5375152546914575e-05, "loss": 0.4059, "step": 2246 }, { "epoch": 3.701812191103789, "grad_norm": 0.12949351977973061, "learning_rate": 1.5338908025049158e-05, "loss": 0.4024, "step": 2247 }, { "epoch": 3.703459637561779, "grad_norm": 0.12370080040505119, "learning_rate": 1.530269613705533e-05, "loss": 0.4103, "step": 2248 }, { "epoch": 3.7051070840197693, "grad_norm": 0.12293752199270215, "learning_rate": 1.526651693085204e-05, "loss": 0.4003, "step": 2249 }, { "epoch": 3.7067545304777596, "grad_norm": 0.13052983748785688, "learning_rate": 1.5230370454315004e-05, "loss": 0.4027, "step": 2250 }, { "epoch": 3.7084019769357495, "grad_norm": 0.12245398666345005, "learning_rate": 1.5194256755276664e-05, "loss": 0.4065, "step": 2251 }, { "epoch": 3.7100494233937398, "grad_norm": 0.13298693482676044, "learning_rate": 1.5158175881526035e-05, "loss": 0.4059, "step": 2252 }, { "epoch": 3.71169686985173, "grad_norm": 0.14335294505949184, "learning_rate": 1.512212788080872e-05, "loss": 0.4103, "step": 2253 }, { "epoch": 3.71334431630972, "grad_norm": 0.15180016822101866, "learning_rate": 1.5086112800826818e-05, "loss": 0.4102, "step": 2254 }, { "epoch": 3.71499176276771, "grad_norm": 0.13795191236828655, "learning_rate": 1.5050130689238845e-05, "loss": 0.4032, "step": 2255 }, { "epoch": 3.7166392092257, "grad_norm": 0.1467976095960115, "learning_rate": 1.501418159365974e-05, "loss": 0.4053, "step": 2256 }, { "epoch": 3.7182866556836904, "grad_norm": 0.14796793418655815, "learning_rate": 1.4978265561660701e-05, "loss": 0.4114, "step": 2257 }, { "epoch": 3.7199341021416803, "grad_norm": 0.1510598836369532, "learning_rate": 1.4942382640769202e-05, "loss": 0.4086, "step": 2258 }, { "epoch": 3.7215815485996706, "grad_norm": 0.13590864461397642, "learning_rate": 1.4906532878468882e-05, "loss": 0.4094, "step": 2259 }, { "epoch": 3.7232289950576605, "grad_norm": 0.1485566701677203, "learning_rate": 1.4870716322199505e-05, "loss": 0.409, "step": 2260 }, { "epoch": 3.724876441515651, "grad_norm": 0.1339758684788235, "learning_rate": 1.4834933019356923e-05, "loss": 0.398, "step": 2261 }, { "epoch": 3.7265238879736406, "grad_norm": 0.14625762900008718, "learning_rate": 1.479918301729295e-05, "loss": 0.4017, "step": 2262 }, { "epoch": 3.728171334431631, "grad_norm": 0.12899218612714877, "learning_rate": 1.4763466363315342e-05, "loss": 0.4043, "step": 2263 }, { "epoch": 3.7298187808896213, "grad_norm": 0.14058442777839916, "learning_rate": 1.472778310468773e-05, "loss": 0.4059, "step": 2264 }, { "epoch": 3.731466227347611, "grad_norm": 0.12162762036492109, "learning_rate": 1.4692133288629538e-05, "loss": 0.4035, "step": 2265 }, { "epoch": 3.7331136738056014, "grad_norm": 0.1325152230169547, "learning_rate": 1.4656516962315968e-05, "loss": 0.4037, "step": 2266 }, { "epoch": 3.7347611202635913, "grad_norm": 0.1292807292150583, "learning_rate": 1.4620934172877884e-05, "loss": 0.4052, "step": 2267 }, { "epoch": 3.7364085667215816, "grad_norm": 0.14094251158385795, "learning_rate": 1.4585384967401766e-05, "loss": 0.4091, "step": 2268 }, { "epoch": 3.7380560131795715, "grad_norm": 0.12937996742678576, "learning_rate": 1.4549869392929656e-05, "loss": 0.4091, "step": 2269 }, { "epoch": 3.739703459637562, "grad_norm": 0.15175858162632744, "learning_rate": 1.4514387496459091e-05, "loss": 0.4064, "step": 2270 }, { "epoch": 3.741350906095552, "grad_norm": 0.12597212199186777, "learning_rate": 1.4478939324943068e-05, "loss": 0.4069, "step": 2271 }, { "epoch": 3.742998352553542, "grad_norm": 0.14117334686525357, "learning_rate": 1.4443524925289922e-05, "loss": 0.4037, "step": 2272 }, { "epoch": 3.7446457990115323, "grad_norm": 0.13212811105357247, "learning_rate": 1.440814434436331e-05, "loss": 0.4069, "step": 2273 }, { "epoch": 3.746293245469522, "grad_norm": 0.12421697429100098, "learning_rate": 1.4372797628982142e-05, "loss": 0.3988, "step": 2274 }, { "epoch": 3.7479406919275124, "grad_norm": 0.1349142957426221, "learning_rate": 1.4337484825920491e-05, "loss": 0.4037, "step": 2275 }, { "epoch": 3.7495881383855023, "grad_norm": 0.14225757239680875, "learning_rate": 1.43022059819076e-05, "loss": 0.4089, "step": 2276 }, { "epoch": 3.7512355848434926, "grad_norm": 0.1293025215956254, "learning_rate": 1.4266961143627737e-05, "loss": 0.4021, "step": 2277 }, { "epoch": 3.752883031301483, "grad_norm": 0.13631118684385427, "learning_rate": 1.4231750357720175e-05, "loss": 0.4122, "step": 2278 }, { "epoch": 3.754530477759473, "grad_norm": 0.1383181053019654, "learning_rate": 1.4196573670779116e-05, "loss": 0.4007, "step": 2279 }, { "epoch": 3.7561779242174627, "grad_norm": 0.13005959672439774, "learning_rate": 1.416143112935369e-05, "loss": 0.3983, "step": 2280 }, { "epoch": 3.757825370675453, "grad_norm": 0.1241013773694388, "learning_rate": 1.4126322779947783e-05, "loss": 0.4021, "step": 2281 }, { "epoch": 3.7594728171334433, "grad_norm": 0.11919062340779514, "learning_rate": 1.4091248669020065e-05, "loss": 0.4017, "step": 2282 }, { "epoch": 3.761120263591433, "grad_norm": 0.12485769987971386, "learning_rate": 1.4056208842983887e-05, "loss": 0.3992, "step": 2283 }, { "epoch": 3.7627677100494235, "grad_norm": 0.12977093466347733, "learning_rate": 1.4021203348207224e-05, "loss": 0.4103, "step": 2284 }, { "epoch": 3.7644151565074138, "grad_norm": 0.13088035379063112, "learning_rate": 1.3986232231012662e-05, "loss": 0.4082, "step": 2285 }, { "epoch": 3.7660626029654036, "grad_norm": 0.15333016699614263, "learning_rate": 1.3951295537677246e-05, "loss": 0.4097, "step": 2286 }, { "epoch": 3.7677100494233935, "grad_norm": 0.11401274769708383, "learning_rate": 1.391639331443249e-05, "loss": 0.4105, "step": 2287 }, { "epoch": 3.769357495881384, "grad_norm": 0.1480494880895975, "learning_rate": 1.3881525607464297e-05, "loss": 0.4064, "step": 2288 }, { "epoch": 3.771004942339374, "grad_norm": 0.13450843823972652, "learning_rate": 1.3846692462912871e-05, "loss": 0.4053, "step": 2289 }, { "epoch": 3.772652388797364, "grad_norm": 0.12315411517939781, "learning_rate": 1.3811893926872726e-05, "loss": 0.4071, "step": 2290 }, { "epoch": 3.7742998352553543, "grad_norm": 0.14198336296311617, "learning_rate": 1.3777130045392536e-05, "loss": 0.4093, "step": 2291 }, { "epoch": 3.775947281713344, "grad_norm": 0.15196235267459957, "learning_rate": 1.3742400864475137e-05, "loss": 0.4105, "step": 2292 }, { "epoch": 3.7775947281713345, "grad_norm": 0.12985679215198728, "learning_rate": 1.3707706430077434e-05, "loss": 0.4072, "step": 2293 }, { "epoch": 3.7792421746293243, "grad_norm": 0.16849889363005222, "learning_rate": 1.3673046788110349e-05, "loss": 0.4024, "step": 2294 }, { "epoch": 3.7808896210873146, "grad_norm": 0.11554853408581023, "learning_rate": 1.3638421984438802e-05, "loss": 0.4065, "step": 2295 }, { "epoch": 3.782537067545305, "grad_norm": 0.13971002050775214, "learning_rate": 1.3603832064881566e-05, "loss": 0.3982, "step": 2296 }, { "epoch": 3.784184514003295, "grad_norm": 0.13683420637898566, "learning_rate": 1.3569277075211274e-05, "loss": 0.4064, "step": 2297 }, { "epoch": 3.785831960461285, "grad_norm": 0.11559872518505512, "learning_rate": 1.3534757061154324e-05, "loss": 0.4051, "step": 2298 }, { "epoch": 3.787479406919275, "grad_norm": 0.12353324543743825, "learning_rate": 1.3500272068390831e-05, "loss": 0.4013, "step": 2299 }, { "epoch": 3.7891268533772653, "grad_norm": 0.11538064823584006, "learning_rate": 1.3465822142554602e-05, "loss": 0.4033, "step": 2300 }, { "epoch": 3.790774299835255, "grad_norm": 0.1186876441981998, "learning_rate": 1.3431407329233e-05, "loss": 0.3962, "step": 2301 }, { "epoch": 3.7924217462932455, "grad_norm": 0.10988719628165391, "learning_rate": 1.339702767396693e-05, "loss": 0.3984, "step": 2302 }, { "epoch": 3.7940691927512358, "grad_norm": 0.11955249379568461, "learning_rate": 1.3362683222250778e-05, "loss": 0.3989, "step": 2303 }, { "epoch": 3.7957166392092256, "grad_norm": 0.12349717063263765, "learning_rate": 1.3328374019532339e-05, "loss": 0.4037, "step": 2304 }, { "epoch": 3.797364085667216, "grad_norm": 0.11848833752375933, "learning_rate": 1.3294100111212797e-05, "loss": 0.4136, "step": 2305 }, { "epoch": 3.799011532125206, "grad_norm": 0.13165303183201832, "learning_rate": 1.3259861542646589e-05, "loss": 0.408, "step": 2306 }, { "epoch": 3.800658978583196, "grad_norm": 0.1194017961577191, "learning_rate": 1.3225658359141398e-05, "loss": 0.4049, "step": 2307 }, { "epoch": 3.802306425041186, "grad_norm": 0.13478942622522902, "learning_rate": 1.3191490605958092e-05, "loss": 0.4112, "step": 2308 }, { "epoch": 3.8039538714991763, "grad_norm": 0.11520158870518052, "learning_rate": 1.3157358328310635e-05, "loss": 0.4084, "step": 2309 }, { "epoch": 3.8056013179571666, "grad_norm": 0.12361308945579765, "learning_rate": 1.3123261571366083e-05, "loss": 0.4106, "step": 2310 }, { "epoch": 3.8072487644151565, "grad_norm": 0.12446807009304212, "learning_rate": 1.308920038024446e-05, "loss": 0.4057, "step": 2311 }, { "epoch": 3.808896210873147, "grad_norm": 0.1158978308586627, "learning_rate": 1.3055174800018721e-05, "loss": 0.4008, "step": 2312 }, { "epoch": 3.8105436573311366, "grad_norm": 0.14061405497486087, "learning_rate": 1.3021184875714705e-05, "loss": 0.4099, "step": 2313 }, { "epoch": 3.812191103789127, "grad_norm": 0.11204102760205067, "learning_rate": 1.298723065231109e-05, "loss": 0.4047, "step": 2314 }, { "epoch": 3.813838550247117, "grad_norm": 0.13568845983441222, "learning_rate": 1.2953312174739275e-05, "loss": 0.4004, "step": 2315 }, { "epoch": 3.815485996705107, "grad_norm": 0.11826392260750561, "learning_rate": 1.291942948788338e-05, "loss": 0.4016, "step": 2316 }, { "epoch": 3.8171334431630974, "grad_norm": 0.11786592051462633, "learning_rate": 1.2885582636580152e-05, "loss": 0.402, "step": 2317 }, { "epoch": 3.8187808896210873, "grad_norm": 0.12404615317035302, "learning_rate": 1.285177166561891e-05, "loss": 0.4017, "step": 2318 }, { "epoch": 3.820428336079077, "grad_norm": 0.11707079811963075, "learning_rate": 1.2817996619741533e-05, "loss": 0.3985, "step": 2319 }, { "epoch": 3.8220757825370675, "grad_norm": 0.12127107012085778, "learning_rate": 1.2784257543642315e-05, "loss": 0.4061, "step": 2320 }, { "epoch": 3.823723228995058, "grad_norm": 0.10678647314205382, "learning_rate": 1.2750554481967968e-05, "loss": 0.4025, "step": 2321 }, { "epoch": 3.8253706754530477, "grad_norm": 0.12040975711035501, "learning_rate": 1.271688747931755e-05, "loss": 0.4092, "step": 2322 }, { "epoch": 3.827018121911038, "grad_norm": 0.11261260174233834, "learning_rate": 1.2683256580242383e-05, "loss": 0.4015, "step": 2323 }, { "epoch": 3.8286655683690283, "grad_norm": 0.11170191185934841, "learning_rate": 1.2649661829246057e-05, "loss": 0.3975, "step": 2324 }, { "epoch": 3.830313014827018, "grad_norm": 0.11323832152176118, "learning_rate": 1.2616103270784281e-05, "loss": 0.404, "step": 2325 }, { "epoch": 3.831960461285008, "grad_norm": 0.12821543168263677, "learning_rate": 1.25825809492649e-05, "loss": 0.4067, "step": 2326 }, { "epoch": 3.8336079077429983, "grad_norm": 0.11478719106642928, "learning_rate": 1.2549094909047787e-05, "loss": 0.4017, "step": 2327 }, { "epoch": 3.8352553542009886, "grad_norm": 0.12169140677561197, "learning_rate": 1.2515645194444806e-05, "loss": 0.4078, "step": 2328 }, { "epoch": 3.8369028006589785, "grad_norm": 0.14674057850204889, "learning_rate": 1.2482231849719782e-05, "loss": 0.4134, "step": 2329 }, { "epoch": 3.838550247116969, "grad_norm": 0.11149278968489443, "learning_rate": 1.2448854919088373e-05, "loss": 0.4024, "step": 2330 }, { "epoch": 3.8401976935749587, "grad_norm": 0.12390918974473403, "learning_rate": 1.2415514446718073e-05, "loss": 0.402, "step": 2331 }, { "epoch": 3.841845140032949, "grad_norm": 0.11871047889126288, "learning_rate": 1.2382210476728122e-05, "loss": 0.4061, "step": 2332 }, { "epoch": 3.843492586490939, "grad_norm": 0.12582543090076057, "learning_rate": 1.2348943053189441e-05, "loss": 0.404, "step": 2333 }, { "epoch": 3.845140032948929, "grad_norm": 0.11752322932503263, "learning_rate": 1.2315712220124643e-05, "loss": 0.4036, "step": 2334 }, { "epoch": 3.8467874794069195, "grad_norm": 0.13368929314246475, "learning_rate": 1.2282518021507865e-05, "loss": 0.4033, "step": 2335 }, { "epoch": 3.8484349258649093, "grad_norm": 0.1163386743875632, "learning_rate": 1.2249360501264786e-05, "loss": 0.4042, "step": 2336 }, { "epoch": 3.8500823723228996, "grad_norm": 0.1146697888312744, "learning_rate": 1.2216239703272552e-05, "loss": 0.4091, "step": 2337 }, { "epoch": 3.8517298187808895, "grad_norm": 0.11396340945256751, "learning_rate": 1.2183155671359699e-05, "loss": 0.408, "step": 2338 }, { "epoch": 3.85337726523888, "grad_norm": 0.10254308879142168, "learning_rate": 1.2150108449306148e-05, "loss": 0.412, "step": 2339 }, { "epoch": 3.8550247116968697, "grad_norm": 0.12085333872178856, "learning_rate": 1.211709808084307e-05, "loss": 0.4045, "step": 2340 }, { "epoch": 3.85667215815486, "grad_norm": 0.11632326774808065, "learning_rate": 1.2084124609652883e-05, "loss": 0.398, "step": 2341 }, { "epoch": 3.8583196046128503, "grad_norm": 0.12126291907130304, "learning_rate": 1.2051188079369186e-05, "loss": 0.4034, "step": 2342 }, { "epoch": 3.85996705107084, "grad_norm": 0.11393374741376774, "learning_rate": 1.2018288533576664e-05, "loss": 0.4046, "step": 2343 }, { "epoch": 3.8616144975288305, "grad_norm": 0.11628534902976519, "learning_rate": 1.1985426015811118e-05, "loss": 0.4043, "step": 2344 }, { "epoch": 3.8632619439868203, "grad_norm": 0.11792253145534046, "learning_rate": 1.19526005695593e-05, "loss": 0.4087, "step": 2345 }, { "epoch": 3.8649093904448106, "grad_norm": 0.12155858021751392, "learning_rate": 1.1919812238258918e-05, "loss": 0.4062, "step": 2346 }, { "epoch": 3.8665568369028005, "grad_norm": 0.13121749017972076, "learning_rate": 1.1887061065298564e-05, "loss": 0.4058, "step": 2347 }, { "epoch": 3.868204283360791, "grad_norm": 0.1028268975781745, "learning_rate": 1.1854347094017684e-05, "loss": 0.4107, "step": 2348 }, { "epoch": 3.869851729818781, "grad_norm": 0.15016011046023497, "learning_rate": 1.1821670367706464e-05, "loss": 0.4035, "step": 2349 }, { "epoch": 3.871499176276771, "grad_norm": 0.11012826394332557, "learning_rate": 1.1789030929605816e-05, "loss": 0.4032, "step": 2350 }, { "epoch": 3.873146622734761, "grad_norm": 0.11596721392980151, "learning_rate": 1.1756428822907306e-05, "loss": 0.3935, "step": 2351 }, { "epoch": 3.874794069192751, "grad_norm": 0.12647800781701385, "learning_rate": 1.1723864090753092e-05, "loss": 0.4101, "step": 2352 }, { "epoch": 3.8764415156507415, "grad_norm": 0.10600920045898116, "learning_rate": 1.1691336776235893e-05, "loss": 0.4016, "step": 2353 }, { "epoch": 3.8780889621087313, "grad_norm": 0.11652427759209116, "learning_rate": 1.1658846922398924e-05, "loss": 0.4116, "step": 2354 }, { "epoch": 3.8797364085667216, "grad_norm": 0.10937129696683356, "learning_rate": 1.1626394572235786e-05, "loss": 0.4099, "step": 2355 }, { "epoch": 3.881383855024712, "grad_norm": 0.11360017439283625, "learning_rate": 1.1593979768690482e-05, "loss": 0.4051, "step": 2356 }, { "epoch": 3.883031301482702, "grad_norm": 0.11397898548309386, "learning_rate": 1.1561602554657321e-05, "loss": 0.4109, "step": 2357 }, { "epoch": 3.8846787479406917, "grad_norm": 0.10775264375303663, "learning_rate": 1.152926297298086e-05, "loss": 0.4021, "step": 2358 }, { "epoch": 3.886326194398682, "grad_norm": 0.10199733868050433, "learning_rate": 1.1496961066455895e-05, "loss": 0.3997, "step": 2359 }, { "epoch": 3.8879736408566723, "grad_norm": 0.10532137576549963, "learning_rate": 1.1464696877827324e-05, "loss": 0.4031, "step": 2360 }, { "epoch": 3.889621087314662, "grad_norm": 0.10388804664604163, "learning_rate": 1.1432470449790149e-05, "loss": 0.4064, "step": 2361 }, { "epoch": 3.8912685337726525, "grad_norm": 0.11583574107010323, "learning_rate": 1.1400281824989409e-05, "loss": 0.4039, "step": 2362 }, { "epoch": 3.892915980230643, "grad_norm": 0.11303640082923827, "learning_rate": 1.1368131046020103e-05, "loss": 0.4072, "step": 2363 }, { "epoch": 3.8945634266886326, "grad_norm": 0.10340962888624548, "learning_rate": 1.1336018155427175e-05, "loss": 0.4155, "step": 2364 }, { "epoch": 3.8962108731466225, "grad_norm": 0.10803935081775638, "learning_rate": 1.1303943195705412e-05, "loss": 0.4112, "step": 2365 }, { "epoch": 3.897858319604613, "grad_norm": 0.1046843323063632, "learning_rate": 1.1271906209299406e-05, "loss": 0.4078, "step": 2366 }, { "epoch": 3.899505766062603, "grad_norm": 0.11383166890635525, "learning_rate": 1.1239907238603504e-05, "loss": 0.4096, "step": 2367 }, { "epoch": 3.901153212520593, "grad_norm": 0.09889738117546497, "learning_rate": 1.1207946325961743e-05, "loss": 0.401, "step": 2368 }, { "epoch": 3.9028006589785833, "grad_norm": 0.11080888150121249, "learning_rate": 1.1176023513667817e-05, "loss": 0.4043, "step": 2369 }, { "epoch": 3.904448105436573, "grad_norm": 0.11726407865792567, "learning_rate": 1.1144138843964982e-05, "loss": 0.4067, "step": 2370 }, { "epoch": 3.9060955518945635, "grad_norm": 0.11611298512246009, "learning_rate": 1.1112292359046025e-05, "loss": 0.4101, "step": 2371 }, { "epoch": 3.9077429983525533, "grad_norm": 0.10631232124034702, "learning_rate": 1.1080484101053202e-05, "loss": 0.3965, "step": 2372 }, { "epoch": 3.9093904448105437, "grad_norm": 0.11547040639519605, "learning_rate": 1.104871411207817e-05, "loss": 0.4018, "step": 2373 }, { "epoch": 3.911037891268534, "grad_norm": 0.11651778593472693, "learning_rate": 1.1016982434161991e-05, "loss": 0.3989, "step": 2374 }, { "epoch": 3.912685337726524, "grad_norm": 0.10512316004703916, "learning_rate": 1.0985289109294985e-05, "loss": 0.4065, "step": 2375 }, { "epoch": 3.914332784184514, "grad_norm": 0.11205767730206392, "learning_rate": 1.0953634179416733e-05, "loss": 0.3963, "step": 2376 }, { "epoch": 3.915980230642504, "grad_norm": 0.10707782190287857, "learning_rate": 1.0922017686416008e-05, "loss": 0.401, "step": 2377 }, { "epoch": 3.9176276771004943, "grad_norm": 0.1067613291749874, "learning_rate": 1.0890439672130713e-05, "loss": 0.408, "step": 2378 }, { "epoch": 3.919275123558484, "grad_norm": 0.10730351984952474, "learning_rate": 1.0858900178347863e-05, "loss": 0.4141, "step": 2379 }, { "epoch": 3.9209225700164745, "grad_norm": 0.10934871313338125, "learning_rate": 1.0827399246803462e-05, "loss": 0.4096, "step": 2380 }, { "epoch": 3.922570016474465, "grad_norm": 0.10777716409166763, "learning_rate": 1.0795936919182503e-05, "loss": 0.4018, "step": 2381 }, { "epoch": 3.9242174629324547, "grad_norm": 0.1071064546146671, "learning_rate": 1.0764513237118882e-05, "loss": 0.4025, "step": 2382 }, { "epoch": 3.925864909390445, "grad_norm": 0.10633522359498992, "learning_rate": 1.0733128242195377e-05, "loss": 0.4019, "step": 2383 }, { "epoch": 3.927512355848435, "grad_norm": 0.10680170746438851, "learning_rate": 1.0701781975943555e-05, "loss": 0.4068, "step": 2384 }, { "epoch": 3.929159802306425, "grad_norm": 0.10575370243431872, "learning_rate": 1.0670474479843737e-05, "loss": 0.4001, "step": 2385 }, { "epoch": 3.930807248764415, "grad_norm": 0.10637061321115347, "learning_rate": 1.0639205795324945e-05, "loss": 0.4081, "step": 2386 }, { "epoch": 3.9324546952224053, "grad_norm": 0.1040368516641849, "learning_rate": 1.0607975963764817e-05, "loss": 0.4009, "step": 2387 }, { "epoch": 3.9341021416803956, "grad_norm": 0.10371374260261959, "learning_rate": 1.0576785026489623e-05, "loss": 0.4107, "step": 2388 }, { "epoch": 3.9357495881383855, "grad_norm": 0.1141729423655362, "learning_rate": 1.0545633024774124e-05, "loss": 0.4026, "step": 2389 }, { "epoch": 3.9373970345963754, "grad_norm": 0.10403519746895483, "learning_rate": 1.0514519999841583e-05, "loss": 0.4143, "step": 2390 }, { "epoch": 3.9390444810543657, "grad_norm": 0.11179090708882242, "learning_rate": 1.0483445992863666e-05, "loss": 0.4064, "step": 2391 }, { "epoch": 3.940691927512356, "grad_norm": 0.1044490455268984, "learning_rate": 1.0452411044960407e-05, "loss": 0.4098, "step": 2392 }, { "epoch": 3.942339373970346, "grad_norm": 0.10966483517589579, "learning_rate": 1.0421415197200186e-05, "loss": 0.4092, "step": 2393 }, { "epoch": 3.943986820428336, "grad_norm": 0.10898161570287415, "learning_rate": 1.0390458490599604e-05, "loss": 0.4049, "step": 2394 }, { "epoch": 3.9456342668863265, "grad_norm": 0.10371607084051537, "learning_rate": 1.0359540966123483e-05, "loss": 0.4073, "step": 2395 }, { "epoch": 3.9472817133443163, "grad_norm": 0.10518711818074518, "learning_rate": 1.0328662664684791e-05, "loss": 0.4058, "step": 2396 }, { "epoch": 3.948929159802306, "grad_norm": 0.10594419803111728, "learning_rate": 1.0297823627144584e-05, "loss": 0.4059, "step": 2397 }, { "epoch": 3.9505766062602965, "grad_norm": 0.10183550970301579, "learning_rate": 1.0267023894311997e-05, "loss": 0.4028, "step": 2398 }, { "epoch": 3.952224052718287, "grad_norm": 0.10306758738491556, "learning_rate": 1.023626350694411e-05, "loss": 0.4102, "step": 2399 }, { "epoch": 3.9538714991762767, "grad_norm": 0.11389379587538571, "learning_rate": 1.020554250574596e-05, "loss": 0.4102, "step": 2400 }, { "epoch": 3.955518945634267, "grad_norm": 0.10036376567408656, "learning_rate": 1.0174860931370456e-05, "loss": 0.4105, "step": 2401 }, { "epoch": 3.957166392092257, "grad_norm": 0.10680329009359137, "learning_rate": 1.014421882441833e-05, "loss": 0.4095, "step": 2402 }, { "epoch": 3.958813838550247, "grad_norm": 0.12865143526818892, "learning_rate": 1.0113616225438112e-05, "loss": 0.4112, "step": 2403 }, { "epoch": 3.960461285008237, "grad_norm": 0.1226564998074037, "learning_rate": 1.008305317492602e-05, "loss": 0.4028, "step": 2404 }, { "epoch": 3.9621087314662273, "grad_norm": 0.10407917828286839, "learning_rate": 1.0052529713325957e-05, "loss": 0.3994, "step": 2405 }, { "epoch": 3.9637561779242176, "grad_norm": 0.12163501492297779, "learning_rate": 1.0022045881029433e-05, "loss": 0.4044, "step": 2406 }, { "epoch": 3.9654036243822075, "grad_norm": 0.11943607838018458, "learning_rate": 9.991601718375499e-06, "loss": 0.4062, "step": 2407 }, { "epoch": 3.967051070840198, "grad_norm": 0.1029080295807752, "learning_rate": 9.961197265650751e-06, "loss": 0.4053, "step": 2408 }, { "epoch": 3.9686985172981877, "grad_norm": 0.11597051503375451, "learning_rate": 9.930832563089208e-06, "loss": 0.4045, "step": 2409 }, { "epoch": 3.970345963756178, "grad_norm": 0.11105686266391422, "learning_rate": 9.900507650872292e-06, "loss": 0.4064, "step": 2410 }, { "epoch": 3.971993410214168, "grad_norm": 0.11400277488855157, "learning_rate": 9.870222569128779e-06, "loss": 0.4029, "step": 2411 }, { "epoch": 3.973640856672158, "grad_norm": 0.11574692074584922, "learning_rate": 9.839977357934707e-06, "loss": 0.4076, "step": 2412 }, { "epoch": 3.9752883031301485, "grad_norm": 0.10350488306260175, "learning_rate": 9.809772057313416e-06, "loss": 0.4109, "step": 2413 }, { "epoch": 3.9769357495881383, "grad_norm": 0.10852956179609499, "learning_rate": 9.779606707235372e-06, "loss": 0.4069, "step": 2414 }, { "epoch": 3.9785831960461286, "grad_norm": 0.10728781049617064, "learning_rate": 9.749481347618203e-06, "loss": 0.4067, "step": 2415 }, { "epoch": 3.9802306425041185, "grad_norm": 0.1039142604926376, "learning_rate": 9.719396018326601e-06, "loss": 0.4034, "step": 2416 }, { "epoch": 3.981878088962109, "grad_norm": 0.10875020050104264, "learning_rate": 9.689350759172318e-06, "loss": 0.4058, "step": 2417 }, { "epoch": 3.9835255354200987, "grad_norm": 0.10955230867597493, "learning_rate": 9.659345609914048e-06, "loss": 0.4048, "step": 2418 }, { "epoch": 3.985172981878089, "grad_norm": 0.10957184697069006, "learning_rate": 9.629380610257426e-06, "loss": 0.4055, "step": 2419 }, { "epoch": 3.9868204283360793, "grad_norm": 0.11253146397186416, "learning_rate": 9.59945579985495e-06, "loss": 0.3974, "step": 2420 }, { "epoch": 3.988467874794069, "grad_norm": 0.10135653672983241, "learning_rate": 9.569571218305928e-06, "loss": 0.3963, "step": 2421 }, { "epoch": 3.990115321252059, "grad_norm": 0.12328945894791121, "learning_rate": 9.539726905156464e-06, "loss": 0.4134, "step": 2422 }, { "epoch": 3.9917627677100493, "grad_norm": 0.10559886960273712, "learning_rate": 9.509922899899337e-06, "loss": 0.4055, "step": 2423 }, { "epoch": 3.9934102141680397, "grad_norm": 0.092783168522401, "learning_rate": 9.480159241974017e-06, "loss": 0.4106, "step": 2424 }, { "epoch": 3.9950576606260295, "grad_norm": 0.11658820273810303, "learning_rate": 9.450435970766559e-06, "loss": 0.4024, "step": 2425 }, { "epoch": 3.99670510708402, "grad_norm": 0.11494658291188588, "learning_rate": 9.420753125609581e-06, "loss": 0.4081, "step": 2426 }, { "epoch": 3.99835255354201, "grad_norm": 0.10954864171678838, "learning_rate": 9.391110745782223e-06, "loss": 0.4098, "step": 2427 }, { "epoch": 4.0, "grad_norm": 0.14768260201970002, "learning_rate": 9.361508870510065e-06, "loss": 0.3871, "step": 2428 }, { "epoch": 4.00164744645799, "grad_norm": 0.11671353407902953, "learning_rate": 9.331947538965079e-06, "loss": 0.3903, "step": 2429 }, { "epoch": 4.003294892915981, "grad_norm": 0.12822980347112978, "learning_rate": 9.302426790265597e-06, "loss": 0.3829, "step": 2430 }, { "epoch": 4.0049423393739705, "grad_norm": 0.13361826189692586, "learning_rate": 9.272946663476228e-06, "loss": 0.3814, "step": 2431 }, { "epoch": 4.00658978583196, "grad_norm": 0.12204530588907808, "learning_rate": 9.243507197607875e-06, "loss": 0.3755, "step": 2432 }, { "epoch": 4.00823723228995, "grad_norm": 0.13256086190589036, "learning_rate": 9.21410843161758e-06, "loss": 0.3819, "step": 2433 }, { "epoch": 4.009884678747941, "grad_norm": 0.13803329367686984, "learning_rate": 9.184750404408556e-06, "loss": 0.385, "step": 2434 }, { "epoch": 4.011532125205931, "grad_norm": 0.12917165169348707, "learning_rate": 9.155433154830095e-06, "loss": 0.3864, "step": 2435 }, { "epoch": 4.013179571663921, "grad_norm": 0.12208833880548586, "learning_rate": 9.126156721677533e-06, "loss": 0.3863, "step": 2436 }, { "epoch": 4.0148270181219115, "grad_norm": 0.12737629165256859, "learning_rate": 9.096921143692206e-06, "loss": 0.3812, "step": 2437 }, { "epoch": 4.016474464579901, "grad_norm": 0.12359132577131783, "learning_rate": 9.067726459561369e-06, "loss": 0.3874, "step": 2438 }, { "epoch": 4.018121911037891, "grad_norm": 0.13104151564720884, "learning_rate": 9.03857270791816e-06, "loss": 0.3832, "step": 2439 }, { "epoch": 4.019769357495881, "grad_norm": 0.12698964612491598, "learning_rate": 9.009459927341573e-06, "loss": 0.3783, "step": 2440 }, { "epoch": 4.021416803953872, "grad_norm": 0.10736682929714648, "learning_rate": 8.980388156356348e-06, "loss": 0.3888, "step": 2441 }, { "epoch": 4.023064250411862, "grad_norm": 0.12619910034854165, "learning_rate": 8.95135743343301e-06, "loss": 0.3801, "step": 2442 }, { "epoch": 4.0247116968698515, "grad_norm": 0.11257807808837626, "learning_rate": 8.922367796987728e-06, "loss": 0.381, "step": 2443 }, { "epoch": 4.026359143327842, "grad_norm": 0.1122977456266927, "learning_rate": 8.893419285382304e-06, "loss": 0.3841, "step": 2444 }, { "epoch": 4.028006589785832, "grad_norm": 0.11546126138547769, "learning_rate": 8.864511936924125e-06, "loss": 0.3857, "step": 2445 }, { "epoch": 4.029654036243822, "grad_norm": 0.13338346392919342, "learning_rate": 8.835645789866101e-06, "loss": 0.3912, "step": 2446 }, { "epoch": 4.031301482701812, "grad_norm": 0.1199982931895278, "learning_rate": 8.806820882406649e-06, "loss": 0.3885, "step": 2447 }, { "epoch": 4.032948929159803, "grad_norm": 0.10048093442576508, "learning_rate": 8.778037252689575e-06, "loss": 0.3926, "step": 2448 }, { "epoch": 4.0345963756177925, "grad_norm": 0.12155760512975301, "learning_rate": 8.74929493880409e-06, "loss": 0.3768, "step": 2449 }, { "epoch": 4.036243822075782, "grad_norm": 0.1207783489050689, "learning_rate": 8.720593978784699e-06, "loss": 0.3915, "step": 2450 }, { "epoch": 4.037891268533772, "grad_norm": 0.10743426612015623, "learning_rate": 8.691934410611237e-06, "loss": 0.3862, "step": 2451 }, { "epoch": 4.039538714991763, "grad_norm": 0.10850775733739933, "learning_rate": 8.66331627220872e-06, "loss": 0.3875, "step": 2452 }, { "epoch": 4.041186161449753, "grad_norm": 0.10603914880394481, "learning_rate": 8.634739601447352e-06, "loss": 0.3909, "step": 2453 }, { "epoch": 4.042833607907743, "grad_norm": 0.1082879708239984, "learning_rate": 8.606204436142472e-06, "loss": 0.3825, "step": 2454 }, { "epoch": 4.0444810543657335, "grad_norm": 0.10498941374552526, "learning_rate": 8.577710814054473e-06, "loss": 0.3829, "step": 2455 }, { "epoch": 4.046128500823723, "grad_norm": 0.11685381092406984, "learning_rate": 8.549258772888812e-06, "loss": 0.3827, "step": 2456 }, { "epoch": 4.047775947281713, "grad_norm": 0.1078092626709551, "learning_rate": 8.520848350295896e-06, "loss": 0.3882, "step": 2457 }, { "epoch": 4.049423393739703, "grad_norm": 0.0976535123430002, "learning_rate": 8.492479583871059e-06, "loss": 0.3907, "step": 2458 }, { "epoch": 4.051070840197694, "grad_norm": 0.11126576668085557, "learning_rate": 8.46415251115451e-06, "loss": 0.3881, "step": 2459 }, { "epoch": 4.052718286655684, "grad_norm": 0.1130764720443303, "learning_rate": 8.435867169631287e-06, "loss": 0.3888, "step": 2460 }, { "epoch": 4.0543657331136735, "grad_norm": 0.09397756103543668, "learning_rate": 8.407623596731227e-06, "loss": 0.3853, "step": 2461 }, { "epoch": 4.056013179571664, "grad_norm": 0.09759168776507117, "learning_rate": 8.379421829828862e-06, "loss": 0.3855, "step": 2462 }, { "epoch": 4.057660626029654, "grad_norm": 0.09708839687995904, "learning_rate": 8.35126190624343e-06, "loss": 0.387, "step": 2463 }, { "epoch": 4.059308072487644, "grad_norm": 0.09923385779959788, "learning_rate": 8.323143863238767e-06, "loss": 0.3813, "step": 2464 }, { "epoch": 4.060955518945634, "grad_norm": 0.11102186633494582, "learning_rate": 8.29506773802331e-06, "loss": 0.3832, "step": 2465 }, { "epoch": 4.062602965403625, "grad_norm": 0.09826891177885007, "learning_rate": 8.267033567750041e-06, "loss": 0.3884, "step": 2466 }, { "epoch": 4.0642504118616145, "grad_norm": 0.1032631697580363, "learning_rate": 8.239041389516389e-06, "loss": 0.3909, "step": 2467 }, { "epoch": 4.065897858319604, "grad_norm": 0.09924751935936478, "learning_rate": 8.211091240364237e-06, "loss": 0.3913, "step": 2468 }, { "epoch": 4.067545304777595, "grad_norm": 0.0965865284702469, "learning_rate": 8.183183157279847e-06, "loss": 0.3861, "step": 2469 }, { "epoch": 4.069192751235585, "grad_norm": 0.09919058955804506, "learning_rate": 8.1553171771938e-06, "loss": 0.3844, "step": 2470 }, { "epoch": 4.070840197693575, "grad_norm": 0.09703094554812414, "learning_rate": 8.127493336981e-06, "loss": 0.3834, "step": 2471 }, { "epoch": 4.072487644151565, "grad_norm": 0.10184634389904121, "learning_rate": 8.099711673460548e-06, "loss": 0.3844, "step": 2472 }, { "epoch": 4.0741350906095555, "grad_norm": 0.09544736749584329, "learning_rate": 8.071972223395756e-06, "loss": 0.3773, "step": 2473 }, { "epoch": 4.075782537067545, "grad_norm": 0.09383191367373431, "learning_rate": 8.044275023494067e-06, "loss": 0.3854, "step": 2474 }, { "epoch": 4.077429983525535, "grad_norm": 0.09618810992906864, "learning_rate": 8.016620110407e-06, "loss": 0.3837, "step": 2475 }, { "epoch": 4.079077429983526, "grad_norm": 0.093922004359889, "learning_rate": 7.989007520730152e-06, "loss": 0.3784, "step": 2476 }, { "epoch": 4.080724876441516, "grad_norm": 0.09700838017738161, "learning_rate": 7.96143729100309e-06, "loss": 0.3879, "step": 2477 }, { "epoch": 4.082372322899506, "grad_norm": 0.10321967857081762, "learning_rate": 7.933909457709323e-06, "loss": 0.3827, "step": 2478 }, { "epoch": 4.0840197693574956, "grad_norm": 0.09362748331906372, "learning_rate": 7.906424057276268e-06, "loss": 0.3874, "step": 2479 }, { "epoch": 4.085667215815486, "grad_norm": 0.09271946405012513, "learning_rate": 7.878981126075174e-06, "loss": 0.3846, "step": 2480 }, { "epoch": 4.087314662273476, "grad_norm": 0.10027041961819207, "learning_rate": 7.851580700421127e-06, "loss": 0.3886, "step": 2481 }, { "epoch": 4.088962108731466, "grad_norm": 0.09551668667339572, "learning_rate": 7.824222816572926e-06, "loss": 0.3801, "step": 2482 }, { "epoch": 4.090609555189456, "grad_norm": 0.09322972275564215, "learning_rate": 7.796907510733099e-06, "loss": 0.3866, "step": 2483 }, { "epoch": 4.092257001647447, "grad_norm": 0.09198277493056907, "learning_rate": 7.769634819047804e-06, "loss": 0.3823, "step": 2484 }, { "epoch": 4.0939044481054365, "grad_norm": 0.09594078951851427, "learning_rate": 7.74240477760685e-06, "loss": 0.3783, "step": 2485 }, { "epoch": 4.095551894563426, "grad_norm": 0.09653642508939485, "learning_rate": 7.715217422443574e-06, "loss": 0.3828, "step": 2486 }, { "epoch": 4.097199341021417, "grad_norm": 0.09911569497764092, "learning_rate": 7.688072789534837e-06, "loss": 0.3836, "step": 2487 }, { "epoch": 4.098846787479407, "grad_norm": 0.09947900894517307, "learning_rate": 7.660970914800966e-06, "loss": 0.3784, "step": 2488 }, { "epoch": 4.100494233937397, "grad_norm": 0.10408918588441014, "learning_rate": 7.63391183410569e-06, "loss": 0.3904, "step": 2489 }, { "epoch": 4.102141680395387, "grad_norm": 0.10553737932677114, "learning_rate": 7.606895583256148e-06, "loss": 0.3831, "step": 2490 }, { "epoch": 4.1037891268533775, "grad_norm": 0.11603421619500077, "learning_rate": 7.5799221980027735e-06, "loss": 0.3852, "step": 2491 }, { "epoch": 4.105436573311367, "grad_norm": 0.10195598871590648, "learning_rate": 7.552991714039271e-06, "loss": 0.3873, "step": 2492 }, { "epoch": 4.107084019769357, "grad_norm": 0.1205856959194731, "learning_rate": 7.526104167002595e-06, "loss": 0.3873, "step": 2493 }, { "epoch": 4.108731466227348, "grad_norm": 0.11594552882691746, "learning_rate": 7.499259592472854e-06, "loss": 0.3908, "step": 2494 }, { "epoch": 4.110378912685338, "grad_norm": 0.10555445272934666, "learning_rate": 7.47245802597333e-06, "loss": 0.3907, "step": 2495 }, { "epoch": 4.112026359143328, "grad_norm": 0.11227531475965213, "learning_rate": 7.445699502970365e-06, "loss": 0.3838, "step": 2496 }, { "epoch": 4.113673805601318, "grad_norm": 0.10258740506159607, "learning_rate": 7.418984058873344e-06, "loss": 0.3932, "step": 2497 }, { "epoch": 4.115321252059308, "grad_norm": 0.10193573738744585, "learning_rate": 7.392311729034651e-06, "loss": 0.3823, "step": 2498 }, { "epoch": 4.116968698517298, "grad_norm": 0.10754383819589715, "learning_rate": 7.365682548749608e-06, "loss": 0.3858, "step": 2499 }, { "epoch": 4.118616144975288, "grad_norm": 0.10171120183415323, "learning_rate": 7.339096553256468e-06, "loss": 0.3851, "step": 2500 }, { "epoch": 4.120263591433279, "grad_norm": 0.101084681940138, "learning_rate": 7.312553777736297e-06, "loss": 0.3813, "step": 2501 }, { "epoch": 4.121911037891269, "grad_norm": 0.11859925320254114, "learning_rate": 7.286054257313e-06, "loss": 0.3847, "step": 2502 }, { "epoch": 4.1235584843492585, "grad_norm": 0.09914036221305968, "learning_rate": 7.259598027053214e-06, "loss": 0.3895, "step": 2503 }, { "epoch": 4.125205930807248, "grad_norm": 0.0999989064600633, "learning_rate": 7.233185121966309e-06, "loss": 0.3907, "step": 2504 }, { "epoch": 4.126853377265239, "grad_norm": 0.12367962074666546, "learning_rate": 7.206815577004334e-06, "loss": 0.3885, "step": 2505 }, { "epoch": 4.128500823723229, "grad_norm": 0.104552589245472, "learning_rate": 7.180489427061936e-06, "loss": 0.3853, "step": 2506 }, { "epoch": 4.130148270181219, "grad_norm": 0.1044979353921967, "learning_rate": 7.1542067069763525e-06, "loss": 0.3802, "step": 2507 }, { "epoch": 4.13179571663921, "grad_norm": 0.09391878700278031, "learning_rate": 7.127967451527338e-06, "loss": 0.3738, "step": 2508 }, { "epoch": 4.1334431630971995, "grad_norm": 0.1038881726699926, "learning_rate": 7.101771695437137e-06, "loss": 0.3797, "step": 2509 }, { "epoch": 4.135090609555189, "grad_norm": 0.1117508806285978, "learning_rate": 7.075619473370441e-06, "loss": 0.3903, "step": 2510 }, { "epoch": 4.136738056013179, "grad_norm": 0.09387378373461007, "learning_rate": 7.049510819934324e-06, "loss": 0.3873, "step": 2511 }, { "epoch": 4.13838550247117, "grad_norm": 0.09676294630803184, "learning_rate": 7.023445769678207e-06, "loss": 0.3872, "step": 2512 }, { "epoch": 4.14003294892916, "grad_norm": 0.1099279999524642, "learning_rate": 6.997424357093802e-06, "loss": 0.3891, "step": 2513 }, { "epoch": 4.14168039538715, "grad_norm": 0.11265426885484113, "learning_rate": 6.971446616615085e-06, "loss": 0.3908, "step": 2514 }, { "epoch": 4.1433278418451405, "grad_norm": 0.09026382447240529, "learning_rate": 6.945512582618259e-06, "loss": 0.3779, "step": 2515 }, { "epoch": 4.14497528830313, "grad_norm": 0.10486069202009982, "learning_rate": 6.919622289421654e-06, "loss": 0.3847, "step": 2516 }, { "epoch": 4.14662273476112, "grad_norm": 0.11138283125995257, "learning_rate": 6.893775771285743e-06, "loss": 0.3874, "step": 2517 }, { "epoch": 4.14827018121911, "grad_norm": 0.10204771880002196, "learning_rate": 6.8679730624130514e-06, "loss": 0.3916, "step": 2518 }, { "epoch": 4.149917627677101, "grad_norm": 0.10933856829159341, "learning_rate": 6.842214196948172e-06, "loss": 0.3903, "step": 2519 }, { "epoch": 4.151565074135091, "grad_norm": 0.10258868240067845, "learning_rate": 6.816499208977631e-06, "loss": 0.3857, "step": 2520 }, { "epoch": 4.1532125205930805, "grad_norm": 0.1089399572800417, "learning_rate": 6.790828132529923e-06, "loss": 0.3921, "step": 2521 }, { "epoch": 4.15485996705107, "grad_norm": 0.10814535235893788, "learning_rate": 6.765201001575427e-06, "loss": 0.3822, "step": 2522 }, { "epoch": 4.156507413509061, "grad_norm": 0.09518617481404482, "learning_rate": 6.739617850026352e-06, "loss": 0.3837, "step": 2523 }, { "epoch": 4.158154859967051, "grad_norm": 0.10914504493295692, "learning_rate": 6.714078711736754e-06, "loss": 0.3803, "step": 2524 }, { "epoch": 4.159802306425041, "grad_norm": 0.10770300832071733, "learning_rate": 6.688583620502398e-06, "loss": 0.3906, "step": 2525 }, { "epoch": 4.161449752883032, "grad_norm": 0.08990468138472782, "learning_rate": 6.663132610060783e-06, "loss": 0.3777, "step": 2526 }, { "epoch": 4.1630971993410215, "grad_norm": 0.1093434852185456, "learning_rate": 6.637725714091083e-06, "loss": 0.3897, "step": 2527 }, { "epoch": 4.164744645799011, "grad_norm": 0.11520269759555288, "learning_rate": 6.612362966214067e-06, "loss": 0.3835, "step": 2528 }, { "epoch": 4.166392092257001, "grad_norm": 0.0922664040734784, "learning_rate": 6.587044399992133e-06, "loss": 0.3856, "step": 2529 }, { "epoch": 4.168039538714992, "grad_norm": 0.10532867167100085, "learning_rate": 6.561770048929168e-06, "loss": 0.3835, "step": 2530 }, { "epoch": 4.169686985172982, "grad_norm": 0.11039251682207249, "learning_rate": 6.536539946470571e-06, "loss": 0.3874, "step": 2531 }, { "epoch": 4.171334431630972, "grad_norm": 0.10010965088872567, "learning_rate": 6.511354126003175e-06, "loss": 0.386, "step": 2532 }, { "epoch": 4.1729818780889625, "grad_norm": 0.10446775275282968, "learning_rate": 6.4862126208552164e-06, "loss": 0.3889, "step": 2533 }, { "epoch": 4.174629324546952, "grad_norm": 0.10788971349811383, "learning_rate": 6.4611154642963125e-06, "loss": 0.3915, "step": 2534 }, { "epoch": 4.176276771004942, "grad_norm": 0.10629917059788127, "learning_rate": 6.436062689537363e-06, "loss": 0.3884, "step": 2535 }, { "epoch": 4.177924217462932, "grad_norm": 0.10033100260214377, "learning_rate": 6.411054329730558e-06, "loss": 0.3878, "step": 2536 }, { "epoch": 4.179571663920923, "grad_norm": 0.09380969175718985, "learning_rate": 6.386090417969306e-06, "loss": 0.3893, "step": 2537 }, { "epoch": 4.181219110378913, "grad_norm": 0.1029674098384084, "learning_rate": 6.361170987288181e-06, "loss": 0.383, "step": 2538 }, { "epoch": 4.182866556836903, "grad_norm": 0.10120444024079094, "learning_rate": 6.336296070662937e-06, "loss": 0.3839, "step": 2539 }, { "epoch": 4.184514003294893, "grad_norm": 0.10454671128057194, "learning_rate": 6.311465701010391e-06, "loss": 0.3863, "step": 2540 }, { "epoch": 4.186161449752883, "grad_norm": 0.10180494161699799, "learning_rate": 6.286679911188414e-06, "loss": 0.388, "step": 2541 }, { "epoch": 4.187808896210873, "grad_norm": 0.10043460801351183, "learning_rate": 6.261938733995894e-06, "loss": 0.3811, "step": 2542 }, { "epoch": 4.189456342668863, "grad_norm": 0.0978828945521921, "learning_rate": 6.237242202172664e-06, "loss": 0.3859, "step": 2543 }, { "epoch": 4.191103789126854, "grad_norm": 0.11003186212831519, "learning_rate": 6.212590348399516e-06, "loss": 0.3864, "step": 2544 }, { "epoch": 4.1927512355848435, "grad_norm": 0.11487830484788229, "learning_rate": 6.187983205298089e-06, "loss": 0.3882, "step": 2545 }, { "epoch": 4.194398682042833, "grad_norm": 0.0940580976599399, "learning_rate": 6.1634208054308595e-06, "loss": 0.3851, "step": 2546 }, { "epoch": 4.196046128500824, "grad_norm": 0.1030842868473507, "learning_rate": 6.13890318130109e-06, "loss": 0.3767, "step": 2547 }, { "epoch": 4.197693574958814, "grad_norm": 0.10425424615623712, "learning_rate": 6.114430365352828e-06, "loss": 0.3859, "step": 2548 }, { "epoch": 4.199341021416804, "grad_norm": 0.09806917690196765, "learning_rate": 6.090002389970781e-06, "loss": 0.3847, "step": 2549 }, { "epoch": 4.200988467874794, "grad_norm": 0.09603661086502388, "learning_rate": 6.065619287480351e-06, "loss": 0.3894, "step": 2550 }, { "epoch": 4.2026359143327845, "grad_norm": 0.09977716894154459, "learning_rate": 6.0412810901475395e-06, "loss": 0.3826, "step": 2551 }, { "epoch": 4.204283360790774, "grad_norm": 0.09645512329533013, "learning_rate": 6.016987830178927e-06, "loss": 0.3845, "step": 2552 }, { "epoch": 4.205930807248764, "grad_norm": 0.10097773073792508, "learning_rate": 5.992739539721651e-06, "loss": 0.3863, "step": 2553 }, { "epoch": 4.207578253706755, "grad_norm": 0.09260397719983136, "learning_rate": 5.968536250863319e-06, "loss": 0.383, "step": 2554 }, { "epoch": 4.209225700164745, "grad_norm": 0.10046682478206505, "learning_rate": 5.944377995631993e-06, "loss": 0.3907, "step": 2555 }, { "epoch": 4.210873146622735, "grad_norm": 0.0980307080692976, "learning_rate": 5.9202648059961455e-06, "loss": 0.3914, "step": 2556 }, { "epoch": 4.212520593080725, "grad_norm": 0.10894234958692835, "learning_rate": 5.896196713864601e-06, "loss": 0.3857, "step": 2557 }, { "epoch": 4.214168039538715, "grad_norm": 0.09483636545868036, "learning_rate": 5.8721737510865384e-06, "loss": 0.3958, "step": 2558 }, { "epoch": 4.215815485996705, "grad_norm": 0.0928879649270983, "learning_rate": 5.848195949451385e-06, "loss": 0.3897, "step": 2559 }, { "epoch": 4.217462932454695, "grad_norm": 0.10386665641332309, "learning_rate": 5.824263340688822e-06, "loss": 0.3902, "step": 2560 }, { "epoch": 4.219110378912685, "grad_norm": 0.09617639376078681, "learning_rate": 5.800375956468723e-06, "loss": 0.3885, "step": 2561 }, { "epoch": 4.220757825370676, "grad_norm": 0.09808605606058328, "learning_rate": 5.776533828401105e-06, "loss": 0.3801, "step": 2562 }, { "epoch": 4.2224052718286655, "grad_norm": 0.092486820658499, "learning_rate": 5.752736988036129e-06, "loss": 0.3858, "step": 2563 }, { "epoch": 4.224052718286655, "grad_norm": 0.10293686804658193, "learning_rate": 5.728985466864001e-06, "loss": 0.3875, "step": 2564 }, { "epoch": 4.225700164744646, "grad_norm": 0.09816758215337024, "learning_rate": 5.70527929631496e-06, "loss": 0.3846, "step": 2565 }, { "epoch": 4.227347611202636, "grad_norm": 0.09045368996628327, "learning_rate": 5.681618507759235e-06, "loss": 0.3831, "step": 2566 }, { "epoch": 4.228995057660626, "grad_norm": 0.0905810419716805, "learning_rate": 5.658003132507e-06, "loss": 0.382, "step": 2567 }, { "epoch": 4.230642504118616, "grad_norm": 0.087212106821264, "learning_rate": 5.6344332018083424e-06, "loss": 0.386, "step": 2568 }, { "epoch": 4.2322899505766065, "grad_norm": 0.09736546511851349, "learning_rate": 5.610908746853212e-06, "loss": 0.3808, "step": 2569 }, { "epoch": 4.233937397034596, "grad_norm": 0.0929971376969822, "learning_rate": 5.587429798771364e-06, "loss": 0.3844, "step": 2570 }, { "epoch": 4.235584843492586, "grad_norm": 0.08739692720073329, "learning_rate": 5.563996388632351e-06, "loss": 0.3875, "step": 2571 }, { "epoch": 4.237232289950577, "grad_norm": 0.0976587758891922, "learning_rate": 5.540608547445452e-06, "loss": 0.3829, "step": 2572 }, { "epoch": 4.238879736408567, "grad_norm": 0.09064360090563095, "learning_rate": 5.517266306159674e-06, "loss": 0.3838, "step": 2573 }, { "epoch": 4.240527182866557, "grad_norm": 0.09655832307923759, "learning_rate": 5.493969695663644e-06, "loss": 0.3847, "step": 2574 }, { "epoch": 4.242174629324547, "grad_norm": 0.09523482446375801, "learning_rate": 5.470718746785632e-06, "loss": 0.3871, "step": 2575 }, { "epoch": 4.243822075782537, "grad_norm": 0.0924043964979705, "learning_rate": 5.4475134902934744e-06, "loss": 0.3832, "step": 2576 }, { "epoch": 4.245469522240527, "grad_norm": 0.0900396155315452, "learning_rate": 5.42435395689453e-06, "loss": 0.387, "step": 2577 }, { "epoch": 4.247116968698517, "grad_norm": 0.0887832531418917, "learning_rate": 5.401240177235694e-06, "loss": 0.3797, "step": 2578 }, { "epoch": 4.248764415156508, "grad_norm": 0.09351148227055261, "learning_rate": 5.378172181903272e-06, "loss": 0.3892, "step": 2579 }, { "epoch": 4.250411861614498, "grad_norm": 0.08771894332254718, "learning_rate": 5.355150001423006e-06, "loss": 0.389, "step": 2580 }, { "epoch": 4.2520593080724876, "grad_norm": 0.08861084298920335, "learning_rate": 5.332173666259995e-06, "loss": 0.3832, "step": 2581 }, { "epoch": 4.253706754530477, "grad_norm": 0.08935181331886421, "learning_rate": 5.309243206818697e-06, "loss": 0.3814, "step": 2582 }, { "epoch": 4.255354200988468, "grad_norm": 0.08838066145472763, "learning_rate": 5.286358653442847e-06, "loss": 0.3855, "step": 2583 }, { "epoch": 4.257001647446458, "grad_norm": 0.0900742928442201, "learning_rate": 5.263520036415424e-06, "loss": 0.3825, "step": 2584 }, { "epoch": 4.258649093904448, "grad_norm": 0.08856904900492628, "learning_rate": 5.240727385958639e-06, "loss": 0.39, "step": 2585 }, { "epoch": 4.260296540362438, "grad_norm": 0.09705636584477968, "learning_rate": 5.217980732233851e-06, "loss": 0.3852, "step": 2586 }, { "epoch": 4.2619439868204285, "grad_norm": 0.09362835819343845, "learning_rate": 5.195280105341592e-06, "loss": 0.3792, "step": 2587 }, { "epoch": 4.263591433278418, "grad_norm": 0.10342090170343285, "learning_rate": 5.1726255353214475e-06, "loss": 0.3888, "step": 2588 }, { "epoch": 4.265238879736408, "grad_norm": 0.09416362010603406, "learning_rate": 5.15001705215207e-06, "loss": 0.3796, "step": 2589 }, { "epoch": 4.266886326194399, "grad_norm": 0.087331995279997, "learning_rate": 5.127454685751128e-06, "loss": 0.378, "step": 2590 }, { "epoch": 4.268533772652389, "grad_norm": 0.08850392558639696, "learning_rate": 5.104938465975258e-06, "loss": 0.384, "step": 2591 }, { "epoch": 4.270181219110379, "grad_norm": 0.0904482725486977, "learning_rate": 5.0824684226200485e-06, "loss": 0.3856, "step": 2592 }, { "epoch": 4.2718286655683695, "grad_norm": 0.09185156192945504, "learning_rate": 5.060044585419959e-06, "loss": 0.381, "step": 2593 }, { "epoch": 4.273476112026359, "grad_norm": 0.09108160462054513, "learning_rate": 5.0376669840483196e-06, "loss": 0.3829, "step": 2594 }, { "epoch": 4.275123558484349, "grad_norm": 0.08509807441850924, "learning_rate": 5.01533564811727e-06, "loss": 0.3835, "step": 2595 }, { "epoch": 4.276771004942339, "grad_norm": 0.09628840026022692, "learning_rate": 4.993050607177728e-06, "loss": 0.3924, "step": 2596 }, { "epoch": 4.27841845140033, "grad_norm": 0.09839497574865382, "learning_rate": 4.97081189071936e-06, "loss": 0.3887, "step": 2597 }, { "epoch": 4.28006589785832, "grad_norm": 0.08996748369478559, "learning_rate": 4.948619528170517e-06, "loss": 0.3861, "step": 2598 }, { "epoch": 4.28171334431631, "grad_norm": 0.09121983494056739, "learning_rate": 4.926473548898219e-06, "loss": 0.3807, "step": 2599 }, { "epoch": 4.283360790774299, "grad_norm": 0.0932435116524032, "learning_rate": 4.9043739822081015e-06, "loss": 0.3867, "step": 2600 }, { "epoch": 4.28500823723229, "grad_norm": 0.08921759351956465, "learning_rate": 4.882320857344378e-06, "loss": 0.3872, "step": 2601 }, { "epoch": 4.28665568369028, "grad_norm": 0.09692265889195605, "learning_rate": 4.860314203489829e-06, "loss": 0.3801, "step": 2602 }, { "epoch": 4.28830313014827, "grad_norm": 0.09332926712135198, "learning_rate": 4.838354049765719e-06, "loss": 0.3875, "step": 2603 }, { "epoch": 4.289950576606261, "grad_norm": 0.08871579860316833, "learning_rate": 4.816440425231781e-06, "loss": 0.3795, "step": 2604 }, { "epoch": 4.2915980230642505, "grad_norm": 0.09843455924758504, "learning_rate": 4.7945733588861785e-06, "loss": 0.3807, "step": 2605 }, { "epoch": 4.29324546952224, "grad_norm": 0.0925489514550579, "learning_rate": 4.772752879665463e-06, "loss": 0.3901, "step": 2606 }, { "epoch": 4.29489291598023, "grad_norm": 0.09687542803237935, "learning_rate": 4.750979016444559e-06, "loss": 0.3906, "step": 2607 }, { "epoch": 4.296540362438221, "grad_norm": 0.09746609840409995, "learning_rate": 4.729251798036667e-06, "loss": 0.3859, "step": 2608 }, { "epoch": 4.298187808896211, "grad_norm": 0.08992242008913358, "learning_rate": 4.707571253193291e-06, "loss": 0.3799, "step": 2609 }, { "epoch": 4.299835255354201, "grad_norm": 0.08634657312614939, "learning_rate": 4.68593741060416e-06, "loss": 0.3831, "step": 2610 }, { "epoch": 4.3014827018121915, "grad_norm": 0.09115901497022692, "learning_rate": 4.664350298897198e-06, "loss": 0.3863, "step": 2611 }, { "epoch": 4.303130148270181, "grad_norm": 0.100025799930206, "learning_rate": 4.642809946638514e-06, "loss": 0.3888, "step": 2612 }, { "epoch": 4.304777594728171, "grad_norm": 0.08463591285989959, "learning_rate": 4.62131638233231e-06, "loss": 0.3802, "step": 2613 }, { "epoch": 4.306425041186161, "grad_norm": 0.0846078120849064, "learning_rate": 4.5998696344208944e-06, "loss": 0.3828, "step": 2614 }, { "epoch": 4.308072487644152, "grad_norm": 0.09050033679258787, "learning_rate": 4.578469731284605e-06, "loss": 0.3889, "step": 2615 }, { "epoch": 4.309719934102142, "grad_norm": 0.11772725268928315, "learning_rate": 4.55711670124182e-06, "loss": 0.3883, "step": 2616 }, { "epoch": 4.311367380560132, "grad_norm": 0.09291474896020858, "learning_rate": 4.5358105725488685e-06, "loss": 0.3777, "step": 2617 }, { "epoch": 4.313014827018122, "grad_norm": 0.08428747438413131, "learning_rate": 4.5145513734000135e-06, "loss": 0.3883, "step": 2618 }, { "epoch": 4.314662273476112, "grad_norm": 0.0921900382506276, "learning_rate": 4.493339131927425e-06, "loss": 0.389, "step": 2619 }, { "epoch": 4.316309719934102, "grad_norm": 0.09510792826378765, "learning_rate": 4.472173876201123e-06, "loss": 0.39, "step": 2620 }, { "epoch": 4.317957166392092, "grad_norm": 0.08571193302891365, "learning_rate": 4.45105563422898e-06, "loss": 0.3882, "step": 2621 }, { "epoch": 4.319604612850083, "grad_norm": 0.09176123465979952, "learning_rate": 4.429984433956623e-06, "loss": 0.3861, "step": 2622 }, { "epoch": 4.3212520593080725, "grad_norm": 0.09016540767896028, "learning_rate": 4.4089603032674466e-06, "loss": 0.3766, "step": 2623 }, { "epoch": 4.322899505766062, "grad_norm": 0.09748232867134486, "learning_rate": 4.38798326998255e-06, "loss": 0.3838, "step": 2624 }, { "epoch": 4.324546952224052, "grad_norm": 0.0898962237218832, "learning_rate": 4.367053361860709e-06, "loss": 0.3796, "step": 2625 }, { "epoch": 4.326194398682043, "grad_norm": 0.0856969291577296, "learning_rate": 4.346170606598352e-06, "loss": 0.3858, "step": 2626 }, { "epoch": 4.327841845140033, "grad_norm": 0.09527807712642845, "learning_rate": 4.325335031829498e-06, "loss": 0.383, "step": 2627 }, { "epoch": 4.329489291598023, "grad_norm": 0.09012007276393832, "learning_rate": 4.304546665125733e-06, "loss": 0.3841, "step": 2628 }, { "epoch": 4.3311367380560135, "grad_norm": 0.09287391757128753, "learning_rate": 4.283805533996179e-06, "loss": 0.3935, "step": 2629 }, { "epoch": 4.332784184514003, "grad_norm": 0.0838641000680639, "learning_rate": 4.263111665887434e-06, "loss": 0.3856, "step": 2630 }, { "epoch": 4.334431630971993, "grad_norm": 0.09685072234209428, "learning_rate": 4.2424650881835826e-06, "loss": 0.3878, "step": 2631 }, { "epoch": 4.336079077429984, "grad_norm": 0.09408586124834925, "learning_rate": 4.221865828206113e-06, "loss": 0.3888, "step": 2632 }, { "epoch": 4.337726523887974, "grad_norm": 0.09283183174705965, "learning_rate": 4.201313913213892e-06, "loss": 0.3841, "step": 2633 }, { "epoch": 4.339373970345964, "grad_norm": 0.08271294746999086, "learning_rate": 4.180809370403145e-06, "loss": 0.3834, "step": 2634 }, { "epoch": 4.341021416803954, "grad_norm": 0.09227043098293752, "learning_rate": 4.1603522269074e-06, "loss": 0.3909, "step": 2635 }, { "epoch": 4.342668863261944, "grad_norm": 0.08271424706623905, "learning_rate": 4.139942509797483e-06, "loss": 0.387, "step": 2636 }, { "epoch": 4.344316309719934, "grad_norm": 0.08368840959687074, "learning_rate": 4.119580246081447e-06, "loss": 0.3914, "step": 2637 }, { "epoch": 4.345963756177924, "grad_norm": 0.09139792535711457, "learning_rate": 4.099265462704543e-06, "loss": 0.3819, "step": 2638 }, { "epoch": 4.347611202635914, "grad_norm": 0.10312937323716731, "learning_rate": 4.078998186549199e-06, "loss": 0.3809, "step": 2639 }, { "epoch": 4.349258649093905, "grad_norm": 0.09118873545073264, "learning_rate": 4.058778444434976e-06, "loss": 0.381, "step": 2640 }, { "epoch": 4.350906095551895, "grad_norm": 0.08501176575556252, "learning_rate": 4.038606263118543e-06, "loss": 0.3905, "step": 2641 }, { "epoch": 4.352553542009884, "grad_norm": 0.08543288775881161, "learning_rate": 4.018481669293617e-06, "loss": 0.3889, "step": 2642 }, { "epoch": 4.354200988467875, "grad_norm": 0.08789765409072814, "learning_rate": 3.998404689590954e-06, "loss": 0.3817, "step": 2643 }, { "epoch": 4.355848434925865, "grad_norm": 0.08992570181121534, "learning_rate": 3.978375350578292e-06, "loss": 0.3857, "step": 2644 }, { "epoch": 4.357495881383855, "grad_norm": 0.08581035705074391, "learning_rate": 3.958393678760328e-06, "loss": 0.392, "step": 2645 }, { "epoch": 4.359143327841845, "grad_norm": 0.08683964354508913, "learning_rate": 3.938459700578703e-06, "loss": 0.3852, "step": 2646 }, { "epoch": 4.3607907742998355, "grad_norm": 0.08435727086669524, "learning_rate": 3.918573442411919e-06, "loss": 0.3907, "step": 2647 }, { "epoch": 4.362438220757825, "grad_norm": 0.08864051456611151, "learning_rate": 3.898734930575336e-06, "loss": 0.3933, "step": 2648 }, { "epoch": 4.364085667215815, "grad_norm": 0.09007363978791914, "learning_rate": 3.878944191321128e-06, "loss": 0.3826, "step": 2649 }, { "epoch": 4.365733113673806, "grad_norm": 0.09239020885303398, "learning_rate": 3.859201250838278e-06, "loss": 0.3833, "step": 2650 }, { "epoch": 4.367380560131796, "grad_norm": 0.08733594959256014, "learning_rate": 3.839506135252489e-06, "loss": 0.3861, "step": 2651 }, { "epoch": 4.369028006589786, "grad_norm": 0.09247741121838406, "learning_rate": 3.819858870626183e-06, "loss": 0.3838, "step": 2652 }, { "epoch": 4.370675453047776, "grad_norm": 0.08574072912822522, "learning_rate": 3.800259482958466e-06, "loss": 0.387, "step": 2653 }, { "epoch": 4.372322899505766, "grad_norm": 0.08826031284383622, "learning_rate": 3.78070799818508e-06, "loss": 0.3888, "step": 2654 }, { "epoch": 4.373970345963756, "grad_norm": 0.08552769721803992, "learning_rate": 3.761204442178401e-06, "loss": 0.3878, "step": 2655 }, { "epoch": 4.375617792421746, "grad_norm": 0.09044542196433455, "learning_rate": 3.741748840747361e-06, "loss": 0.387, "step": 2656 }, { "epoch": 4.377265238879737, "grad_norm": 0.08883989308282406, "learning_rate": 3.7223412196374288e-06, "loss": 0.3912, "step": 2657 }, { "epoch": 4.378912685337727, "grad_norm": 0.08493588349859618, "learning_rate": 3.7029816045305977e-06, "loss": 0.3825, "step": 2658 }, { "epoch": 4.380560131795717, "grad_norm": 0.08000904882854183, "learning_rate": 3.683670021045318e-06, "loss": 0.3824, "step": 2659 }, { "epoch": 4.382207578253706, "grad_norm": 0.08527082725591158, "learning_rate": 3.6644064947364986e-06, "loss": 0.3831, "step": 2660 }, { "epoch": 4.383855024711697, "grad_norm": 0.09011388572806628, "learning_rate": 3.6451910510954514e-06, "loss": 0.3861, "step": 2661 }, { "epoch": 4.385502471169687, "grad_norm": 0.09036664395371698, "learning_rate": 3.6260237155498403e-06, "loss": 0.3902, "step": 2662 }, { "epoch": 4.387149917627677, "grad_norm": 0.08945243954505684, "learning_rate": 3.606904513463696e-06, "loss": 0.3909, "step": 2663 }, { "epoch": 4.388797364085667, "grad_norm": 0.08648626394799235, "learning_rate": 3.587833470137323e-06, "loss": 0.3876, "step": 2664 }, { "epoch": 4.3904448105436575, "grad_norm": 0.0873823387763672, "learning_rate": 3.568810610807334e-06, "loss": 0.3848, "step": 2665 }, { "epoch": 4.392092257001647, "grad_norm": 0.1015835557990769, "learning_rate": 3.549835960646557e-06, "loss": 0.3891, "step": 2666 }, { "epoch": 4.393739703459637, "grad_norm": 0.10078623566782019, "learning_rate": 3.5309095447640316e-06, "loss": 0.3916, "step": 2667 }, { "epoch": 4.395387149917628, "grad_norm": 0.08462747046608397, "learning_rate": 3.5120313882049595e-06, "loss": 0.3833, "step": 2668 }, { "epoch": 4.397034596375618, "grad_norm": 0.08916815099131421, "learning_rate": 3.4932015159506903e-06, "loss": 0.3906, "step": 2669 }, { "epoch": 4.398682042833608, "grad_norm": 0.08763282221644317, "learning_rate": 3.474419952918693e-06, "loss": 0.3821, "step": 2670 }, { "epoch": 4.400329489291598, "grad_norm": 0.10466004903351712, "learning_rate": 3.4556867239624768e-06, "loss": 0.3863, "step": 2671 }, { "epoch": 4.401976935749588, "grad_norm": 0.09389509161148604, "learning_rate": 3.437001853871622e-06, "loss": 0.3846, "step": 2672 }, { "epoch": 4.403624382207578, "grad_norm": 0.08390409726254774, "learning_rate": 3.418365367371692e-06, "loss": 0.3759, "step": 2673 }, { "epoch": 4.405271828665568, "grad_norm": 0.09781295696351217, "learning_rate": 3.3997772891242353e-06, "loss": 0.38, "step": 2674 }, { "epoch": 4.406919275123559, "grad_norm": 0.08325574711575155, "learning_rate": 3.3812376437267536e-06, "loss": 0.3823, "step": 2675 }, { "epoch": 4.408566721581549, "grad_norm": 0.0812408531406364, "learning_rate": 3.3627464557126356e-06, "loss": 0.3821, "step": 2676 }, { "epoch": 4.410214168039539, "grad_norm": 0.07943442256912649, "learning_rate": 3.3443037495511566e-06, "loss": 0.3895, "step": 2677 }, { "epoch": 4.4118616144975284, "grad_norm": 0.08554263653866932, "learning_rate": 3.3259095496474436e-06, "loss": 0.3828, "step": 2678 }, { "epoch": 4.413509060955519, "grad_norm": 0.0929810499471753, "learning_rate": 3.307563880342417e-06, "loss": 0.3888, "step": 2679 }, { "epoch": 4.415156507413509, "grad_norm": 0.08180348236529447, "learning_rate": 3.2892667659127996e-06, "loss": 0.3925, "step": 2680 }, { "epoch": 4.416803953871499, "grad_norm": 0.08165822109464882, "learning_rate": 3.271018230571046e-06, "loss": 0.3815, "step": 2681 }, { "epoch": 4.41845140032949, "grad_norm": 0.09074887436892569, "learning_rate": 3.252818298465332e-06, "loss": 0.383, "step": 2682 }, { "epoch": 4.4200988467874796, "grad_norm": 0.08201114932120655, "learning_rate": 3.2346669936795095e-06, "loss": 0.3862, "step": 2683 }, { "epoch": 4.421746293245469, "grad_norm": 0.08342092319552, "learning_rate": 3.216564340233097e-06, "loss": 0.3894, "step": 2684 }, { "epoch": 4.423393739703459, "grad_norm": 0.08868844656761757, "learning_rate": 3.1985103620812263e-06, "loss": 0.3906, "step": 2685 }, { "epoch": 4.42504118616145, "grad_norm": 0.08281652774048263, "learning_rate": 3.180505083114609e-06, "loss": 0.3821, "step": 2686 }, { "epoch": 4.42668863261944, "grad_norm": 0.08236862928782007, "learning_rate": 3.1625485271595236e-06, "loss": 0.3735, "step": 2687 }, { "epoch": 4.42833607907743, "grad_norm": 0.07936594434951788, "learning_rate": 3.144640717977763e-06, "loss": 0.3809, "step": 2688 }, { "epoch": 4.42998352553542, "grad_norm": 0.08544834451799938, "learning_rate": 3.126781679266633e-06, "loss": 0.3893, "step": 2689 }, { "epoch": 4.43163097199341, "grad_norm": 0.08821436626163891, "learning_rate": 3.108971434658883e-06, "loss": 0.3882, "step": 2690 }, { "epoch": 4.4332784184514, "grad_norm": 0.08804998678497698, "learning_rate": 3.0912100077227047e-06, "loss": 0.3916, "step": 2691 }, { "epoch": 4.43492586490939, "grad_norm": 0.08432094028814714, "learning_rate": 3.07349742196168e-06, "loss": 0.3859, "step": 2692 }, { "epoch": 4.436573311367381, "grad_norm": 0.08285648970337743, "learning_rate": 3.055833700814761e-06, "loss": 0.3792, "step": 2693 }, { "epoch": 4.438220757825371, "grad_norm": 0.08719167628218921, "learning_rate": 3.038218867656255e-06, "loss": 0.382, "step": 2694 }, { "epoch": 4.439868204283361, "grad_norm": 0.08422784195108797, "learning_rate": 3.0206529457957523e-06, "loss": 0.3796, "step": 2695 }, { "epoch": 4.441515650741351, "grad_norm": 0.08456274811022918, "learning_rate": 3.003135958478138e-06, "loss": 0.3883, "step": 2696 }, { "epoch": 4.443163097199341, "grad_norm": 0.08570269114358595, "learning_rate": 2.9856679288835246e-06, "loss": 0.3894, "step": 2697 }, { "epoch": 4.444810543657331, "grad_norm": 0.09020496503204624, "learning_rate": 2.9682488801272426e-06, "loss": 0.3891, "step": 2698 }, { "epoch": 4.446457990115321, "grad_norm": 0.08899308966070262, "learning_rate": 2.9508788352598316e-06, "loss": 0.3871, "step": 2699 }, { "epoch": 4.448105436573312, "grad_norm": 0.0840587511773375, "learning_rate": 2.9335578172669542e-06, "loss": 0.3826, "step": 2700 }, { "epoch": 4.449752883031302, "grad_norm": 0.0839274312709225, "learning_rate": 2.916285849069409e-06, "loss": 0.3836, "step": 2701 }, { "epoch": 4.451400329489291, "grad_norm": 0.08243348777638419, "learning_rate": 2.899062953523082e-06, "loss": 0.3947, "step": 2702 }, { "epoch": 4.453047775947281, "grad_norm": 0.08602940450613018, "learning_rate": 2.881889153418924e-06, "loss": 0.3872, "step": 2703 }, { "epoch": 4.454695222405272, "grad_norm": 0.07676461100091557, "learning_rate": 2.8647644714829304e-06, "loss": 0.3893, "step": 2704 }, { "epoch": 4.456342668863262, "grad_norm": 0.08233117237724566, "learning_rate": 2.8476889303760756e-06, "loss": 0.3897, "step": 2705 }, { "epoch": 4.457990115321252, "grad_norm": 0.08873904277595618, "learning_rate": 2.8306625526943256e-06, "loss": 0.388, "step": 2706 }, { "epoch": 4.4596375617792425, "grad_norm": 0.08013698082995548, "learning_rate": 2.813685360968581e-06, "loss": 0.3817, "step": 2707 }, { "epoch": 4.461285008237232, "grad_norm": 0.08922539677499323, "learning_rate": 2.796757377664645e-06, "loss": 0.3863, "step": 2708 }, { "epoch": 4.462932454695222, "grad_norm": 0.0791681614561564, "learning_rate": 2.7798786251832256e-06, "loss": 0.3765, "step": 2709 }, { "epoch": 4.464579901153212, "grad_norm": 0.08469116138627797, "learning_rate": 2.7630491258598734e-06, "loss": 0.3835, "step": 2710 }, { "epoch": 4.466227347611203, "grad_norm": 0.08714968565084114, "learning_rate": 2.746268901964957e-06, "loss": 0.3898, "step": 2711 }, { "epoch": 4.467874794069193, "grad_norm": 0.08600796488759785, "learning_rate": 2.7295379757036376e-06, "loss": 0.3856, "step": 2712 }, { "epoch": 4.469522240527183, "grad_norm": 0.0953139421363791, "learning_rate": 2.7128563692158506e-06, "loss": 0.3801, "step": 2713 }, { "epoch": 4.471169686985173, "grad_norm": 0.09844415118815365, "learning_rate": 2.696224104576275e-06, "loss": 0.3883, "step": 2714 }, { "epoch": 4.472817133443163, "grad_norm": 0.08716602133741712, "learning_rate": 2.6796412037942698e-06, "loss": 0.3826, "step": 2715 }, { "epoch": 4.474464579901153, "grad_norm": 0.08854058987396618, "learning_rate": 2.6631076888138952e-06, "loss": 0.3894, "step": 2716 }, { "epoch": 4.476112026359143, "grad_norm": 0.08215847469348775, "learning_rate": 2.6466235815138408e-06, "loss": 0.3856, "step": 2717 }, { "epoch": 4.477759472817134, "grad_norm": 0.08499263283579503, "learning_rate": 2.630188903707436e-06, "loss": 0.3789, "step": 2718 }, { "epoch": 4.479406919275124, "grad_norm": 0.09226298143961174, "learning_rate": 2.613803677142581e-06, "loss": 0.3948, "step": 2719 }, { "epoch": 4.481054365733113, "grad_norm": 0.08533742027748094, "learning_rate": 2.5974679235017462e-06, "loss": 0.3831, "step": 2720 }, { "epoch": 4.482701812191104, "grad_norm": 0.08221037014429343, "learning_rate": 2.5811816644019374e-06, "loss": 0.3796, "step": 2721 }, { "epoch": 4.484349258649094, "grad_norm": 0.08342262601145033, "learning_rate": 2.564944921394652e-06, "loss": 0.3846, "step": 2722 }, { "epoch": 4.485996705107084, "grad_norm": 0.08814143005950412, "learning_rate": 2.548757715965886e-06, "loss": 0.3841, "step": 2723 }, { "epoch": 4.487644151565074, "grad_norm": 0.08428108507026329, "learning_rate": 2.5326200695360647e-06, "loss": 0.3828, "step": 2724 }, { "epoch": 4.4892915980230645, "grad_norm": 0.08869134231456398, "learning_rate": 2.5165320034600347e-06, "loss": 0.3835, "step": 2725 }, { "epoch": 4.490939044481054, "grad_norm": 0.08334295484122937, "learning_rate": 2.5004935390270335e-06, "loss": 0.3844, "step": 2726 }, { "epoch": 4.492586490939044, "grad_norm": 0.08348807548491165, "learning_rate": 2.484504697460657e-06, "loss": 0.3869, "step": 2727 }, { "epoch": 4.494233937397034, "grad_norm": 0.08710332198084753, "learning_rate": 2.4685654999188515e-06, "loss": 0.3846, "step": 2728 }, { "epoch": 4.495881383855025, "grad_norm": 0.08460425668526034, "learning_rate": 2.4526759674938604e-06, "loss": 0.383, "step": 2729 }, { "epoch": 4.497528830313015, "grad_norm": 0.08034444373977181, "learning_rate": 2.4368361212121984e-06, "loss": 0.3863, "step": 2730 }, { "epoch": 4.499176276771005, "grad_norm": 0.08433911279144066, "learning_rate": 2.4210459820346355e-06, "loss": 0.3796, "step": 2731 }, { "epoch": 4.500823723228995, "grad_norm": 0.08580518784196318, "learning_rate": 2.4053055708561646e-06, "loss": 0.3891, "step": 2732 }, { "epoch": 4.502471169686985, "grad_norm": 0.0858816814893418, "learning_rate": 2.389614908505986e-06, "loss": 0.3857, "step": 2733 }, { "epoch": 4.504118616144975, "grad_norm": 0.08113714365483722, "learning_rate": 2.3739740157474468e-06, "loss": 0.386, "step": 2734 }, { "epoch": 4.505766062602966, "grad_norm": 0.07968364725832171, "learning_rate": 2.3583829132780478e-06, "loss": 0.3934, "step": 2735 }, { "epoch": 4.507413509060956, "grad_norm": 0.08378303033664136, "learning_rate": 2.3428416217294016e-06, "loss": 0.3914, "step": 2736 }, { "epoch": 4.509060955518946, "grad_norm": 0.07936917044022215, "learning_rate": 2.32735016166719e-06, "loss": 0.3813, "step": 2737 }, { "epoch": 4.5107084019769355, "grad_norm": 0.08537200770398287, "learning_rate": 2.311908553591189e-06, "loss": 0.3788, "step": 2738 }, { "epoch": 4.512355848434926, "grad_norm": 0.08390549982743159, "learning_rate": 2.2965168179351725e-06, "loss": 0.3871, "step": 2739 }, { "epoch": 4.514003294892916, "grad_norm": 0.07962164900548423, "learning_rate": 2.2811749750669286e-06, "loss": 0.374, "step": 2740 }, { "epoch": 4.515650741350906, "grad_norm": 0.08313549123109769, "learning_rate": 2.265883045288222e-06, "loss": 0.3779, "step": 2741 }, { "epoch": 4.517298187808896, "grad_norm": 0.08361144854433167, "learning_rate": 2.2506410488347716e-06, "loss": 0.3883, "step": 2742 }, { "epoch": 4.518945634266887, "grad_norm": 0.08290063371369055, "learning_rate": 2.2354490058762226e-06, "loss": 0.3825, "step": 2743 }, { "epoch": 4.520593080724876, "grad_norm": 0.08206941570070521, "learning_rate": 2.22030693651611e-06, "loss": 0.3934, "step": 2744 }, { "epoch": 4.522240527182866, "grad_norm": 0.08565733143997381, "learning_rate": 2.205214860791838e-06, "loss": 0.3853, "step": 2745 }, { "epoch": 4.523887973640857, "grad_norm": 0.08253026867690105, "learning_rate": 2.1901727986746567e-06, "loss": 0.3802, "step": 2746 }, { "epoch": 4.525535420098847, "grad_norm": 0.08622509262120841, "learning_rate": 2.1751807700696357e-06, "loss": 0.386, "step": 2747 }, { "epoch": 4.527182866556837, "grad_norm": 0.08405186080162631, "learning_rate": 2.160238794815639e-06, "loss": 0.3874, "step": 2748 }, { "epoch": 4.528830313014827, "grad_norm": 0.0768448846641926, "learning_rate": 2.1453468926852893e-06, "loss": 0.3851, "step": 2749 }, { "epoch": 4.530477759472817, "grad_norm": 0.08844325819459861, "learning_rate": 2.130505083384944e-06, "loss": 0.3889, "step": 2750 }, { "epoch": 4.532125205930807, "grad_norm": 0.08701403828963969, "learning_rate": 2.1157133865546787e-06, "loss": 0.3891, "step": 2751 }, { "epoch": 4.533772652388797, "grad_norm": 0.08027132640696234, "learning_rate": 2.1009718217682627e-06, "loss": 0.3791, "step": 2752 }, { "epoch": 4.535420098846787, "grad_norm": 0.08419002763846271, "learning_rate": 2.0862804085331144e-06, "loss": 0.3832, "step": 2753 }, { "epoch": 4.537067545304778, "grad_norm": 0.07846894409942134, "learning_rate": 2.071639166290287e-06, "loss": 0.3874, "step": 2754 }, { "epoch": 4.538714991762768, "grad_norm": 0.08175231174095839, "learning_rate": 2.057048114414455e-06, "loss": 0.3877, "step": 2755 }, { "epoch": 4.5403624382207575, "grad_norm": 0.08905031631865257, "learning_rate": 2.0425072722138496e-06, "loss": 0.3878, "step": 2756 }, { "epoch": 4.542009884678748, "grad_norm": 0.07783634061580492, "learning_rate": 2.028016658930301e-06, "loss": 0.3802, "step": 2757 }, { "epoch": 4.543657331136738, "grad_norm": 0.07672220275738392, "learning_rate": 2.0135762937391366e-06, "loss": 0.3816, "step": 2758 }, { "epoch": 4.545304777594728, "grad_norm": 0.07825920169259983, "learning_rate": 1.999186195749201e-06, "loss": 0.3867, "step": 2759 }, { "epoch": 4.546952224052719, "grad_norm": 0.077876216840093, "learning_rate": 1.9848463840028297e-06, "loss": 0.3855, "step": 2760 }, { "epoch": 4.548599670510709, "grad_norm": 0.08153934982576498, "learning_rate": 1.9705568774757953e-06, "loss": 0.3814, "step": 2761 }, { "epoch": 4.550247116968698, "grad_norm": 0.08328452726012148, "learning_rate": 1.956317695077332e-06, "loss": 0.3886, "step": 2762 }, { "epoch": 4.551894563426688, "grad_norm": 0.08285725241054917, "learning_rate": 1.942128855650052e-06, "loss": 0.3831, "step": 2763 }, { "epoch": 4.553542009884679, "grad_norm": 0.08245904792989382, "learning_rate": 1.927990377969957e-06, "loss": 0.3846, "step": 2764 }, { "epoch": 4.555189456342669, "grad_norm": 0.08677371829474852, "learning_rate": 1.913902280746416e-06, "loss": 0.3914, "step": 2765 }, { "epoch": 4.556836902800659, "grad_norm": 0.07879133394339617, "learning_rate": 1.8998645826221062e-06, "loss": 0.3842, "step": 2766 }, { "epoch": 4.558484349258649, "grad_norm": 0.08457823160187378, "learning_rate": 1.8858773021730449e-06, "loss": 0.383, "step": 2767 }, { "epoch": 4.560131795716639, "grad_norm": 0.08029994785146889, "learning_rate": 1.8719404579085099e-06, "loss": 0.3814, "step": 2768 }, { "epoch": 4.561779242174629, "grad_norm": 0.09340568383273609, "learning_rate": 1.858054068271038e-06, "loss": 0.3883, "step": 2769 }, { "epoch": 4.563426688632619, "grad_norm": 0.08656577736349354, "learning_rate": 1.844218151636401e-06, "loss": 0.3839, "step": 2770 }, { "epoch": 4.56507413509061, "grad_norm": 0.08187489653015005, "learning_rate": 1.8304327263135845e-06, "loss": 0.385, "step": 2771 }, { "epoch": 4.5667215815486, "grad_norm": 0.08219139355839501, "learning_rate": 1.8166978105447608e-06, "loss": 0.3884, "step": 2772 }, { "epoch": 4.56836902800659, "grad_norm": 0.0860228318490402, "learning_rate": 1.8030134225052576e-06, "loss": 0.3787, "step": 2773 }, { "epoch": 4.57001647446458, "grad_norm": 0.07858953820289587, "learning_rate": 1.7893795803035408e-06, "loss": 0.3719, "step": 2774 }, { "epoch": 4.57166392092257, "grad_norm": 0.08731812610813014, "learning_rate": 1.775796301981192e-06, "loss": 0.3868, "step": 2775 }, { "epoch": 4.57331136738056, "grad_norm": 0.08271016713721731, "learning_rate": 1.7622636055128728e-06, "loss": 0.3882, "step": 2776 }, { "epoch": 4.57495881383855, "grad_norm": 0.08089394910829234, "learning_rate": 1.7487815088063298e-06, "loss": 0.3761, "step": 2777 }, { "epoch": 4.576606260296541, "grad_norm": 0.08876144688274779, "learning_rate": 1.7353500297023318e-06, "loss": 0.381, "step": 2778 }, { "epoch": 4.578253706754531, "grad_norm": 0.08453913257930455, "learning_rate": 1.7219691859746701e-06, "loss": 0.3764, "step": 2779 }, { "epoch": 4.5799011532125204, "grad_norm": 0.07844046800531515, "learning_rate": 1.7086389953301363e-06, "loss": 0.3891, "step": 2780 }, { "epoch": 4.58154859967051, "grad_norm": 0.07741775969429661, "learning_rate": 1.6953594754084867e-06, "loss": 0.383, "step": 2781 }, { "epoch": 4.583196046128501, "grad_norm": 0.07695586860169058, "learning_rate": 1.6821306437824336e-06, "loss": 0.3763, "step": 2782 }, { "epoch": 4.584843492586491, "grad_norm": 0.0871027966786418, "learning_rate": 1.6689525179576049e-06, "loss": 0.3878, "step": 2783 }, { "epoch": 4.586490939044481, "grad_norm": 0.07876965257567517, "learning_rate": 1.6558251153725357e-06, "loss": 0.3895, "step": 2784 }, { "epoch": 4.5881383855024716, "grad_norm": 0.08357976921667967, "learning_rate": 1.6427484533986326e-06, "loss": 0.3897, "step": 2785 }, { "epoch": 4.589785831960461, "grad_norm": 0.07777683539420732, "learning_rate": 1.6297225493401735e-06, "loss": 0.3851, "step": 2786 }, { "epoch": 4.591433278418451, "grad_norm": 0.07789857822751922, "learning_rate": 1.616747420434246e-06, "loss": 0.3859, "step": 2787 }, { "epoch": 4.593080724876441, "grad_norm": 0.08359880311262508, "learning_rate": 1.6038230838507597e-06, "loss": 0.3913, "step": 2788 }, { "epoch": 4.594728171334432, "grad_norm": 0.08104333955340083, "learning_rate": 1.590949556692416e-06, "loss": 0.381, "step": 2789 }, { "epoch": 4.596375617792422, "grad_norm": 0.07850025141604078, "learning_rate": 1.5781268559946639e-06, "loss": 0.3824, "step": 2790 }, { "epoch": 4.598023064250412, "grad_norm": 0.07953161120203318, "learning_rate": 1.5653549987257165e-06, "loss": 0.3861, "step": 2791 }, { "epoch": 4.5996705107084015, "grad_norm": 0.0807570444935751, "learning_rate": 1.5526340017864906e-06, "loss": 0.3824, "step": 2792 }, { "epoch": 4.601317957166392, "grad_norm": 0.07977437175698039, "learning_rate": 1.5399638820106e-06, "loss": 0.3889, "step": 2793 }, { "epoch": 4.602965403624382, "grad_norm": 0.08455595800177194, "learning_rate": 1.5273446561643358e-06, "loss": 0.3901, "step": 2794 }, { "epoch": 4.604612850082372, "grad_norm": 0.08241410088111947, "learning_rate": 1.5147763409466421e-06, "loss": 0.3835, "step": 2795 }, { "epoch": 4.606260296540363, "grad_norm": 0.07788597883626798, "learning_rate": 1.5022589529890996e-06, "loss": 0.385, "step": 2796 }, { "epoch": 4.607907742998353, "grad_norm": 0.0780473466031327, "learning_rate": 1.4897925088558851e-06, "loss": 0.3838, "step": 2797 }, { "epoch": 4.6095551894563425, "grad_norm": 0.07978669192097296, "learning_rate": 1.4773770250437713e-06, "loss": 0.3847, "step": 2798 }, { "epoch": 4.611202635914333, "grad_norm": 0.08048914688924316, "learning_rate": 1.4650125179820874e-06, "loss": 0.392, "step": 2799 }, { "epoch": 4.612850082372323, "grad_norm": 0.0775296376334266, "learning_rate": 1.4526990040327093e-06, "loss": 0.3851, "step": 2800 }, { "epoch": 4.614497528830313, "grad_norm": 0.07859237586512227, "learning_rate": 1.4404364994900388e-06, "loss": 0.3894, "step": 2801 }, { "epoch": 4.616144975288303, "grad_norm": 0.08102843741550522, "learning_rate": 1.428225020580971e-06, "loss": 0.3882, "step": 2802 }, { "epoch": 4.617792421746294, "grad_norm": 0.07839815086434243, "learning_rate": 1.4160645834648822e-06, "loss": 0.3867, "step": 2803 }, { "epoch": 4.619439868204283, "grad_norm": 0.08431836564583027, "learning_rate": 1.4039552042336025e-06, "loss": 0.3845, "step": 2804 }, { "epoch": 4.621087314662273, "grad_norm": 0.07906788244458075, "learning_rate": 1.3918968989113979e-06, "loss": 0.3823, "step": 2805 }, { "epoch": 4.622734761120263, "grad_norm": 0.12866675431920932, "learning_rate": 1.379889683454949e-06, "loss": 0.386, "step": 2806 }, { "epoch": 4.624382207578254, "grad_norm": 0.07628260582854698, "learning_rate": 1.3679335737533373e-06, "loss": 0.3848, "step": 2807 }, { "epoch": 4.626029654036244, "grad_norm": 0.07616934802372154, "learning_rate": 1.3560285856280043e-06, "loss": 0.3839, "step": 2808 }, { "epoch": 4.627677100494234, "grad_norm": 0.07709593601585023, "learning_rate": 1.3441747348327484e-06, "loss": 0.3775, "step": 2809 }, { "epoch": 4.629324546952224, "grad_norm": 0.0831031126787321, "learning_rate": 1.3323720370536886e-06, "loss": 0.3868, "step": 2810 }, { "epoch": 4.630971993410214, "grad_norm": 0.08204053183491597, "learning_rate": 1.3206205079092827e-06, "loss": 0.3848, "step": 2811 }, { "epoch": 4.632619439868204, "grad_norm": 0.08374495195920346, "learning_rate": 1.308920162950238e-06, "loss": 0.3816, "step": 2812 }, { "epoch": 4.634266886326195, "grad_norm": 0.07985641360115057, "learning_rate": 1.2972710176595605e-06, "loss": 0.3852, "step": 2813 }, { "epoch": 4.635914332784185, "grad_norm": 0.07758175016860855, "learning_rate": 1.285673087452488e-06, "loss": 0.3771, "step": 2814 }, { "epoch": 4.637561779242175, "grad_norm": 0.07738903503987388, "learning_rate": 1.2741263876764864e-06, "loss": 0.3859, "step": 2815 }, { "epoch": 4.6392092257001645, "grad_norm": 0.07530577159154588, "learning_rate": 1.2626309336112396e-06, "loss": 0.3867, "step": 2816 }, { "epoch": 4.640856672158155, "grad_norm": 0.08143045567249337, "learning_rate": 1.2511867404686108e-06, "loss": 0.3849, "step": 2817 }, { "epoch": 4.642504118616145, "grad_norm": 0.07664100192840106, "learning_rate": 1.2397938233926276e-06, "loss": 0.3796, "step": 2818 }, { "epoch": 4.644151565074135, "grad_norm": 0.0780493166645864, "learning_rate": 1.2284521974594666e-06, "loss": 0.3856, "step": 2819 }, { "epoch": 4.645799011532125, "grad_norm": 0.08410816923746604, "learning_rate": 1.217161877677433e-06, "loss": 0.3851, "step": 2820 }, { "epoch": 4.647446457990116, "grad_norm": 0.0833613136885151, "learning_rate": 1.2059228789869403e-06, "loss": 0.3861, "step": 2821 }, { "epoch": 4.649093904448105, "grad_norm": 0.08159873935912111, "learning_rate": 1.1947352162604874e-06, "loss": 0.3876, "step": 2822 }, { "epoch": 4.650741350906095, "grad_norm": 0.07613262703214137, "learning_rate": 1.183598904302632e-06, "loss": 0.3834, "step": 2823 }, { "epoch": 4.652388797364086, "grad_norm": 0.08106615714997698, "learning_rate": 1.17251395784999e-06, "loss": 0.3772, "step": 2824 }, { "epoch": 4.654036243822076, "grad_norm": 0.07857070672620534, "learning_rate": 1.161480391571206e-06, "loss": 0.3892, "step": 2825 }, { "epoch": 4.655683690280066, "grad_norm": 0.10025746140103033, "learning_rate": 1.1504982200669335e-06, "loss": 0.388, "step": 2826 }, { "epoch": 4.657331136738056, "grad_norm": 0.07991603972554027, "learning_rate": 1.1395674578698057e-06, "loss": 0.3846, "step": 2827 }, { "epoch": 4.658978583196046, "grad_norm": 0.08061436721447347, "learning_rate": 1.1286881194444343e-06, "loss": 0.3886, "step": 2828 }, { "epoch": 4.660626029654036, "grad_norm": 0.07796567730717031, "learning_rate": 1.1178602191873789e-06, "loss": 0.3938, "step": 2829 }, { "epoch": 4.662273476112026, "grad_norm": 0.0856286756931493, "learning_rate": 1.107083771427142e-06, "loss": 0.3889, "step": 2830 }, { "epoch": 4.663920922570016, "grad_norm": 0.08156189314023689, "learning_rate": 1.0963587904241302e-06, "loss": 0.3833, "step": 2831 }, { "epoch": 4.665568369028007, "grad_norm": 0.0814936533931284, "learning_rate": 1.085685290370635e-06, "loss": 0.3879, "step": 2832 }, { "epoch": 4.667215815485997, "grad_norm": 0.07787626102856171, "learning_rate": 1.0750632853908384e-06, "loss": 0.3873, "step": 2833 }, { "epoch": 4.6688632619439865, "grad_norm": 0.07693941162336897, "learning_rate": 1.0644927895407765e-06, "loss": 0.3872, "step": 2834 }, { "epoch": 4.670510708401977, "grad_norm": 0.07872195038679022, "learning_rate": 1.053973816808318e-06, "loss": 0.3892, "step": 2835 }, { "epoch": 4.672158154859967, "grad_norm": 0.07883836191888999, "learning_rate": 1.0435063811131596e-06, "loss": 0.3804, "step": 2836 }, { "epoch": 4.673805601317957, "grad_norm": 0.08660860879215461, "learning_rate": 1.0330904963067944e-06, "loss": 0.3869, "step": 2837 }, { "epoch": 4.675453047775948, "grad_norm": 0.0804066470615869, "learning_rate": 1.022726176172495e-06, "loss": 0.3871, "step": 2838 }, { "epoch": 4.677100494233938, "grad_norm": 0.07841601455076505, "learning_rate": 1.012413434425299e-06, "loss": 0.3845, "step": 2839 }, { "epoch": 4.6787479406919275, "grad_norm": 0.08142306371948627, "learning_rate": 1.0021522847120013e-06, "loss": 0.3773, "step": 2840 }, { "epoch": 4.680395387149917, "grad_norm": 0.07886597882216739, "learning_rate": 9.919427406111182e-07, "loss": 0.3875, "step": 2841 }, { "epoch": 4.682042833607908, "grad_norm": 0.07948303171764094, "learning_rate": 9.817848156328735e-07, "loss": 0.3827, "step": 2842 }, { "epoch": 4.683690280065898, "grad_norm": 0.07761095626180033, "learning_rate": 9.716785232191906e-07, "loss": 0.3852, "step": 2843 }, { "epoch": 4.685337726523888, "grad_norm": 0.07849258861710101, "learning_rate": 9.616238767436602e-07, "loss": 0.3851, "step": 2844 }, { "epoch": 4.686985172981878, "grad_norm": 0.07737783187840605, "learning_rate": 9.51620889511533e-07, "loss": 0.3848, "step": 2845 }, { "epoch": 4.688632619439868, "grad_norm": 0.07511884478478763, "learning_rate": 9.416695747597093e-07, "loss": 0.38, "step": 2846 }, { "epoch": 4.690280065897858, "grad_norm": 0.07869921914036107, "learning_rate": 9.317699456566953e-07, "loss": 0.389, "step": 2847 }, { "epoch": 4.691927512355848, "grad_norm": 0.08665194803101353, "learning_rate": 9.219220153026121e-07, "loss": 0.386, "step": 2848 }, { "epoch": 4.693574958813839, "grad_norm": 0.07766077392293845, "learning_rate": 9.121257967291553e-07, "loss": 0.3855, "step": 2849 }, { "epoch": 4.695222405271829, "grad_norm": 0.08055076969386823, "learning_rate": 9.023813028996176e-07, "loss": 0.385, "step": 2850 }, { "epoch": 4.696869851729819, "grad_norm": 0.08121942610834537, "learning_rate": 8.92688546708822e-07, "loss": 0.3776, "step": 2851 }, { "epoch": 4.698517298187809, "grad_norm": 0.07567807731157844, "learning_rate": 8.830475409831307e-07, "loss": 0.3821, "step": 2852 }, { "epoch": 4.700164744645799, "grad_norm": 0.07847243957438291, "learning_rate": 8.73458298480423e-07, "loss": 0.3869, "step": 2853 }, { "epoch": 4.701812191103789, "grad_norm": 0.07746037986568258, "learning_rate": 8.639208318900949e-07, "loss": 0.3805, "step": 2854 }, { "epoch": 4.703459637561779, "grad_norm": 0.07548315677318675, "learning_rate": 8.544351538330109e-07, "loss": 0.3874, "step": 2855 }, { "epoch": 4.70510708401977, "grad_norm": 0.07950512241506763, "learning_rate": 8.450012768615079e-07, "loss": 0.3827, "step": 2856 }, { "epoch": 4.70675453047776, "grad_norm": 0.07777146763251756, "learning_rate": 8.356192134593865e-07, "loss": 0.3878, "step": 2857 }, { "epoch": 4.7084019769357495, "grad_norm": 0.08041635595859857, "learning_rate": 8.262889760418624e-07, "loss": 0.3789, "step": 2858 }, { "epoch": 4.710049423393739, "grad_norm": 0.08082647648797922, "learning_rate": 8.170105769555925e-07, "loss": 0.3742, "step": 2859 }, { "epoch": 4.71169686985173, "grad_norm": 0.07750606501558786, "learning_rate": 8.07784028478622e-07, "loss": 0.3911, "step": 2860 }, { "epoch": 4.71334431630972, "grad_norm": 0.07794302614952385, "learning_rate": 7.986093428203801e-07, "loss": 0.3893, "step": 2861 }, { "epoch": 4.71499176276771, "grad_norm": 0.07629854768195607, "learning_rate": 7.894865321216793e-07, "loss": 0.3868, "step": 2862 }, { "epoch": 4.716639209225701, "grad_norm": 0.07999611471815554, "learning_rate": 7.804156084546721e-07, "loss": 0.3847, "step": 2863 }, { "epoch": 4.71828665568369, "grad_norm": 0.07820929843529872, "learning_rate": 7.713965838228588e-07, "loss": 0.3846, "step": 2864 }, { "epoch": 4.71993410214168, "grad_norm": 0.08362280179790177, "learning_rate": 7.624294701610657e-07, "loss": 0.3882, "step": 2865 }, { "epoch": 4.72158154859967, "grad_norm": 0.07745184363532176, "learning_rate": 7.5351427933541e-07, "loss": 0.3797, "step": 2866 }, { "epoch": 4.723228995057661, "grad_norm": 0.07821931398932946, "learning_rate": 7.446510231433169e-07, "loss": 0.3813, "step": 2867 }, { "epoch": 4.724876441515651, "grad_norm": 0.07734644218944206, "learning_rate": 7.358397133134665e-07, "loss": 0.3905, "step": 2868 }, { "epoch": 4.726523887973641, "grad_norm": 0.07877853504951128, "learning_rate": 7.270803615058253e-07, "loss": 0.3858, "step": 2869 }, { "epoch": 4.7281713344316305, "grad_norm": 0.07367477452836899, "learning_rate": 7.183729793115834e-07, "loss": 0.3835, "step": 2870 }, { "epoch": 4.729818780889621, "grad_norm": 0.07728824939126458, "learning_rate": 7.09717578253164e-07, "loss": 0.3842, "step": 2871 }, { "epoch": 4.731466227347611, "grad_norm": 0.07878634806358763, "learning_rate": 7.011141697842095e-07, "loss": 0.3814, "step": 2872 }, { "epoch": 4.733113673805601, "grad_norm": 0.08600095051142831, "learning_rate": 6.92562765289555e-07, "loss": 0.3904, "step": 2873 }, { "epoch": 4.734761120263592, "grad_norm": 0.08046990039817462, "learning_rate": 6.840633760852288e-07, "loss": 0.3841, "step": 2874 }, { "epoch": 4.736408566721582, "grad_norm": 0.07911240720746135, "learning_rate": 6.756160134184119e-07, "loss": 0.3861, "step": 2875 }, { "epoch": 4.7380560131795715, "grad_norm": 0.07517744199478563, "learning_rate": 6.67220688467447e-07, "loss": 0.389, "step": 2876 }, { "epoch": 4.739703459637562, "grad_norm": 0.07728781718661555, "learning_rate": 6.588774123418251e-07, "loss": 0.3793, "step": 2877 }, { "epoch": 4.741350906095552, "grad_norm": 0.07505345592248414, "learning_rate": 6.505861960821413e-07, "loss": 0.3827, "step": 2878 }, { "epoch": 4.742998352553542, "grad_norm": 0.07678977228300644, "learning_rate": 6.423470506601214e-07, "loss": 0.3898, "step": 2879 }, { "epoch": 4.744645799011532, "grad_norm": 0.07673922323688424, "learning_rate": 6.341599869785686e-07, "loss": 0.3879, "step": 2880 }, { "epoch": 4.746293245469523, "grad_norm": 0.07861273641333111, "learning_rate": 6.260250158713766e-07, "loss": 0.3844, "step": 2881 }, { "epoch": 4.7479406919275124, "grad_norm": 0.0789642117415223, "learning_rate": 6.17942148103503e-07, "loss": 0.3827, "step": 2882 }, { "epoch": 4.749588138385502, "grad_norm": 0.07574272515683943, "learning_rate": 6.099113943709523e-07, "loss": 0.3837, "step": 2883 }, { "epoch": 4.751235584843492, "grad_norm": 0.07862495462402772, "learning_rate": 6.019327653007789e-07, "loss": 0.3822, "step": 2884 }, { "epoch": 4.752883031301483, "grad_norm": 0.07760611557176916, "learning_rate": 5.94006271451053e-07, "loss": 0.3835, "step": 2885 }, { "epoch": 4.754530477759473, "grad_norm": 0.07745808005461098, "learning_rate": 5.861319233108509e-07, "loss": 0.3864, "step": 2886 }, { "epoch": 4.756177924217463, "grad_norm": 0.0736391439168703, "learning_rate": 5.783097313002506e-07, "loss": 0.3808, "step": 2887 }, { "epoch": 4.757825370675453, "grad_norm": 0.07954228442690203, "learning_rate": 5.705397057703099e-07, "loss": 0.3869, "step": 2888 }, { "epoch": 4.759472817133443, "grad_norm": 0.08049195344306917, "learning_rate": 5.628218570030619e-07, "loss": 0.3808, "step": 2889 }, { "epoch": 4.761120263591433, "grad_norm": 0.07963647384534461, "learning_rate": 5.551561952114793e-07, "loss": 0.3881, "step": 2890 }, { "epoch": 4.762767710049423, "grad_norm": 0.07893640687914434, "learning_rate": 5.475427305394921e-07, "loss": 0.3815, "step": 2891 }, { "epoch": 4.764415156507414, "grad_norm": 0.07697917817736355, "learning_rate": 5.399814730619479e-07, "loss": 0.3775, "step": 2892 }, { "epoch": 4.766062602965404, "grad_norm": 0.0794628743935022, "learning_rate": 5.324724327846165e-07, "loss": 0.3772, "step": 2893 }, { "epoch": 4.7677100494233935, "grad_norm": 0.0805195295393744, "learning_rate": 5.25015619644158e-07, "loss": 0.3862, "step": 2894 }, { "epoch": 4.769357495881383, "grad_norm": 0.08090028928070922, "learning_rate": 5.176110435081327e-07, "loss": 0.3813, "step": 2895 }, { "epoch": 4.771004942339374, "grad_norm": 0.07980904078896382, "learning_rate": 5.102587141749693e-07, "loss": 0.3828, "step": 2896 }, { "epoch": 4.772652388797364, "grad_norm": 0.0776387954246342, "learning_rate": 5.029586413739607e-07, "loss": 0.3845, "step": 2897 }, { "epoch": 4.774299835255354, "grad_norm": 0.07743408160259282, "learning_rate": 4.957108347652462e-07, "loss": 0.3878, "step": 2898 }, { "epoch": 4.775947281713345, "grad_norm": 0.07698907488006328, "learning_rate": 4.885153039398072e-07, "loss": 0.3882, "step": 2899 }, { "epoch": 4.7775947281713345, "grad_norm": 0.07581365663592399, "learning_rate": 4.813720584194404e-07, "loss": 0.3846, "step": 2900 }, { "epoch": 4.779242174629324, "grad_norm": 0.0790503180719633, "learning_rate": 4.742811076567666e-07, "loss": 0.3888, "step": 2901 }, { "epoch": 4.780889621087315, "grad_norm": 0.07741397755365283, "learning_rate": 4.672424610351867e-07, "loss": 0.3889, "step": 2902 }, { "epoch": 4.782537067545305, "grad_norm": 0.07671421504103544, "learning_rate": 4.6025612786890773e-07, "loss": 0.3852, "step": 2903 }, { "epoch": 4.784184514003295, "grad_norm": 0.08299905974064209, "learning_rate": 4.53322117402899e-07, "loss": 0.3838, "step": 2904 }, { "epoch": 4.785831960461285, "grad_norm": 0.0774485481515933, "learning_rate": 4.4644043881289177e-07, "loss": 0.3956, "step": 2905 }, { "epoch": 4.787479406919275, "grad_norm": 0.0834869285021157, "learning_rate": 4.396111012053705e-07, "loss": 0.3878, "step": 2906 }, { "epoch": 4.789126853377265, "grad_norm": 0.08053546009906382, "learning_rate": 4.3283411361755513e-07, "loss": 0.3875, "step": 2907 }, { "epoch": 4.790774299835255, "grad_norm": 0.07498068612938273, "learning_rate": 4.261094850173919e-07, "loss": 0.385, "step": 2908 }, { "epoch": 4.792421746293245, "grad_norm": 0.07497902639216182, "learning_rate": 4.1943722430354494e-07, "loss": 0.3825, "step": 2909 }, { "epoch": 4.794069192751236, "grad_norm": 0.08050116512486423, "learning_rate": 4.1281734030536925e-07, "loss": 0.3877, "step": 2910 }, { "epoch": 4.795716639209226, "grad_norm": 0.07992358666600001, "learning_rate": 4.0624984178291973e-07, "loss": 0.3807, "step": 2911 }, { "epoch": 4.7973640856672155, "grad_norm": 0.07621476978665485, "learning_rate": 3.997347374269245e-07, "loss": 0.3891, "step": 2912 }, { "epoch": 4.799011532125206, "grad_norm": 0.07662036699258935, "learning_rate": 3.9327203585878494e-07, "loss": 0.3842, "step": 2913 }, { "epoch": 4.800658978583196, "grad_norm": 0.07393144262841901, "learning_rate": 3.8686174563055344e-07, "loss": 0.3783, "step": 2914 }, { "epoch": 4.802306425041186, "grad_norm": 0.07618140165841208, "learning_rate": 3.8050387522492904e-07, "loss": 0.389, "step": 2915 }, { "epoch": 4.803953871499177, "grad_norm": 0.0875886544274742, "learning_rate": 3.7419843305523505e-07, "loss": 0.3826, "step": 2916 }, { "epoch": 4.805601317957167, "grad_norm": 0.08036031987224777, "learning_rate": 3.679454274654282e-07, "loss": 0.3877, "step": 2917 }, { "epoch": 4.8072487644151565, "grad_norm": 0.07743055469512798, "learning_rate": 3.617448667300805e-07, "loss": 0.3873, "step": 2918 }, { "epoch": 4.808896210873146, "grad_norm": 0.0803895002659614, "learning_rate": 3.555967590543441e-07, "loss": 0.3955, "step": 2919 }, { "epoch": 4.810543657331137, "grad_norm": 0.0771952832914062, "learning_rate": 3.4950111257398225e-07, "loss": 0.3869, "step": 2920 }, { "epoch": 4.812191103789127, "grad_norm": 0.07593698735793164, "learning_rate": 3.434579353553158e-07, "loss": 0.3831, "step": 2921 }, { "epoch": 4.813838550247117, "grad_norm": 0.07519439489489768, "learning_rate": 3.3746723539525463e-07, "loss": 0.3853, "step": 2922 }, { "epoch": 4.815485996705107, "grad_norm": 0.07534119824332079, "learning_rate": 3.315290206212485e-07, "loss": 0.3849, "step": 2923 }, { "epoch": 4.817133443163097, "grad_norm": 0.08172652881330557, "learning_rate": 3.256432988913005e-07, "loss": 0.3828, "step": 2924 }, { "epoch": 4.818780889621087, "grad_norm": 0.07578384151105047, "learning_rate": 3.198100779939539e-07, "loss": 0.3877, "step": 2925 }, { "epoch": 4.820428336079077, "grad_norm": 0.07187199183039947, "learning_rate": 3.1402936564826956e-07, "loss": 0.378, "step": 2926 }, { "epoch": 4.822075782537068, "grad_norm": 0.07601790911125526, "learning_rate": 3.0830116950383514e-07, "loss": 0.3793, "step": 2927 }, { "epoch": 4.823723228995058, "grad_norm": 0.07736441518329262, "learning_rate": 3.0262549714073386e-07, "loss": 0.3887, "step": 2928 }, { "epoch": 4.825370675453048, "grad_norm": 0.07869452390130681, "learning_rate": 2.970023560695445e-07, "loss": 0.3822, "step": 2929 }, { "epoch": 4.8270181219110375, "grad_norm": 0.07580434256185925, "learning_rate": 2.914317537313416e-07, "loss": 0.3876, "step": 2930 }, { "epoch": 4.828665568369028, "grad_norm": 0.0784771652201865, "learning_rate": 2.859136974976595e-07, "loss": 0.3842, "step": 2931 }, { "epoch": 4.830313014827018, "grad_norm": 0.07755876601227585, "learning_rate": 2.8044819467051955e-07, "loss": 0.3807, "step": 2932 }, { "epoch": 4.831960461285008, "grad_norm": 0.07633577514352248, "learning_rate": 2.7503525248238515e-07, "loss": 0.3889, "step": 2933 }, { "epoch": 4.833607907742998, "grad_norm": 0.07606905218505185, "learning_rate": 2.6967487809616224e-07, "loss": 0.3932, "step": 2934 }, { "epoch": 4.835255354200989, "grad_norm": 0.07479787150428321, "learning_rate": 2.643670786052077e-07, "loss": 0.3891, "step": 2935 }, { "epoch": 4.8369028006589785, "grad_norm": 0.07762731802077885, "learning_rate": 2.591118610332988e-07, "loss": 0.3865, "step": 2936 }, { "epoch": 4.838550247116968, "grad_norm": 0.078652068899439, "learning_rate": 2.539092323346326e-07, "loss": 0.3829, "step": 2937 }, { "epoch": 4.840197693574959, "grad_norm": 0.0769513873343332, "learning_rate": 2.4875919939381766e-07, "loss": 0.3806, "step": 2938 }, { "epoch": 4.841845140032949, "grad_norm": 0.08040905521243723, "learning_rate": 2.4366176902586024e-07, "loss": 0.3908, "step": 2939 }, { "epoch": 4.843492586490939, "grad_norm": 0.07981492045187713, "learning_rate": 2.386169479761513e-07, "loss": 0.38, "step": 2940 }, { "epoch": 4.84514003294893, "grad_norm": 0.07641352353812103, "learning_rate": 2.3362474292047964e-07, "loss": 0.3827, "step": 2941 }, { "epoch": 4.8467874794069195, "grad_norm": 0.08137120190845122, "learning_rate": 2.2868516046500088e-07, "loss": 0.3867, "step": 2942 }, { "epoch": 4.848434925864909, "grad_norm": 0.08535398922727404, "learning_rate": 2.2379820714622858e-07, "loss": 0.3829, "step": 2943 }, { "epoch": 4.850082372322899, "grad_norm": 0.07931814711330239, "learning_rate": 2.1896388943104307e-07, "loss": 0.392, "step": 2944 }, { "epoch": 4.85172981878089, "grad_norm": 0.07862311572282175, "learning_rate": 2.1418221371666048e-07, "loss": 0.3839, "step": 2945 }, { "epoch": 4.85337726523888, "grad_norm": 0.0766064408954837, "learning_rate": 2.0945318633064593e-07, "loss": 0.3852, "step": 2946 }, { "epoch": 4.85502471169687, "grad_norm": 0.07536549171193861, "learning_rate": 2.0477681353089584e-07, "loss": 0.3846, "step": 2947 }, { "epoch": 4.8566721581548595, "grad_norm": 0.07733627192898278, "learning_rate": 2.0015310150562462e-07, "loss": 0.3864, "step": 2948 }, { "epoch": 4.85831960461285, "grad_norm": 0.07660580688439216, "learning_rate": 1.9558205637336013e-07, "loss": 0.392, "step": 2949 }, { "epoch": 4.85996705107084, "grad_norm": 0.07500024359677103, "learning_rate": 1.9106368418293496e-07, "loss": 0.3792, "step": 2950 }, { "epoch": 4.86161449752883, "grad_norm": 0.07538414477070297, "learning_rate": 1.8659799091349073e-07, "loss": 0.3908, "step": 2951 }, { "epoch": 4.863261943986821, "grad_norm": 0.07827286709949922, "learning_rate": 1.8218498247444706e-07, "loss": 0.384, "step": 2952 }, { "epoch": 4.864909390444811, "grad_norm": 0.07437834269279703, "learning_rate": 1.778246647055104e-07, "loss": 0.3821, "step": 2953 }, { "epoch": 4.8665568369028005, "grad_norm": 0.07436192505492323, "learning_rate": 1.735170433766653e-07, "loss": 0.3763, "step": 2954 }, { "epoch": 4.868204283360791, "grad_norm": 0.07761902206986661, "learning_rate": 1.692621241881609e-07, "loss": 0.3879, "step": 2955 }, { "epoch": 4.869851729818781, "grad_norm": 0.07581202389477573, "learning_rate": 1.6505991277050214e-07, "loss": 0.3807, "step": 2956 }, { "epoch": 4.871499176276771, "grad_norm": 0.0746972536554942, "learning_rate": 1.6091041468445423e-07, "loss": 0.3795, "step": 2957 }, { "epoch": 4.873146622734761, "grad_norm": 0.07689579895488967, "learning_rate": 1.5681363542102034e-07, "loss": 0.3916, "step": 2958 }, { "epoch": 4.874794069192752, "grad_norm": 0.07699310931076818, "learning_rate": 1.527695804014373e-07, "loss": 0.3814, "step": 2959 }, { "epoch": 4.8764415156507415, "grad_norm": 0.07961531556294037, "learning_rate": 1.4877825497718878e-07, "loss": 0.3882, "step": 2960 }, { "epoch": 4.878088962108731, "grad_norm": 0.07830546663057353, "learning_rate": 1.4483966442996546e-07, "loss": 0.3777, "step": 2961 }, { "epoch": 4.879736408566721, "grad_norm": 0.07784343908575368, "learning_rate": 1.4095381397167818e-07, "loss": 0.3825, "step": 2962 }, { "epoch": 4.881383855024712, "grad_norm": 0.07595138022806021, "learning_rate": 1.371207087444537e-07, "loss": 0.3903, "step": 2963 }, { "epoch": 4.883031301482702, "grad_norm": 0.07527435464114501, "learning_rate": 1.3334035382061683e-07, "loss": 0.3859, "step": 2964 }, { "epoch": 4.884678747940692, "grad_norm": 0.07780228151135386, "learning_rate": 1.296127542026815e-07, "loss": 0.383, "step": 2965 }, { "epoch": 4.886326194398682, "grad_norm": 0.07602164070151451, "learning_rate": 1.2593791482336415e-07, "loss": 0.3841, "step": 2966 }, { "epoch": 4.887973640856672, "grad_norm": 0.07691835431008359, "learning_rate": 1.223158405455571e-07, "loss": 0.387, "step": 2967 }, { "epoch": 4.889621087314662, "grad_norm": 0.07435671226286901, "learning_rate": 1.1874653616232412e-07, "loss": 0.3876, "step": 2968 }, { "epoch": 4.891268533772652, "grad_norm": 0.07657616973650183, "learning_rate": 1.1523000639690474e-07, "loss": 0.3829, "step": 2969 }, { "epoch": 4.892915980230643, "grad_norm": 0.07819050580942709, "learning_rate": 1.117662559027055e-07, "loss": 0.3759, "step": 2970 }, { "epoch": 4.894563426688633, "grad_norm": 0.07401549648245499, "learning_rate": 1.083552892632822e-07, "loss": 0.3852, "step": 2971 }, { "epoch": 4.8962108731466225, "grad_norm": 0.07707801892580822, "learning_rate": 1.0499711099234421e-07, "loss": 0.3924, "step": 2972 }, { "epoch": 4.897858319604612, "grad_norm": 0.07530365276343606, "learning_rate": 1.0169172553375461e-07, "loss": 0.3837, "step": 2973 }, { "epoch": 4.899505766062603, "grad_norm": 0.07991355211784461, "learning_rate": 9.843913726150789e-08, "loss": 0.393, "step": 2974 }, { "epoch": 4.901153212520593, "grad_norm": 0.0772778386642492, "learning_rate": 9.523935047972998e-08, "loss": 0.3851, "step": 2975 }, { "epoch": 4.902800658978583, "grad_norm": 0.07626679249371565, "learning_rate": 9.209236942268273e-08, "loss": 0.3879, "step": 2976 }, { "epoch": 4.904448105436574, "grad_norm": 0.07506097434069711, "learning_rate": 8.899819825474609e-08, "loss": 0.381, "step": 2977 }, { "epoch": 4.9060955518945635, "grad_norm": 0.08040382836093886, "learning_rate": 8.595684107041369e-08, "loss": 0.378, "step": 2978 }, { "epoch": 4.907742998352553, "grad_norm": 0.07533856969277168, "learning_rate": 8.296830189430171e-08, "loss": 0.3904, "step": 2979 }, { "epoch": 4.909390444810544, "grad_norm": 0.07561383332212916, "learning_rate": 8.00325846811223e-08, "loss": 0.3839, "step": 2980 }, { "epoch": 4.911037891268534, "grad_norm": 0.07928800500695035, "learning_rate": 7.714969331569233e-08, "loss": 0.3839, "step": 2981 }, { "epoch": 4.912685337726524, "grad_norm": 0.0771172452039934, "learning_rate": 7.431963161292465e-08, "loss": 0.3893, "step": 2982 }, { "epoch": 4.914332784184514, "grad_norm": 0.07673033925281132, "learning_rate": 7.154240331782358e-08, "loss": 0.3898, "step": 2983 }, { "epoch": 4.9159802306425044, "grad_norm": 0.07682663511873832, "learning_rate": 6.881801210547157e-08, "loss": 0.3911, "step": 2984 }, { "epoch": 4.917627677100494, "grad_norm": 0.07609261610968633, "learning_rate": 6.614646158104698e-08, "loss": 0.3908, "step": 2985 }, { "epoch": 4.919275123558484, "grad_norm": 0.07662533895748642, "learning_rate": 6.352775527979304e-08, "loss": 0.3871, "step": 2986 }, { "epoch": 4.920922570016474, "grad_norm": 0.07901748605747504, "learning_rate": 6.096189666703112e-08, "loss": 0.3839, "step": 2987 }, { "epoch": 4.922570016474465, "grad_norm": 0.07526362536800751, "learning_rate": 5.844888913813851e-08, "loss": 0.3839, "step": 2988 }, { "epoch": 4.924217462932455, "grad_norm": 0.07366296367668292, "learning_rate": 5.598873601857069e-08, "loss": 0.386, "step": 2989 }, { "epoch": 4.9258649093904445, "grad_norm": 0.07445059487140077, "learning_rate": 5.358144056382575e-08, "loss": 0.3913, "step": 2990 }, { "epoch": 4.927512355848435, "grad_norm": 0.07261279117097154, "learning_rate": 5.1227005959471056e-08, "loss": 0.3814, "step": 2991 }, { "epoch": 4.929159802306425, "grad_norm": 0.07546422966211475, "learning_rate": 4.8925435321107715e-08, "loss": 0.3923, "step": 2992 }, { "epoch": 4.930807248764415, "grad_norm": 0.07414434022989147, "learning_rate": 4.66767316943928e-08, "loss": 0.3845, "step": 2993 }, { "epoch": 4.932454695222406, "grad_norm": 0.07507233091574107, "learning_rate": 4.448089805502598e-08, "loss": 0.3827, "step": 2994 }, { "epoch": 4.934102141680396, "grad_norm": 0.07417784602502427, "learning_rate": 4.233793730873625e-08, "loss": 0.3781, "step": 2995 }, { "epoch": 4.9357495881383855, "grad_norm": 0.0789795653904322, "learning_rate": 4.02478522912908e-08, "loss": 0.3776, "step": 2996 }, { "epoch": 4.937397034596375, "grad_norm": 0.07495750919451778, "learning_rate": 3.82106457684861e-08, "loss": 0.3849, "step": 2997 }, { "epoch": 4.939044481054366, "grad_norm": 0.07595064323723756, "learning_rate": 3.622632043614793e-08, "loss": 0.3818, "step": 2998 }, { "epoch": 4.940691927512356, "grad_norm": 0.07379873613325619, "learning_rate": 3.429487892011363e-08, "loss": 0.3805, "step": 2999 }, { "epoch": 4.942339373970346, "grad_norm": 0.0750379725458703, "learning_rate": 3.24163237762587e-08, "loss": 0.388, "step": 3000 }, { "epoch": 4.943986820428336, "grad_norm": 0.07478143194489986, "learning_rate": 3.0590657490461305e-08, "loss": 0.3813, "step": 3001 }, { "epoch": 4.9456342668863265, "grad_norm": 0.07443411465154157, "learning_rate": 2.8817882478606728e-08, "loss": 0.3864, "step": 3002 }, { "epoch": 4.947281713344316, "grad_norm": 0.07509187478647349, "learning_rate": 2.7098001086605098e-08, "loss": 0.3829, "step": 3003 }, { "epoch": 4.948929159802306, "grad_norm": 0.0806319165654447, "learning_rate": 2.543101559036476e-08, "loss": 0.3821, "step": 3004 }, { "epoch": 4.950576606260297, "grad_norm": 0.07375556512550221, "learning_rate": 2.381692819579229e-08, "loss": 0.3881, "step": 3005 }, { "epoch": 4.952224052718287, "grad_norm": 0.07827655755595185, "learning_rate": 2.225574103880135e-08, "loss": 0.3866, "step": 3006 }, { "epoch": 4.953871499176277, "grad_norm": 0.07351797915655545, "learning_rate": 2.0747456185303826e-08, "loss": 0.3848, "step": 3007 }, { "epoch": 4.9555189456342665, "grad_norm": 0.07621339684707451, "learning_rate": 1.9292075631200948e-08, "loss": 0.3856, "step": 3008 }, { "epoch": 4.957166392092257, "grad_norm": 0.07289605022366104, "learning_rate": 1.7889601302387704e-08, "loss": 0.3837, "step": 3009 }, { "epoch": 4.958813838550247, "grad_norm": 0.07616988441090057, "learning_rate": 1.654003505475732e-08, "loss": 0.39, "step": 3010 }, { "epoch": 4.960461285008237, "grad_norm": 0.07361278205567681, "learning_rate": 1.524337867417458e-08, "loss": 0.3808, "step": 3011 }, { "epoch": 4.962108731466227, "grad_norm": 0.07518044843571588, "learning_rate": 1.3999633876502494e-08, "loss": 0.3891, "step": 3012 }, { "epoch": 4.963756177924218, "grad_norm": 0.07250352753924257, "learning_rate": 1.280880230757564e-08, "loss": 0.3831, "step": 3013 }, { "epoch": 4.9654036243822075, "grad_norm": 0.07983977285316181, "learning_rate": 1.1670885543213494e-08, "loss": 0.3885, "step": 3014 }, { "epoch": 4.967051070840197, "grad_norm": 0.07790982473286077, "learning_rate": 1.0585885089224868e-08, "loss": 0.3833, "step": 3015 }, { "epoch": 4.968698517298188, "grad_norm": 0.07113218567417655, "learning_rate": 9.55380238137238e-09, "loss": 0.3836, "step": 3016 }, { "epoch": 4.970345963756178, "grad_norm": 0.078318376856106, "learning_rate": 8.574638785407984e-09, "loss": 0.3878, "step": 3017 }, { "epoch": 4.971993410214168, "grad_norm": 0.07391497211934928, "learning_rate": 7.648395597055214e-09, "loss": 0.3856, "step": 3018 }, { "epoch": 4.973640856672159, "grad_norm": 0.07711901026345427, "learning_rate": 6.775074042004725e-09, "loss": 0.3918, "step": 3019 }, { "epoch": 4.9752883031301485, "grad_norm": 0.07176848398467063, "learning_rate": 5.954675275918753e-09, "loss": 0.3839, "step": 3020 }, { "epoch": 4.976935749588138, "grad_norm": 0.07406214575222084, "learning_rate": 5.187200384422219e-09, "loss": 0.3815, "step": 3021 }, { "epoch": 4.978583196046128, "grad_norm": 0.07636702859412732, "learning_rate": 4.472650383116062e-09, "loss": 0.3844, "step": 3022 }, { "epoch": 4.980230642504119, "grad_norm": 0.07323337257819096, "learning_rate": 3.811026217555025e-09, "loss": 0.3863, "step": 3023 }, { "epoch": 4.981878088962109, "grad_norm": 0.07486870051290802, "learning_rate": 3.2023287632609866e-09, "loss": 0.3839, "step": 3024 }, { "epoch": 4.983525535420099, "grad_norm": 0.07759066656970298, "learning_rate": 2.6465588257273966e-09, "loss": 0.3831, "step": 3025 }, { "epoch": 4.9851729818780885, "grad_norm": 0.07390930406112436, "learning_rate": 2.1437171403926314e-09, "loss": 0.3824, "step": 3026 }, { "epoch": 4.986820428336079, "grad_norm": 0.07605850667276627, "learning_rate": 1.693804372666641e-09, "loss": 0.3879, "step": 3027 }, { "epoch": 4.988467874794069, "grad_norm": 0.07441016134547336, "learning_rate": 1.2968211179176238e-09, "loss": 0.3849, "step": 3028 }, { "epoch": 4.990115321252059, "grad_norm": 0.07468390015413809, "learning_rate": 9.527679014720293e-10, "loss": 0.3814, "step": 3029 }, { "epoch": 4.99176276771005, "grad_norm": 0.07197466119496766, "learning_rate": 6.616451786101152e-10, "loss": 0.3737, "step": 3030 }, { "epoch": 4.99341021416804, "grad_norm": 0.07651670541733648, "learning_rate": 4.234533345748304e-10, "loss": 0.386, "step": 3031 }, { "epoch": 4.9950576606260295, "grad_norm": 0.07243828260125279, "learning_rate": 2.3819268456293234e-10, "loss": 0.3875, "step": 3032 }, { "epoch": 4.996705107084019, "grad_norm": 0.07440740329333327, "learning_rate": 1.0586347373386929e-10, "loss": 0.3842, "step": 3033 }, { "epoch": 4.99835255354201, "grad_norm": 0.0760243384164705, "learning_rate": 2.646587718757587e-11, "loss": 0.3864, "step": 3034 }, { "epoch": 5.0, "grad_norm": 0.09396803028686655, "learning_rate": 0.0, "loss": 0.3794, "step": 3035 }, { "epoch": 5.0, "step": 3035, "total_flos": 5.091559175356416e+16, "train_loss": 0.4507998693225018, "train_runtime": 46225.9672, "train_samples_per_second": 33.569, "train_steps_per_second": 0.066 } ], "logging_steps": 1, "max_steps": 3035, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.091559175356416e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }