{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998573466476462, "eval_steps": 500, "global_step": 1095, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00456490727532097, "grad_norm": 6.303826851202761, "learning_rate": 7.272727272727273e-07, "loss": 0.8692, "step": 1 }, { "epoch": 0.00912981455064194, "grad_norm": 6.417436981270692, "learning_rate": 1.4545454545454546e-06, "loss": 0.8764, "step": 2 }, { "epoch": 0.013694721825962911, "grad_norm": 6.266046371260386, "learning_rate": 2.181818181818182e-06, "loss": 0.8604, "step": 3 }, { "epoch": 0.01825962910128388, "grad_norm": 5.871418428231522, "learning_rate": 2.9090909090909093e-06, "loss": 0.8563, "step": 4 }, { "epoch": 0.02282453637660485, "grad_norm": 4.515904039751017, "learning_rate": 3.6363636363636366e-06, "loss": 0.8116, "step": 5 }, { "epoch": 0.027389443651925822, "grad_norm": 4.154460736926, "learning_rate": 4.363636363636364e-06, "loss": 0.8014, "step": 6 }, { "epoch": 0.03195435092724679, "grad_norm": 2.436881516437234, "learning_rate": 5.090909090909091e-06, "loss": 0.7707, "step": 7 }, { "epoch": 0.03651925820256776, "grad_norm": 2.091681221098197, "learning_rate": 5.8181818181818185e-06, "loss": 0.7609, "step": 8 }, { "epoch": 0.04108416547788873, "grad_norm": 4.371182375936429, "learning_rate": 6.545454545454546e-06, "loss": 0.7714, "step": 9 }, { "epoch": 0.0456490727532097, "grad_norm": 4.419533826001375, "learning_rate": 7.272727272727273e-06, "loss": 0.7639, "step": 10 }, { "epoch": 0.05021398002853067, "grad_norm": 4.0789627340324515, "learning_rate": 8.000000000000001e-06, "loss": 0.738, "step": 11 }, { "epoch": 0.054778887303851644, "grad_norm": 4.249564440992856, "learning_rate": 8.727272727272728e-06, "loss": 0.7147, "step": 12 }, { "epoch": 0.05934379457917261, "grad_norm": 3.376070643486793, "learning_rate": 9.454545454545456e-06, "loss": 0.6984, "step": 13 }, { "epoch": 0.06390870185449359, "grad_norm": 2.084910644051683, "learning_rate": 1.0181818181818182e-05, "loss": 0.6752, "step": 14 }, { "epoch": 0.06847360912981455, "grad_norm": 1.7829575333835848, "learning_rate": 1.0909090909090909e-05, "loss": 0.6564, "step": 15 }, { "epoch": 0.07303851640513552, "grad_norm": 2.3763989053887546, "learning_rate": 1.1636363636363637e-05, "loss": 0.6595, "step": 16 }, { "epoch": 0.07760342368045649, "grad_norm": 1.8994179663807598, "learning_rate": 1.2363636363636364e-05, "loss": 0.6332, "step": 17 }, { "epoch": 0.08216833095577745, "grad_norm": 1.0918180057614881, "learning_rate": 1.3090909090909092e-05, "loss": 0.6275, "step": 18 }, { "epoch": 0.08673323823109844, "grad_norm": 1.162384665458545, "learning_rate": 1.381818181818182e-05, "loss": 0.617, "step": 19 }, { "epoch": 0.0912981455064194, "grad_norm": 1.1197113251451245, "learning_rate": 1.4545454545454546e-05, "loss": 0.6102, "step": 20 }, { "epoch": 0.09586305278174037, "grad_norm": 0.7298206551544844, "learning_rate": 1.5272727272727276e-05, "loss": 0.6001, "step": 21 }, { "epoch": 0.10042796005706134, "grad_norm": 0.8218899465689595, "learning_rate": 1.6000000000000003e-05, "loss": 0.5908, "step": 22 }, { "epoch": 0.1049928673323823, "grad_norm": 0.7081964845957092, "learning_rate": 1.672727272727273e-05, "loss": 0.5857, "step": 23 }, { "epoch": 0.10955777460770329, "grad_norm": 0.7152113888700518, "learning_rate": 1.7454545454545456e-05, "loss": 0.5813, "step": 24 }, { "epoch": 0.11412268188302425, "grad_norm": 0.5908192981103763, "learning_rate": 1.8181818181818182e-05, "loss": 0.5654, "step": 25 }, { "epoch": 0.11868758915834522, "grad_norm": 0.5657878890257612, "learning_rate": 1.8909090909090912e-05, "loss": 0.5681, "step": 26 }, { "epoch": 0.12325249643366619, "grad_norm": 0.5407715119275086, "learning_rate": 1.963636363636364e-05, "loss": 0.5726, "step": 27 }, { "epoch": 0.12781740370898717, "grad_norm": 0.6238305162701853, "learning_rate": 2.0363636363636365e-05, "loss": 0.5707, "step": 28 }, { "epoch": 0.13238231098430814, "grad_norm": 0.7048129462669643, "learning_rate": 2.109090909090909e-05, "loss": 0.562, "step": 29 }, { "epoch": 0.1369472182596291, "grad_norm": 0.4792558557034262, "learning_rate": 2.1818181818181818e-05, "loss": 0.5622, "step": 30 }, { "epoch": 0.14151212553495007, "grad_norm": 0.648556738189594, "learning_rate": 2.2545454545454544e-05, "loss": 0.5556, "step": 31 }, { "epoch": 0.14607703281027104, "grad_norm": 0.5375842492193321, "learning_rate": 2.3272727272727274e-05, "loss": 0.5521, "step": 32 }, { "epoch": 0.150641940085592, "grad_norm": 0.6204936152045187, "learning_rate": 2.4e-05, "loss": 0.5521, "step": 33 }, { "epoch": 0.15520684736091298, "grad_norm": 0.7793633373804746, "learning_rate": 2.4727272727272727e-05, "loss": 0.5556, "step": 34 }, { "epoch": 0.15977175463623394, "grad_norm": 1.077779426955439, "learning_rate": 2.5454545454545457e-05, "loss": 0.5439, "step": 35 }, { "epoch": 0.1643366619115549, "grad_norm": 0.8548536747868706, "learning_rate": 2.6181818181818183e-05, "loss": 0.5399, "step": 36 }, { "epoch": 0.1689015691868759, "grad_norm": 0.6156323737152537, "learning_rate": 2.690909090909091e-05, "loss": 0.534, "step": 37 }, { "epoch": 0.17346647646219687, "grad_norm": 0.7996160287459234, "learning_rate": 2.763636363636364e-05, "loss": 0.5432, "step": 38 }, { "epoch": 0.17803138373751784, "grad_norm": 0.8822172922338606, "learning_rate": 2.8363636363636366e-05, "loss": 0.538, "step": 39 }, { "epoch": 0.1825962910128388, "grad_norm": 0.7341791467955449, "learning_rate": 2.9090909090909093e-05, "loss": 0.5311, "step": 40 }, { "epoch": 0.18716119828815977, "grad_norm": 0.7448434985433166, "learning_rate": 2.9818181818181823e-05, "loss": 0.5302, "step": 41 }, { "epoch": 0.19172610556348074, "grad_norm": 1.3134209857981531, "learning_rate": 3.054545454545455e-05, "loss": 0.5287, "step": 42 }, { "epoch": 0.1962910128388017, "grad_norm": 1.4079955673671256, "learning_rate": 3.127272727272728e-05, "loss": 0.529, "step": 43 }, { "epoch": 0.20085592011412268, "grad_norm": 0.7806435793966361, "learning_rate": 3.2000000000000005e-05, "loss": 0.5211, "step": 44 }, { "epoch": 0.20542082738944364, "grad_norm": 1.696793688392228, "learning_rate": 3.272727272727273e-05, "loss": 0.5286, "step": 45 }, { "epoch": 0.2099857346647646, "grad_norm": 0.9163370159272217, "learning_rate": 3.345454545454546e-05, "loss": 0.5251, "step": 46 }, { "epoch": 0.21455064194008558, "grad_norm": 1.6162977222772477, "learning_rate": 3.4181818181818185e-05, "loss": 0.5307, "step": 47 }, { "epoch": 0.21911554921540657, "grad_norm": 0.8813157838119612, "learning_rate": 3.490909090909091e-05, "loss": 0.5119, "step": 48 }, { "epoch": 0.22368045649072754, "grad_norm": 1.6994696349279637, "learning_rate": 3.563636363636364e-05, "loss": 0.522, "step": 49 }, { "epoch": 0.2282453637660485, "grad_norm": 1.1550073074270106, "learning_rate": 3.6363636363636364e-05, "loss": 0.5185, "step": 50 }, { "epoch": 0.23281027104136948, "grad_norm": 2.273035117030142, "learning_rate": 3.709090909090909e-05, "loss": 0.5178, "step": 51 }, { "epoch": 0.23737517831669044, "grad_norm": 2.142670073640285, "learning_rate": 3.7818181818181824e-05, "loss": 0.5164, "step": 52 }, { "epoch": 0.2419400855920114, "grad_norm": 1.178743753159302, "learning_rate": 3.854545454545455e-05, "loss": 0.5161, "step": 53 }, { "epoch": 0.24650499286733238, "grad_norm": 2.0029521626283864, "learning_rate": 3.927272727272728e-05, "loss": 0.5142, "step": 54 }, { "epoch": 0.25106990014265335, "grad_norm": 1.2261785948783424, "learning_rate": 4e-05, "loss": 0.5097, "step": 55 }, { "epoch": 0.25563480741797434, "grad_norm": 1.9929717547816588, "learning_rate": 4.072727272727273e-05, "loss": 0.52, "step": 56 }, { "epoch": 0.2601997146932953, "grad_norm": 1.6292265517873907, "learning_rate": 4.1454545454545456e-05, "loss": 0.5214, "step": 57 }, { "epoch": 0.2647646219686163, "grad_norm": 1.3334399259774528, "learning_rate": 4.218181818181818e-05, "loss": 0.5131, "step": 58 }, { "epoch": 0.2693295292439372, "grad_norm": 1.7895060448687017, "learning_rate": 4.2909090909090916e-05, "loss": 0.5156, "step": 59 }, { "epoch": 0.2738944365192582, "grad_norm": 1.2434711772815448, "learning_rate": 4.3636363636363636e-05, "loss": 0.5063, "step": 60 }, { "epoch": 0.27845934379457915, "grad_norm": 1.674712591320644, "learning_rate": 4.436363636363637e-05, "loss": 0.5058, "step": 61 }, { "epoch": 0.28302425106990015, "grad_norm": 1.7391744678652716, "learning_rate": 4.509090909090909e-05, "loss": 0.5169, "step": 62 }, { "epoch": 0.2875891583452211, "grad_norm": 1.2704765036397798, "learning_rate": 4.581818181818182e-05, "loss": 0.5077, "step": 63 }, { "epoch": 0.2921540656205421, "grad_norm": 1.7222012804003408, "learning_rate": 4.654545454545455e-05, "loss": 0.5137, "step": 64 }, { "epoch": 0.2967189728958631, "grad_norm": 1.293937134094515, "learning_rate": 4.727272727272728e-05, "loss": 0.5066, "step": 65 }, { "epoch": 0.301283880171184, "grad_norm": 1.1101483554895153, "learning_rate": 4.8e-05, "loss": 0.5012, "step": 66 }, { "epoch": 0.305848787446505, "grad_norm": 1.2651179859008774, "learning_rate": 4.8727272727272734e-05, "loss": 0.4996, "step": 67 }, { "epoch": 0.31041369472182595, "grad_norm": 1.4372242319377802, "learning_rate": 4.9454545454545454e-05, "loss": 0.5048, "step": 68 }, { "epoch": 0.31497860199714695, "grad_norm": 1.1173991577334563, "learning_rate": 5.018181818181819e-05, "loss": 0.5005, "step": 69 }, { "epoch": 0.3195435092724679, "grad_norm": 1.0219934142555631, "learning_rate": 5.0909090909090914e-05, "loss": 0.5011, "step": 70 }, { "epoch": 0.3241084165477889, "grad_norm": 1.7032035010169793, "learning_rate": 5.163636363636365e-05, "loss": 0.5094, "step": 71 }, { "epoch": 0.3286733238231098, "grad_norm": 0.8364496496396441, "learning_rate": 5.236363636363637e-05, "loss": 0.4963, "step": 72 }, { "epoch": 0.3332382310984308, "grad_norm": 1.3858540890091062, "learning_rate": 5.30909090909091e-05, "loss": 0.5079, "step": 73 }, { "epoch": 0.3378031383737518, "grad_norm": 1.3469249650056214, "learning_rate": 5.381818181818182e-05, "loss": 0.5094, "step": 74 }, { "epoch": 0.34236804564907275, "grad_norm": 2.3532033300244266, "learning_rate": 5.4545454545454546e-05, "loss": 0.5101, "step": 75 }, { "epoch": 0.34693295292439374, "grad_norm": 1.2379724037013697, "learning_rate": 5.527272727272728e-05, "loss": 0.5009, "step": 76 }, { "epoch": 0.3514978601997147, "grad_norm": 2.1483188283998365, "learning_rate": 5.6e-05, "loss": 0.5185, "step": 77 }, { "epoch": 0.3560627674750357, "grad_norm": 1.6081458329954548, "learning_rate": 5.672727272727273e-05, "loss": 0.5033, "step": 78 }, { "epoch": 0.3606276747503566, "grad_norm": 1.379511883135697, "learning_rate": 5.745454545454546e-05, "loss": 0.5038, "step": 79 }, { "epoch": 0.3651925820256776, "grad_norm": 2.314662935076633, "learning_rate": 5.8181818181818185e-05, "loss": 0.4969, "step": 80 }, { "epoch": 0.36975748930099855, "grad_norm": 1.3583834941356125, "learning_rate": 5.890909090909091e-05, "loss": 0.5045, "step": 81 }, { "epoch": 0.37432239657631955, "grad_norm": 2.6348157549287157, "learning_rate": 5.9636363636363645e-05, "loss": 0.5058, "step": 82 }, { "epoch": 0.3788873038516405, "grad_norm": 1.944213494123695, "learning_rate": 6.0363636363636365e-05, "loss": 0.5064, "step": 83 }, { "epoch": 0.3834522111269615, "grad_norm": 2.0575793444325194, "learning_rate": 6.10909090909091e-05, "loss": 0.5018, "step": 84 }, { "epoch": 0.3880171184022825, "grad_norm": 1.978062761509244, "learning_rate": 6.181818181818182e-05, "loss": 0.5017, "step": 85 }, { "epoch": 0.3925820256776034, "grad_norm": 1.4801623619535438, "learning_rate": 6.254545454545456e-05, "loss": 0.498, "step": 86 }, { "epoch": 0.3971469329529244, "grad_norm": 1.8537084794208918, "learning_rate": 6.327272727272727e-05, "loss": 0.5019, "step": 87 }, { "epoch": 0.40171184022824535, "grad_norm": 1.3702877983773376, "learning_rate": 6.400000000000001e-05, "loss": 0.4968, "step": 88 }, { "epoch": 0.40627674750356635, "grad_norm": 1.885572104808451, "learning_rate": 6.472727272727274e-05, "loss": 0.4957, "step": 89 }, { "epoch": 0.4108416547788873, "grad_norm": 1.3542328558823338, "learning_rate": 6.545454545454546e-05, "loss": 0.4966, "step": 90 }, { "epoch": 0.4154065620542083, "grad_norm": 1.7718230138924214, "learning_rate": 6.618181818181819e-05, "loss": 0.4966, "step": 91 }, { "epoch": 0.4199714693295292, "grad_norm": 1.3980578794491678, "learning_rate": 6.690909090909092e-05, "loss": 0.4925, "step": 92 }, { "epoch": 0.4245363766048502, "grad_norm": 1.5334202672866126, "learning_rate": 6.763636363636364e-05, "loss": 0.4876, "step": 93 }, { "epoch": 0.42910128388017116, "grad_norm": 1.4720834156887759, "learning_rate": 6.836363636363637e-05, "loss": 0.4913, "step": 94 }, { "epoch": 0.43366619115549215, "grad_norm": 1.338014419694779, "learning_rate": 6.90909090909091e-05, "loss": 0.4903, "step": 95 }, { "epoch": 0.43823109843081315, "grad_norm": 1.0761852253986315, "learning_rate": 6.981818181818182e-05, "loss": 0.4912, "step": 96 }, { "epoch": 0.4427960057061341, "grad_norm": 1.6024764467654846, "learning_rate": 7.054545454545455e-05, "loss": 0.4901, "step": 97 }, { "epoch": 0.4473609129814551, "grad_norm": 1.492316053791499, "learning_rate": 7.127272727272728e-05, "loss": 0.4873, "step": 98 }, { "epoch": 0.451925820256776, "grad_norm": 1.525604026590939, "learning_rate": 7.2e-05, "loss": 0.4891, "step": 99 }, { "epoch": 0.456490727532097, "grad_norm": 1.3034542842679084, "learning_rate": 7.272727272727273e-05, "loss": 0.49, "step": 100 }, { "epoch": 0.46105563480741796, "grad_norm": 2.1642815195092666, "learning_rate": 7.345454545454547e-05, "loss": 0.5055, "step": 101 }, { "epoch": 0.46562054208273895, "grad_norm": 1.1678257677428678, "learning_rate": 7.418181818181818e-05, "loss": 0.4852, "step": 102 }, { "epoch": 0.4701854493580599, "grad_norm": 1.4078725478189906, "learning_rate": 7.490909090909092e-05, "loss": 0.4919, "step": 103 }, { "epoch": 0.4747503566333809, "grad_norm": 2.357567405283945, "learning_rate": 7.563636363636365e-05, "loss": 0.4968, "step": 104 }, { "epoch": 0.4793152639087018, "grad_norm": 1.4758593059891392, "learning_rate": 7.636363636363637e-05, "loss": 0.4874, "step": 105 }, { "epoch": 0.4838801711840228, "grad_norm": 1.980018157376651, "learning_rate": 7.70909090909091e-05, "loss": 0.4953, "step": 106 }, { "epoch": 0.4884450784593438, "grad_norm": 2.0401864619014467, "learning_rate": 7.781818181818183e-05, "loss": 0.5006, "step": 107 }, { "epoch": 0.49300998573466476, "grad_norm": 1.0897659104208783, "learning_rate": 7.854545454545455e-05, "loss": 0.4871, "step": 108 }, { "epoch": 0.49757489300998575, "grad_norm": 1.8417886476775482, "learning_rate": 7.927272727272728e-05, "loss": 0.4975, "step": 109 }, { "epoch": 0.5021398002853067, "grad_norm": 1.2728786506557457, "learning_rate": 8e-05, "loss": 0.492, "step": 110 }, { "epoch": 0.5067047075606277, "grad_norm": 2.4453848065966817, "learning_rate": 7.999979655036647e-05, "loss": 0.5094, "step": 111 }, { "epoch": 0.5112696148359487, "grad_norm": 1.6374824473909455, "learning_rate": 7.999918620353548e-05, "loss": 0.4931, "step": 112 }, { "epoch": 0.5158345221112696, "grad_norm": 1.8705264691341723, "learning_rate": 7.999816896571574e-05, "loss": 0.5051, "step": 113 }, { "epoch": 0.5203994293865906, "grad_norm": 1.6343167214375214, "learning_rate": 7.999674484725512e-05, "loss": 0.4984, "step": 114 }, { "epoch": 0.5249643366619116, "grad_norm": 1.9112883595327201, "learning_rate": 7.999491386264042e-05, "loss": 0.492, "step": 115 }, { "epoch": 0.5295292439372326, "grad_norm": 1.35541163706321, "learning_rate": 7.999267603049729e-05, "loss": 0.4902, "step": 116 }, { "epoch": 0.5340941512125535, "grad_norm": 1.6103620556897125, "learning_rate": 7.999003137359006e-05, "loss": 0.4927, "step": 117 }, { "epoch": 0.5386590584878744, "grad_norm": 1.4810569251620247, "learning_rate": 7.998697991882144e-05, "loss": 0.4876, "step": 118 }, { "epoch": 0.5432239657631954, "grad_norm": 1.3665267966836736, "learning_rate": 7.998352169723229e-05, "loss": 0.4865, "step": 119 }, { "epoch": 0.5477888730385164, "grad_norm": 1.1684492343192325, "learning_rate": 7.997965674400132e-05, "loss": 0.4898, "step": 120 }, { "epoch": 0.5523537803138374, "grad_norm": 1.6135417427578114, "learning_rate": 7.997538509844469e-05, "loss": 0.4884, "step": 121 }, { "epoch": 0.5569186875891583, "grad_norm": 1.081634967694094, "learning_rate": 7.997070680401562e-05, "loss": 0.4814, "step": 122 }, { "epoch": 0.5614835948644793, "grad_norm": 1.323277108291229, "learning_rate": 7.9965621908304e-05, "loss": 0.4862, "step": 123 }, { "epoch": 0.5660485021398003, "grad_norm": 1.5122505161166728, "learning_rate": 7.996013046303583e-05, "loss": 0.4907, "step": 124 }, { "epoch": 0.5706134094151213, "grad_norm": 1.3517400921660416, "learning_rate": 7.995423252407275e-05, "loss": 0.4849, "step": 125 }, { "epoch": 0.5751783166904422, "grad_norm": 1.2964472361111674, "learning_rate": 7.99479281514114e-05, "loss": 0.4829, "step": 126 }, { "epoch": 0.5797432239657632, "grad_norm": 1.441528044357245, "learning_rate": 7.994121740918293e-05, "loss": 0.4888, "step": 127 }, { "epoch": 0.5843081312410842, "grad_norm": 1.2409245733342162, "learning_rate": 7.993410036565223e-05, "loss": 0.4776, "step": 128 }, { "epoch": 0.5888730385164052, "grad_norm": 1.255055709641733, "learning_rate": 7.992657709321728e-05, "loss": 0.4856, "step": 129 }, { "epoch": 0.5934379457917262, "grad_norm": 1.2617494433472416, "learning_rate": 7.991864766840846e-05, "loss": 0.4832, "step": 130 }, { "epoch": 0.598002853067047, "grad_norm": 1.3678248560826736, "learning_rate": 7.991031217188769e-05, "loss": 0.483, "step": 131 }, { "epoch": 0.602567760342368, "grad_norm": 1.5165238137397685, "learning_rate": 7.990157068844764e-05, "loss": 0.4762, "step": 132 }, { "epoch": 0.607132667617689, "grad_norm": 1.0122655955764246, "learning_rate": 7.989242330701089e-05, "loss": 0.4794, "step": 133 }, { "epoch": 0.61169757489301, "grad_norm": 1.4882369944794684, "learning_rate": 7.988287012062902e-05, "loss": 0.4772, "step": 134 }, { "epoch": 0.6162624821683309, "grad_norm": 0.9815809304445058, "learning_rate": 7.987291122648165e-05, "loss": 0.4844, "step": 135 }, { "epoch": 0.6208273894436519, "grad_norm": 1.8345734791985764, "learning_rate": 7.986254672587544e-05, "loss": 0.4872, "step": 136 }, { "epoch": 0.6253922967189729, "grad_norm": 1.2322860510435374, "learning_rate": 7.985177672424309e-05, "loss": 0.4742, "step": 137 }, { "epoch": 0.6299572039942939, "grad_norm": 1.0934283610887234, "learning_rate": 7.984060133114222e-05, "loss": 0.4828, "step": 138 }, { "epoch": 0.6345221112696149, "grad_norm": 1.0617428680491003, "learning_rate": 7.982902066025433e-05, "loss": 0.4841, "step": 139 }, { "epoch": 0.6390870185449358, "grad_norm": 1.4504986628664012, "learning_rate": 7.981703482938361e-05, "loss": 0.4765, "step": 140 }, { "epoch": 0.6436519258202568, "grad_norm": 1.2196737951896366, "learning_rate": 7.980464396045565e-05, "loss": 0.48, "step": 141 }, { "epoch": 0.6482168330955778, "grad_norm": 1.1746117544945598, "learning_rate": 7.979184817951638e-05, "loss": 0.472, "step": 142 }, { "epoch": 0.6527817403708988, "grad_norm": 1.462647851373356, "learning_rate": 7.977864761673062e-05, "loss": 0.4819, "step": 143 }, { "epoch": 0.6573466476462196, "grad_norm": 1.0920910565051827, "learning_rate": 7.976504240638088e-05, "loss": 0.4759, "step": 144 }, { "epoch": 0.6619115549215406, "grad_norm": 0.8915983200577166, "learning_rate": 7.975103268686587e-05, "loss": 0.4708, "step": 145 }, { "epoch": 0.6664764621968616, "grad_norm": 1.1195632191410694, "learning_rate": 7.973661860069925e-05, "loss": 0.481, "step": 146 }, { "epoch": 0.6710413694721826, "grad_norm": 0.9280898217043119, "learning_rate": 7.972180029450804e-05, "loss": 0.4771, "step": 147 }, { "epoch": 0.6756062767475036, "grad_norm": 1.2010937440977014, "learning_rate": 7.970657791903115e-05, "loss": 0.4766, "step": 148 }, { "epoch": 0.6801711840228245, "grad_norm": 1.1648502616511074, "learning_rate": 7.969095162911796e-05, "loss": 0.4765, "step": 149 }, { "epoch": 0.6847360912981455, "grad_norm": 1.629472140008341, "learning_rate": 7.967492158372659e-05, "loss": 0.4708, "step": 150 }, { "epoch": 0.6893009985734665, "grad_norm": 0.7685771258524148, "learning_rate": 7.965848794592241e-05, "loss": 0.4702, "step": 151 }, { "epoch": 0.6938659058487875, "grad_norm": 1.2979818888222598, "learning_rate": 7.964165088287627e-05, "loss": 0.4756, "step": 152 }, { "epoch": 0.6984308131241084, "grad_norm": 1.479596160746902, "learning_rate": 7.96244105658629e-05, "loss": 0.4684, "step": 153 }, { "epoch": 0.7029957203994294, "grad_norm": 0.9533216688157553, "learning_rate": 7.960676717025912e-05, "loss": 0.4674, "step": 154 }, { "epoch": 0.7075606276747504, "grad_norm": 1.1144585740648676, "learning_rate": 7.958872087554204e-05, "loss": 0.4759, "step": 155 }, { "epoch": 0.7121255349500714, "grad_norm": 1.1392667564805274, "learning_rate": 7.957027186528724e-05, "loss": 0.4792, "step": 156 }, { "epoch": 0.7166904422253922, "grad_norm": 1.576213566462258, "learning_rate": 7.955142032716696e-05, "loss": 0.4711, "step": 157 }, { "epoch": 0.7212553495007132, "grad_norm": 1.0153629340656216, "learning_rate": 7.953216645294813e-05, "loss": 0.4785, "step": 158 }, { "epoch": 0.7258202567760342, "grad_norm": 1.178946056258982, "learning_rate": 7.951251043849043e-05, "loss": 0.4703, "step": 159 }, { "epoch": 0.7303851640513552, "grad_norm": 1.2039627482563626, "learning_rate": 7.94924524837443e-05, "loss": 0.4691, "step": 160 }, { "epoch": 0.7349500713266762, "grad_norm": 1.2758536553065434, "learning_rate": 7.947199279274892e-05, "loss": 0.4719, "step": 161 }, { "epoch": 0.7395149786019971, "grad_norm": 0.7199695023812126, "learning_rate": 7.945113157363012e-05, "loss": 0.4705, "step": 162 }, { "epoch": 0.7440798858773181, "grad_norm": 1.187228986720961, "learning_rate": 7.942986903859826e-05, "loss": 0.476, "step": 163 }, { "epoch": 0.7486447931526391, "grad_norm": 1.2191468243222228, "learning_rate": 7.940820540394611e-05, "loss": 0.4685, "step": 164 }, { "epoch": 0.7532097004279601, "grad_norm": 1.763163381472643, "learning_rate": 7.938614089004659e-05, "loss": 0.4634, "step": 165 }, { "epoch": 0.757774607703281, "grad_norm": 0.7821277377021038, "learning_rate": 7.936367572135056e-05, "loss": 0.4741, "step": 166 }, { "epoch": 0.762339514978602, "grad_norm": 2.297761933831456, "learning_rate": 7.934081012638452e-05, "loss": 0.4801, "step": 167 }, { "epoch": 0.766904422253923, "grad_norm": 1.3332466148839683, "learning_rate": 7.931754433774835e-05, "loss": 0.4753, "step": 168 }, { "epoch": 0.771469329529244, "grad_norm": 2.149549037927675, "learning_rate": 7.929387859211283e-05, "loss": 0.4864, "step": 169 }, { "epoch": 0.776034236804565, "grad_norm": 1.5521858392131513, "learning_rate": 7.926981313021734e-05, "loss": 0.4845, "step": 170 }, { "epoch": 0.7805991440798858, "grad_norm": 1.5726199718313127, "learning_rate": 7.924534819686735e-05, "loss": 0.4807, "step": 171 }, { "epoch": 0.7851640513552068, "grad_norm": 1.25504183274313, "learning_rate": 7.922048404093193e-05, "loss": 0.4875, "step": 172 }, { "epoch": 0.7897289586305278, "grad_norm": 0.9918446302267643, "learning_rate": 7.919522091534125e-05, "loss": 0.4751, "step": 173 }, { "epoch": 0.7942938659058488, "grad_norm": 1.2133961064200827, "learning_rate": 7.916955907708403e-05, "loss": 0.4751, "step": 174 }, { "epoch": 0.7988587731811697, "grad_norm": 1.0816089763802976, "learning_rate": 7.91434987872048e-05, "loss": 0.467, "step": 175 }, { "epoch": 0.8034236804564907, "grad_norm": 0.8971343840889208, "learning_rate": 7.911704031080142e-05, "loss": 0.4734, "step": 176 }, { "epoch": 0.8079885877318117, "grad_norm": 0.9356730941151444, "learning_rate": 7.909018391702224e-05, "loss": 0.47, "step": 177 }, { "epoch": 0.8125534950071327, "grad_norm": 0.7910454206167575, "learning_rate": 7.906292987906343e-05, "loss": 0.4683, "step": 178 }, { "epoch": 0.8171184022824537, "grad_norm": 1.192520435930123, "learning_rate": 7.90352784741662e-05, "loss": 0.4683, "step": 179 }, { "epoch": 0.8216833095577746, "grad_norm": 1.0963304160109035, "learning_rate": 7.900722998361394e-05, "loss": 0.4667, "step": 180 }, { "epoch": 0.8262482168330956, "grad_norm": 1.1053490528947663, "learning_rate": 7.897878469272943e-05, "loss": 0.472, "step": 181 }, { "epoch": 0.8308131241084166, "grad_norm": 0.8848774693139051, "learning_rate": 7.894994289087187e-05, "loss": 0.4628, "step": 182 }, { "epoch": 0.8353780313837376, "grad_norm": 1.115946428081902, "learning_rate": 7.892070487143395e-05, "loss": 0.4621, "step": 183 }, { "epoch": 0.8399429386590584, "grad_norm": 1.0565554494854816, "learning_rate": 7.88910709318389e-05, "loss": 0.4665, "step": 184 }, { "epoch": 0.8445078459343794, "grad_norm": 1.2557316528836924, "learning_rate": 7.88610413735374e-05, "loss": 0.4629, "step": 185 }, { "epoch": 0.8490727532097004, "grad_norm": 0.9500438718947202, "learning_rate": 7.883061650200459e-05, "loss": 0.4671, "step": 186 }, { "epoch": 0.8536376604850214, "grad_norm": 1.0129984995159482, "learning_rate": 7.879979662673695e-05, "loss": 0.4613, "step": 187 }, { "epoch": 0.8582025677603423, "grad_norm": 1.2473784057361619, "learning_rate": 7.876858206124907e-05, "loss": 0.4697, "step": 188 }, { "epoch": 0.8627674750356633, "grad_norm": 0.8979338734910455, "learning_rate": 7.873697312307054e-05, "loss": 0.4696, "step": 189 }, { "epoch": 0.8673323823109843, "grad_norm": 0.7812764034273535, "learning_rate": 7.870497013374272e-05, "loss": 0.4639, "step": 190 }, { "epoch": 0.8718972895863053, "grad_norm": 1.012860566654511, "learning_rate": 7.867257341881542e-05, "loss": 0.4653, "step": 191 }, { "epoch": 0.8764621968616263, "grad_norm": 1.4227576277179481, "learning_rate": 7.863978330784364e-05, "loss": 0.4675, "step": 192 }, { "epoch": 0.8810271041369472, "grad_norm": 0.7511911038781995, "learning_rate": 7.860660013438418e-05, "loss": 0.4602, "step": 193 }, { "epoch": 0.8855920114122682, "grad_norm": 1.0008483459158608, "learning_rate": 7.857302423599225e-05, "loss": 0.4642, "step": 194 }, { "epoch": 0.8901569186875892, "grad_norm": 1.3635357598280822, "learning_rate": 7.853905595421808e-05, "loss": 0.4718, "step": 195 }, { "epoch": 0.8947218259629102, "grad_norm": 1.1150832530124606, "learning_rate": 7.850469563460339e-05, "loss": 0.4697, "step": 196 }, { "epoch": 0.899286733238231, "grad_norm": 1.306691006655152, "learning_rate": 7.84699436266779e-05, "loss": 0.4645, "step": 197 }, { "epoch": 0.903851640513552, "grad_norm": 0.5646785863195107, "learning_rate": 7.843480028395578e-05, "loss": 0.4598, "step": 198 }, { "epoch": 0.908416547788873, "grad_norm": 1.416929427327476, "learning_rate": 7.839926596393202e-05, "loss": 0.4613, "step": 199 }, { "epoch": 0.912981455064194, "grad_norm": 0.6815794459118021, "learning_rate": 7.836334102807886e-05, "loss": 0.4552, "step": 200 }, { "epoch": 0.917546362339515, "grad_norm": 1.3377535332836705, "learning_rate": 7.832702584184204e-05, "loss": 0.4609, "step": 201 }, { "epoch": 0.9221112696148359, "grad_norm": 0.7289345910500463, "learning_rate": 7.829032077463713e-05, "loss": 0.4614, "step": 202 }, { "epoch": 0.9266761768901569, "grad_norm": 0.7256574283679788, "learning_rate": 7.825322619984576e-05, "loss": 0.4583, "step": 203 }, { "epoch": 0.9312410841654779, "grad_norm": 0.7702106659855981, "learning_rate": 7.821574249481179e-05, "loss": 0.4568, "step": 204 }, { "epoch": 0.9358059914407989, "grad_norm": 0.9102238446313804, "learning_rate": 7.817787004083756e-05, "loss": 0.4586, "step": 205 }, { "epoch": 0.9403708987161198, "grad_norm": 1.7972329933237008, "learning_rate": 7.813960922317988e-05, "loss": 0.4604, "step": 206 }, { "epoch": 0.9449358059914408, "grad_norm": 0.6227351510323046, "learning_rate": 7.810096043104623e-05, "loss": 0.4622, "step": 207 }, { "epoch": 0.9495007132667618, "grad_norm": 1.7735653279890655, "learning_rate": 7.806192405759074e-05, "loss": 0.4649, "step": 208 }, { "epoch": 0.9540656205420828, "grad_norm": 1.0308472520464786, "learning_rate": 7.80225004999102e-05, "loss": 0.4664, "step": 209 }, { "epoch": 0.9586305278174037, "grad_norm": 0.8116494461929854, "learning_rate": 7.798269015904004e-05, "loss": 0.4617, "step": 210 }, { "epoch": 0.9631954350927246, "grad_norm": 1.047955603167114, "learning_rate": 7.79424934399502e-05, "loss": 0.4614, "step": 211 }, { "epoch": 0.9677603423680456, "grad_norm": 1.5898132874597775, "learning_rate": 7.790191075154109e-05, "loss": 0.4607, "step": 212 }, { "epoch": 0.9723252496433666, "grad_norm": 0.9545090947993602, "learning_rate": 7.786094250663936e-05, "loss": 0.4652, "step": 213 }, { "epoch": 0.9768901569186876, "grad_norm": 1.6571396240037084, "learning_rate": 7.781958912199372e-05, "loss": 0.4618, "step": 214 }, { "epoch": 0.9814550641940085, "grad_norm": 1.2024242198019108, "learning_rate": 7.777785101827073e-05, "loss": 0.4662, "step": 215 }, { "epoch": 0.9860199714693295, "grad_norm": 1.1021742864631379, "learning_rate": 7.773572862005048e-05, "loss": 0.4573, "step": 216 }, { "epoch": 0.9905848787446505, "grad_norm": 1.199436776468982, "learning_rate": 7.76932223558223e-05, "loss": 0.461, "step": 217 }, { "epoch": 0.9951497860199715, "grad_norm": 1.063128318792733, "learning_rate": 7.765033265798038e-05, "loss": 0.455, "step": 218 }, { "epoch": 0.9997146932952924, "grad_norm": 0.6606867422821489, "learning_rate": 7.760705996281937e-05, "loss": 0.4591, "step": 219 }, { "epoch": 1.0042796005706134, "grad_norm": 1.8960244617895399, "learning_rate": 7.756340471052998e-05, "loss": 0.8785, "step": 220 }, { "epoch": 1.0088445078459345, "grad_norm": 1.4080967185223767, "learning_rate": 7.751936734519448e-05, "loss": 0.4574, "step": 221 }, { "epoch": 1.0134094151212554, "grad_norm": 0.6771897541703611, "learning_rate": 7.747494831478214e-05, "loss": 0.4431, "step": 222 }, { "epoch": 1.0179743223965763, "grad_norm": 1.2677400355158506, "learning_rate": 7.743014807114475e-05, "loss": 0.4477, "step": 223 }, { "epoch": 1.0225392296718974, "grad_norm": 0.8527685554112466, "learning_rate": 7.738496707001195e-05, "loss": 0.4383, "step": 224 }, { "epoch": 1.0271041369472182, "grad_norm": 0.8122477750639683, "learning_rate": 7.733940577098666e-05, "loss": 0.4418, "step": 225 }, { "epoch": 1.0316690442225391, "grad_norm": 0.8426217905101726, "learning_rate": 7.729346463754035e-05, "loss": 0.4421, "step": 226 }, { "epoch": 1.0362339514978602, "grad_norm": 0.9171730058782802, "learning_rate": 7.724714413700836e-05, "loss": 0.4418, "step": 227 }, { "epoch": 1.0407988587731811, "grad_norm": 1.0216657454100806, "learning_rate": 7.720044474058515e-05, "loss": 0.4463, "step": 228 }, { "epoch": 1.0453637660485022, "grad_norm": 1.1305237470904832, "learning_rate": 7.715336692331944e-05, "loss": 0.4382, "step": 229 }, { "epoch": 1.0499286733238231, "grad_norm": 1.0426775975151317, "learning_rate": 7.71059111641095e-05, "loss": 0.4466, "step": 230 }, { "epoch": 1.054493580599144, "grad_norm": 1.0597968056899956, "learning_rate": 7.705807794569815e-05, "loss": 0.4486, "step": 231 }, { "epoch": 1.059058487874465, "grad_norm": 1.1716663560960234, "learning_rate": 7.700986775466792e-05, "loss": 0.4439, "step": 232 }, { "epoch": 1.063623395149786, "grad_norm": 0.9803996514179776, "learning_rate": 7.696128108143612e-05, "loss": 0.4461, "step": 233 }, { "epoch": 1.0681883024251069, "grad_norm": 1.2259077645154464, "learning_rate": 7.691231842024977e-05, "loss": 0.4489, "step": 234 }, { "epoch": 1.072753209700428, "grad_norm": 0.853759148071036, "learning_rate": 7.686298026918067e-05, "loss": 0.4421, "step": 235 }, { "epoch": 1.0773181169757489, "grad_norm": 0.7461561534105968, "learning_rate": 7.681326713012024e-05, "loss": 0.4389, "step": 236 }, { "epoch": 1.08188302425107, "grad_norm": 1.1421762335895749, "learning_rate": 7.676317950877446e-05, "loss": 0.4396, "step": 237 }, { "epoch": 1.0864479315263909, "grad_norm": 0.8992066222764188, "learning_rate": 7.671271791465877e-05, "loss": 0.4404, "step": 238 }, { "epoch": 1.0910128388017117, "grad_norm": 0.8169936643046695, "learning_rate": 7.666188286109279e-05, "loss": 0.4375, "step": 239 }, { "epoch": 1.0955777460770328, "grad_norm": 1.0409973585291696, "learning_rate": 7.66106748651952e-05, "loss": 0.4395, "step": 240 }, { "epoch": 1.1001426533523537, "grad_norm": 1.0016647356785184, "learning_rate": 7.655909444787837e-05, "loss": 0.4436, "step": 241 }, { "epoch": 1.1047075606276748, "grad_norm": 0.9158140898097673, "learning_rate": 7.650714213384317e-05, "loss": 0.4362, "step": 242 }, { "epoch": 1.1092724679029957, "grad_norm": 0.8583658856818915, "learning_rate": 7.645481845157353e-05, "loss": 0.4359, "step": 243 }, { "epoch": 1.1138373751783166, "grad_norm": 0.6939369526031025, "learning_rate": 7.640212393333117e-05, "loss": 0.4306, "step": 244 }, { "epoch": 1.1184022824536377, "grad_norm": 0.592410776540685, "learning_rate": 7.634905911515014e-05, "loss": 0.4354, "step": 245 }, { "epoch": 1.1229671897289586, "grad_norm": 0.8601769845121724, "learning_rate": 7.62956245368313e-05, "loss": 0.4355, "step": 246 }, { "epoch": 1.1275320970042797, "grad_norm": 0.7237332967962335, "learning_rate": 7.624182074193691e-05, "loss": 0.4399, "step": 247 }, { "epoch": 1.1320970042796006, "grad_norm": 0.6811877132981848, "learning_rate": 7.61876482777851e-05, "loss": 0.4411, "step": 248 }, { "epoch": 1.1366619115549215, "grad_norm": 0.8543886977059173, "learning_rate": 7.613310769544428e-05, "loss": 0.4355, "step": 249 }, { "epoch": 1.1412268188302426, "grad_norm": 0.8286191112581163, "learning_rate": 7.607819954972752e-05, "loss": 0.4383, "step": 250 }, { "epoch": 1.1457917261055635, "grad_norm": 1.022170225407258, "learning_rate": 7.60229243991869e-05, "loss": 0.4416, "step": 251 }, { "epoch": 1.1503566333808846, "grad_norm": 1.1340537050864723, "learning_rate": 7.59672828061079e-05, "loss": 0.4382, "step": 252 }, { "epoch": 1.1549215406562054, "grad_norm": 0.5980326106048516, "learning_rate": 7.591127533650362e-05, "loss": 0.4369, "step": 253 }, { "epoch": 1.1594864479315263, "grad_norm": 0.5810971453495677, "learning_rate": 7.585490256010899e-05, "loss": 0.4319, "step": 254 }, { "epoch": 1.1640513552068474, "grad_norm": 0.8641814822644016, "learning_rate": 7.579816505037505e-05, "loss": 0.4386, "step": 255 }, { "epoch": 1.1686162624821683, "grad_norm": 0.9787426567126707, "learning_rate": 7.574106338446309e-05, "loss": 0.4327, "step": 256 }, { "epoch": 1.1731811697574892, "grad_norm": 1.1084677725616903, "learning_rate": 7.568359814323876e-05, "loss": 0.4364, "step": 257 }, { "epoch": 1.1777460770328103, "grad_norm": 0.8794156667429015, "learning_rate": 7.562576991126616e-05, "loss": 0.4387, "step": 258 }, { "epoch": 1.1823109843081312, "grad_norm": 0.7016232671145584, "learning_rate": 7.556757927680192e-05, "loss": 0.4334, "step": 259 }, { "epoch": 1.1868758915834523, "grad_norm": 0.7028177191087993, "learning_rate": 7.550902683178923e-05, "loss": 0.4346, "step": 260 }, { "epoch": 1.1914407988587732, "grad_norm": 0.7601681237280019, "learning_rate": 7.545011317185172e-05, "loss": 0.4374, "step": 261 }, { "epoch": 1.196005706134094, "grad_norm": 0.9527083913769249, "learning_rate": 7.539083889628755e-05, "loss": 0.4394, "step": 262 }, { "epoch": 1.2005706134094152, "grad_norm": 1.144372718254701, "learning_rate": 7.53312046080632e-05, "loss": 0.4445, "step": 263 }, { "epoch": 1.205135520684736, "grad_norm": 0.9321132440752812, "learning_rate": 7.527121091380737e-05, "loss": 0.436, "step": 264 }, { "epoch": 1.209700427960057, "grad_norm": 1.0349154486726628, "learning_rate": 7.52108584238048e-05, "loss": 0.436, "step": 265 }, { "epoch": 1.214265335235378, "grad_norm": 1.0640705130270223, "learning_rate": 7.515014775199011e-05, "loss": 0.4394, "step": 266 }, { "epoch": 1.218830242510699, "grad_norm": 1.0477802116229236, "learning_rate": 7.508907951594149e-05, "loss": 0.4326, "step": 267 }, { "epoch": 1.22339514978602, "grad_norm": 1.0464613549121728, "learning_rate": 7.502765433687444e-05, "loss": 0.4377, "step": 268 }, { "epoch": 1.227960057061341, "grad_norm": 0.8450812112061008, "learning_rate": 7.496587283963549e-05, "loss": 0.4369, "step": 269 }, { "epoch": 1.2325249643366618, "grad_norm": 0.6557155968758558, "learning_rate": 7.490373565269575e-05, "loss": 0.4339, "step": 270 }, { "epoch": 1.237089871611983, "grad_norm": 0.5572121626093545, "learning_rate": 7.484124340814467e-05, "loss": 0.4344, "step": 271 }, { "epoch": 1.2416547788873038, "grad_norm": 0.5678369025757513, "learning_rate": 7.477839674168342e-05, "loss": 0.4256, "step": 272 }, { "epoch": 1.246219686162625, "grad_norm": 0.5209219264497257, "learning_rate": 7.471519629261859e-05, "loss": 0.4327, "step": 273 }, { "epoch": 1.2507845934379458, "grad_norm": 0.4721960308852361, "learning_rate": 7.465164270385558e-05, "loss": 0.4304, "step": 274 }, { "epoch": 1.2553495007132667, "grad_norm": 0.5657942084693068, "learning_rate": 7.45877366218921e-05, "loss": 0.4382, "step": 275 }, { "epoch": 1.2599144079885878, "grad_norm": 0.7196787321817991, "learning_rate": 7.452347869681159e-05, "loss": 0.4356, "step": 276 }, { "epoch": 1.2644793152639087, "grad_norm": 0.6908825865399278, "learning_rate": 7.445886958227665e-05, "loss": 0.4291, "step": 277 }, { "epoch": 1.2690442225392298, "grad_norm": 0.5440820157677637, "learning_rate": 7.439390993552227e-05, "loss": 0.4362, "step": 278 }, { "epoch": 1.2736091298145507, "grad_norm": 0.6147849930389729, "learning_rate": 7.43286004173493e-05, "loss": 0.4321, "step": 279 }, { "epoch": 1.2781740370898715, "grad_norm": 0.5681098435881913, "learning_rate": 7.426294169211762e-05, "loss": 0.4348, "step": 280 }, { "epoch": 1.2827389443651926, "grad_norm": 0.44708040023745554, "learning_rate": 7.419693442773937e-05, "loss": 0.4375, "step": 281 }, { "epoch": 1.2873038516405135, "grad_norm": 0.5713599240412557, "learning_rate": 7.413057929567227e-05, "loss": 0.4298, "step": 282 }, { "epoch": 1.2918687589158346, "grad_norm": 0.6323338024890408, "learning_rate": 7.406387697091269e-05, "loss": 0.4374, "step": 283 }, { "epoch": 1.2964336661911555, "grad_norm": 0.9056516523433347, "learning_rate": 7.399682813198879e-05, "loss": 0.4362, "step": 284 }, { "epoch": 1.3009985734664764, "grad_norm": 1.182899493470329, "learning_rate": 7.392943346095366e-05, "loss": 0.4361, "step": 285 }, { "epoch": 1.3055634807417975, "grad_norm": 0.7775593404006284, "learning_rate": 7.386169364337833e-05, "loss": 0.438, "step": 286 }, { "epoch": 1.3101283880171184, "grad_norm": 0.6198494591654299, "learning_rate": 7.379360936834492e-05, "loss": 0.429, "step": 287 }, { "epoch": 1.3146932952924395, "grad_norm": 0.7793764973737607, "learning_rate": 7.372518132843941e-05, "loss": 0.4385, "step": 288 }, { "epoch": 1.3192582025677604, "grad_norm": 0.9303862541132153, "learning_rate": 7.365641021974478e-05, "loss": 0.4335, "step": 289 }, { "epoch": 1.3238231098430813, "grad_norm": 0.8432690646689237, "learning_rate": 7.358729674183392e-05, "loss": 0.4317, "step": 290 }, { "epoch": 1.3283880171184022, "grad_norm": 0.8874816517796578, "learning_rate": 7.351784159776238e-05, "loss": 0.4304, "step": 291 }, { "epoch": 1.3329529243937233, "grad_norm": 0.868796496930731, "learning_rate": 7.344804549406135e-05, "loss": 0.4371, "step": 292 }, { "epoch": 1.3375178316690441, "grad_norm": 0.7852711094517002, "learning_rate": 7.337790914073043e-05, "loss": 0.4294, "step": 293 }, { "epoch": 1.3420827389443652, "grad_norm": 0.936953145650728, "learning_rate": 7.330743325123036e-05, "loss": 0.4391, "step": 294 }, { "epoch": 1.3466476462196861, "grad_norm": 0.9704040812645053, "learning_rate": 7.323661854247587e-05, "loss": 0.4349, "step": 295 }, { "epoch": 1.351212553495007, "grad_norm": 0.8823873892473993, "learning_rate": 7.316546573482828e-05, "loss": 0.4315, "step": 296 }, { "epoch": 1.3557774607703281, "grad_norm": 0.9155700667464731, "learning_rate": 7.309397555208817e-05, "loss": 0.4352, "step": 297 }, { "epoch": 1.360342368045649, "grad_norm": 0.9500779897069331, "learning_rate": 7.302214872148817e-05, "loss": 0.4341, "step": 298 }, { "epoch": 1.3649072753209701, "grad_norm": 0.715286965255515, "learning_rate": 7.29499859736854e-05, "loss": 0.43, "step": 299 }, { "epoch": 1.369472182596291, "grad_norm": 0.42900565746793906, "learning_rate": 7.287748804275406e-05, "loss": 0.4257, "step": 300 }, { "epoch": 1.3740370898716119, "grad_norm": 0.44848107810486365, "learning_rate": 7.280465566617804e-05, "loss": 0.4282, "step": 301 }, { "epoch": 1.378601997146933, "grad_norm": 0.5499399617735641, "learning_rate": 7.273148958484335e-05, "loss": 0.4342, "step": 302 }, { "epoch": 1.3831669044222539, "grad_norm": 0.7779728624601656, "learning_rate": 7.265799054303062e-05, "loss": 0.4338, "step": 303 }, { "epoch": 1.387731811697575, "grad_norm": 0.7555091815426274, "learning_rate": 7.258415928840749e-05, "loss": 0.4311, "step": 304 }, { "epoch": 1.3922967189728959, "grad_norm": 0.6206089776911131, "learning_rate": 7.250999657202107e-05, "loss": 0.4296, "step": 305 }, { "epoch": 1.3968616262482167, "grad_norm": 0.7912293160493962, "learning_rate": 7.24355031482902e-05, "loss": 0.436, "step": 306 }, { "epoch": 1.4014265335235379, "grad_norm": 1.125338548493621, "learning_rate": 7.236067977499791e-05, "loss": 0.4342, "step": 307 }, { "epoch": 1.4059914407988587, "grad_norm": 1.04913732233305, "learning_rate": 7.228552721328354e-05, "loss": 0.4388, "step": 308 }, { "epoch": 1.4105563480741798, "grad_norm": 0.8118850698239931, "learning_rate": 7.22100462276352e-05, "loss": 0.4257, "step": 309 }, { "epoch": 1.4151212553495007, "grad_norm": 0.6971634722121615, "learning_rate": 7.213423758588182e-05, "loss": 0.4314, "step": 310 }, { "epoch": 1.4196861626248216, "grad_norm": 0.9112349240874238, "learning_rate": 7.20581020591854e-05, "loss": 0.4311, "step": 311 }, { "epoch": 1.4242510699001427, "grad_norm": 0.8933771253980181, "learning_rate": 7.19816404220332e-05, "loss": 0.4337, "step": 312 }, { "epoch": 1.4288159771754636, "grad_norm": 0.7078422348562297, "learning_rate": 7.190485345222981e-05, "loss": 0.4297, "step": 313 }, { "epoch": 1.4333808844507847, "grad_norm": 0.659819970126346, "learning_rate": 7.18277419308893e-05, "loss": 0.4258, "step": 314 }, { "epoch": 1.4379457917261056, "grad_norm": 0.5948618977549561, "learning_rate": 7.17503066424272e-05, "loss": 0.4307, "step": 315 }, { "epoch": 1.4425106990014265, "grad_norm": 0.5522078226460925, "learning_rate": 7.167254837455254e-05, "loss": 0.423, "step": 316 }, { "epoch": 1.4470756062767476, "grad_norm": 0.46370094074478, "learning_rate": 7.15944679182599e-05, "loss": 0.4226, "step": 317 }, { "epoch": 1.4516405135520685, "grad_norm": 0.4538500924366929, "learning_rate": 7.15160660678213e-05, "loss": 0.4334, "step": 318 }, { "epoch": 1.4562054208273896, "grad_norm": 0.5046426426810093, "learning_rate": 7.143734362077809e-05, "loss": 0.4333, "step": 319 }, { "epoch": 1.4607703281027105, "grad_norm": 0.4578123337897367, "learning_rate": 7.135830137793295e-05, "loss": 0.4236, "step": 320 }, { "epoch": 1.4653352353780313, "grad_norm": 0.4655315773535026, "learning_rate": 7.127894014334163e-05, "loss": 0.4279, "step": 321 }, { "epoch": 1.4699001426533522, "grad_norm": 0.4651249472363875, "learning_rate": 7.119926072430485e-05, "loss": 0.4253, "step": 322 }, { "epoch": 1.4744650499286733, "grad_norm": 0.4978505684470615, "learning_rate": 7.111926393136002e-05, "loss": 0.428, "step": 323 }, { "epoch": 1.4790299572039942, "grad_norm": 0.536231847135624, "learning_rate": 7.103895057827308e-05, "loss": 0.43, "step": 324 }, { "epoch": 1.4835948644793153, "grad_norm": 0.3912812640636196, "learning_rate": 7.095832148203013e-05, "loss": 0.4295, "step": 325 }, { "epoch": 1.4881597717546362, "grad_norm": 0.40464250838428917, "learning_rate": 7.087737746282916e-05, "loss": 0.4277, "step": 326 }, { "epoch": 1.492724679029957, "grad_norm": 0.5277590990428639, "learning_rate": 7.079611934407173e-05, "loss": 0.4335, "step": 327 }, { "epoch": 1.4972895863052782, "grad_norm": 0.7191902847601642, "learning_rate": 7.071454795235457e-05, "loss": 0.428, "step": 328 }, { "epoch": 1.5018544935805993, "grad_norm": 0.8201553068276188, "learning_rate": 7.063266411746116e-05, "loss": 0.4243, "step": 329 }, { "epoch": 1.5064194008559202, "grad_norm": 0.8029687675080593, "learning_rate": 7.055046867235331e-05, "loss": 0.4297, "step": 330 }, { "epoch": 1.510984308131241, "grad_norm": 0.7677427204023601, "learning_rate": 7.046796245316267e-05, "loss": 0.428, "step": 331 }, { "epoch": 1.515549215406562, "grad_norm": 0.7499549959421509, "learning_rate": 7.038514629918228e-05, "loss": 0.4307, "step": 332 }, { "epoch": 1.520114122681883, "grad_norm": 0.8211692796656908, "learning_rate": 7.030202105285792e-05, "loss": 0.433, "step": 333 }, { "epoch": 1.524679029957204, "grad_norm": 1.0730200098155518, "learning_rate": 7.021858755977964e-05, "loss": 0.4241, "step": 334 }, { "epoch": 1.529243937232525, "grad_norm": 1.1727832985354492, "learning_rate": 7.013484666867312e-05, "loss": 0.4333, "step": 335 }, { "epoch": 1.533808844507846, "grad_norm": 0.7964684055522075, "learning_rate": 7.005079923139104e-05, "loss": 0.4317, "step": 336 }, { "epoch": 1.5383737517831668, "grad_norm": 0.7927868140861069, "learning_rate": 6.996644610290441e-05, "loss": 0.4293, "step": 337 }, { "epoch": 1.542938659058488, "grad_norm": 0.722547974680119, "learning_rate": 6.988178814129388e-05, "loss": 0.4297, "step": 338 }, { "epoch": 1.5475035663338088, "grad_norm": 0.5891193027337546, "learning_rate": 6.979682620774104e-05, "loss": 0.4301, "step": 339 }, { "epoch": 1.55206847360913, "grad_norm": 0.5088111344725053, "learning_rate": 6.971156116651958e-05, "loss": 0.4244, "step": 340 }, { "epoch": 1.5566333808844508, "grad_norm": 0.5076201573377456, "learning_rate": 6.962599388498657e-05, "loss": 0.4257, "step": 341 }, { "epoch": 1.5611982881597717, "grad_norm": 0.5666496245303375, "learning_rate": 6.954012523357362e-05, "loss": 0.4308, "step": 342 }, { "epoch": 1.5657631954350926, "grad_norm": 0.5307337032247661, "learning_rate": 6.945395608577801e-05, "loss": 0.4356, "step": 343 }, { "epoch": 1.5703281027104137, "grad_norm": 0.44314588391528825, "learning_rate": 6.936748731815382e-05, "loss": 0.4249, "step": 344 }, { "epoch": 1.5748930099857348, "grad_norm": 0.4804047628466697, "learning_rate": 6.9280719810303e-05, "loss": 0.4332, "step": 345 }, { "epoch": 1.5794579172610557, "grad_norm": 0.4642513033873333, "learning_rate": 6.919365444486644e-05, "loss": 0.4226, "step": 346 }, { "epoch": 1.5840228245363766, "grad_norm": 0.5568335056382052, "learning_rate": 6.910629210751497e-05, "loss": 0.4238, "step": 347 }, { "epoch": 1.5885877318116974, "grad_norm": 0.5686750155147203, "learning_rate": 6.901863368694036e-05, "loss": 0.4229, "step": 348 }, { "epoch": 1.5931526390870185, "grad_norm": 0.5594409908646193, "learning_rate": 6.893068007484628e-05, "loss": 0.431, "step": 349 }, { "epoch": 1.5977175463623396, "grad_norm": 0.7542738892463182, "learning_rate": 6.884243216593928e-05, "loss": 0.4278, "step": 350 }, { "epoch": 1.6022824536376605, "grad_norm": 0.909785720326975, "learning_rate": 6.875389085791956e-05, "loss": 0.4237, "step": 351 }, { "epoch": 1.6068473609129814, "grad_norm": 1.0518321051704833, "learning_rate": 6.866505705147195e-05, "loss": 0.4297, "step": 352 }, { "epoch": 1.6114122681883023, "grad_norm": 1.1017254090246513, "learning_rate": 6.857593165025674e-05, "loss": 0.4245, "step": 353 }, { "epoch": 1.6159771754636234, "grad_norm": 0.8187539090198782, "learning_rate": 6.848651556090042e-05, "loss": 0.4258, "step": 354 }, { "epoch": 1.6205420827389445, "grad_norm": 0.5573740074216886, "learning_rate": 6.839680969298653e-05, "loss": 0.4285, "step": 355 }, { "epoch": 1.6251069900142654, "grad_norm": 0.6164738882872269, "learning_rate": 6.830681495904637e-05, "loss": 0.4286, "step": 356 }, { "epoch": 1.6296718972895863, "grad_norm": 0.8801449131438988, "learning_rate": 6.821653227454973e-05, "loss": 0.4221, "step": 357 }, { "epoch": 1.6342368045649072, "grad_norm": 1.0792158764449444, "learning_rate": 6.812596255789553e-05, "loss": 0.4267, "step": 358 }, { "epoch": 1.6388017118402283, "grad_norm": 0.8680834731545233, "learning_rate": 6.80351067304026e-05, "loss": 0.424, "step": 359 }, { "epoch": 1.6433666191155494, "grad_norm": 0.6086251558676785, "learning_rate": 6.794396571630015e-05, "loss": 0.4293, "step": 360 }, { "epoch": 1.6479315263908703, "grad_norm": 0.501395339998093, "learning_rate": 6.785254044271848e-05, "loss": 0.4285, "step": 361 }, { "epoch": 1.6524964336661911, "grad_norm": 0.5979602956912436, "learning_rate": 6.776083183967951e-05, "loss": 0.4275, "step": 362 }, { "epoch": 1.657061340941512, "grad_norm": 0.6937203619342673, "learning_rate": 6.766884084008734e-05, "loss": 0.425, "step": 363 }, { "epoch": 1.6616262482168331, "grad_norm": 0.7101090354502062, "learning_rate": 6.757656837971872e-05, "loss": 0.4309, "step": 364 }, { "epoch": 1.666191155492154, "grad_norm": 0.643930488180388, "learning_rate": 6.748401539721353e-05, "loss": 0.4287, "step": 365 }, { "epoch": 1.6707560627674751, "grad_norm": 0.7810718246548566, "learning_rate": 6.739118283406533e-05, "loss": 0.4264, "step": 366 }, { "epoch": 1.675320970042796, "grad_norm": 0.9283460979463781, "learning_rate": 6.729807163461165e-05, "loss": 0.4319, "step": 367 }, { "epoch": 1.679885877318117, "grad_norm": 0.9210967683307637, "learning_rate": 6.720468274602446e-05, "loss": 0.4282, "step": 368 }, { "epoch": 1.684450784593438, "grad_norm": 0.6497876560161834, "learning_rate": 6.711101711830054e-05, "loss": 0.4284, "step": 369 }, { "epoch": 1.6890156918687589, "grad_norm": 0.4725196750675396, "learning_rate": 6.701707570425174e-05, "loss": 0.4262, "step": 370 }, { "epoch": 1.69358059914408, "grad_norm": 0.5985153130104276, "learning_rate": 6.69228594594954e-05, "loss": 0.4309, "step": 371 }, { "epoch": 1.6981455064194009, "grad_norm": 0.7045200474002776, "learning_rate": 6.682836934244452e-05, "loss": 0.4268, "step": 372 }, { "epoch": 1.7027104136947218, "grad_norm": 0.7522673563908682, "learning_rate": 6.67336063142981e-05, "loss": 0.429, "step": 373 }, { "epoch": 1.7072753209700426, "grad_norm": 0.5776326484944384, "learning_rate": 6.663857133903128e-05, "loss": 0.4243, "step": 374 }, { "epoch": 1.7118402282453637, "grad_norm": 0.34017458565072894, "learning_rate": 6.654326538338565e-05, "loss": 0.4235, "step": 375 }, { "epoch": 1.7164051355206849, "grad_norm": 0.5718314127403422, "learning_rate": 6.644768941685928e-05, "loss": 0.4223, "step": 376 }, { "epoch": 1.7209700427960057, "grad_norm": 0.7259779441818939, "learning_rate": 6.63518444116969e-05, "loss": 0.4257, "step": 377 }, { "epoch": 1.7255349500713266, "grad_norm": 0.5302976449413679, "learning_rate": 6.625573134288012e-05, "loss": 0.4156, "step": 378 }, { "epoch": 1.7300998573466475, "grad_norm": 0.4044589367438007, "learning_rate": 6.615935118811737e-05, "loss": 0.4217, "step": 379 }, { "epoch": 1.7346647646219686, "grad_norm": 0.6930212301593457, "learning_rate": 6.606270492783395e-05, "loss": 0.4228, "step": 380 }, { "epoch": 1.7392296718972897, "grad_norm": 0.7344014385124219, "learning_rate": 6.596579354516225e-05, "loss": 0.4232, "step": 381 }, { "epoch": 1.7437945791726106, "grad_norm": 0.5534474445480675, "learning_rate": 6.586861802593147e-05, "loss": 0.4233, "step": 382 }, { "epoch": 1.7483594864479315, "grad_norm": 0.6126829361696522, "learning_rate": 6.577117935865785e-05, "loss": 0.4268, "step": 383 }, { "epoch": 1.7529243937232524, "grad_norm": 0.6533034681110785, "learning_rate": 6.567347853453439e-05, "loss": 0.422, "step": 384 }, { "epoch": 1.7574893009985735, "grad_norm": 0.5424516205530866, "learning_rate": 6.557551654742099e-05, "loss": 0.4198, "step": 385 }, { "epoch": 1.7620542082738946, "grad_norm": 0.6137606774154565, "learning_rate": 6.547729439383414e-05, "loss": 0.4266, "step": 386 }, { "epoch": 1.7666191155492155, "grad_norm": 0.6945685962943688, "learning_rate": 6.537881307293691e-05, "loss": 0.4231, "step": 387 }, { "epoch": 1.7711840228245364, "grad_norm": 0.6051697889503573, "learning_rate": 6.528007358652871e-05, "loss": 0.424, "step": 388 }, { "epoch": 1.7757489300998572, "grad_norm": 0.497689232889849, "learning_rate": 6.518107693903519e-05, "loss": 0.4221, "step": 389 }, { "epoch": 1.7803138373751783, "grad_norm": 0.3881400873717001, "learning_rate": 6.50818241374979e-05, "loss": 0.4266, "step": 390 }, { "epoch": 1.7848787446504994, "grad_norm": 0.4053350043040276, "learning_rate": 6.498231619156416e-05, "loss": 0.4212, "step": 391 }, { "epoch": 1.7894436519258203, "grad_norm": 0.4228076243058947, "learning_rate": 6.488255411347673e-05, "loss": 0.421, "step": 392 }, { "epoch": 1.7940085592011412, "grad_norm": 0.3651462975399496, "learning_rate": 6.478253891806353e-05, "loss": 0.4203, "step": 393 }, { "epoch": 1.798573466476462, "grad_norm": 0.3813241127258593, "learning_rate": 6.468227162272726e-05, "loss": 0.4256, "step": 394 }, { "epoch": 1.8031383737517832, "grad_norm": 0.4876471377127721, "learning_rate": 6.45817532474352e-05, "loss": 0.4249, "step": 395 }, { "epoch": 1.807703281027104, "grad_norm": 0.5450947841751673, "learning_rate": 6.448098481470863e-05, "loss": 0.4203, "step": 396 }, { "epoch": 1.8122681883024252, "grad_norm": 0.6516167352248161, "learning_rate": 6.437996734961262e-05, "loss": 0.4306, "step": 397 }, { "epoch": 1.816833095577746, "grad_norm": 0.7968992579354538, "learning_rate": 6.427870187974548e-05, "loss": 0.4213, "step": 398 }, { "epoch": 1.821398002853067, "grad_norm": 0.8730538354851684, "learning_rate": 6.417718943522835e-05, "loss": 0.4315, "step": 399 }, { "epoch": 1.825962910128388, "grad_norm": 0.801131615179908, "learning_rate": 6.407543104869469e-05, "loss": 0.4209, "step": 400 }, { "epoch": 1.830527817403709, "grad_norm": 0.645824395128567, "learning_rate": 6.397342775527982e-05, "loss": 0.4277, "step": 401 }, { "epoch": 1.83509272467903, "grad_norm": 0.7410730591928698, "learning_rate": 6.38711805926104e-05, "loss": 0.4196, "step": 402 }, { "epoch": 1.839657631954351, "grad_norm": 0.9347780145308188, "learning_rate": 6.376869060079381e-05, "loss": 0.4226, "step": 403 }, { "epoch": 1.8442225392296718, "grad_norm": 0.8659116193168973, "learning_rate": 6.366595882240761e-05, "loss": 0.4255, "step": 404 }, { "epoch": 1.8487874465049927, "grad_norm": 0.7573788312065334, "learning_rate": 6.356298630248893e-05, "loss": 0.4319, "step": 405 }, { "epoch": 1.8533523537803138, "grad_norm": 0.698786388331663, "learning_rate": 6.345977408852383e-05, "loss": 0.423, "step": 406 }, { "epoch": 1.857917261055635, "grad_norm": 0.6540039704024841, "learning_rate": 6.335632323043671e-05, "loss": 0.4239, "step": 407 }, { "epoch": 1.8624821683309558, "grad_norm": 0.5449620669554056, "learning_rate": 6.325263478057947e-05, "loss": 0.4279, "step": 408 }, { "epoch": 1.8670470756062767, "grad_norm": 0.4777538440000814, "learning_rate": 6.314870979372102e-05, "loss": 0.4247, "step": 409 }, { "epoch": 1.8716119828815976, "grad_norm": 0.38933468433699886, "learning_rate": 6.304454932703633e-05, "loss": 0.4231, "step": 410 }, { "epoch": 1.8761768901569187, "grad_norm": 0.4120383799689869, "learning_rate": 6.29401544400959e-05, "loss": 0.4293, "step": 411 }, { "epoch": 1.8807417974322398, "grad_norm": 0.47381071482947645, "learning_rate": 6.283552619485476e-05, "loss": 0.4177, "step": 412 }, { "epoch": 1.8853067047075607, "grad_norm": 0.3580390483686331, "learning_rate": 6.27306656556418e-05, "loss": 0.4196, "step": 413 }, { "epoch": 1.8898716119828816, "grad_norm": 0.38078475072495094, "learning_rate": 6.2625573889149e-05, "loss": 0.4193, "step": 414 }, { "epoch": 1.8944365192582024, "grad_norm": 0.4278816772061641, "learning_rate": 6.25202519644204e-05, "loss": 0.418, "step": 415 }, { "epoch": 1.8990014265335236, "grad_norm": 0.4475822733130566, "learning_rate": 6.241470095284133e-05, "loss": 0.4262, "step": 416 }, { "epoch": 1.9035663338088447, "grad_norm": 0.5284917554953473, "learning_rate": 6.230892192812752e-05, "loss": 0.4241, "step": 417 }, { "epoch": 1.9081312410841655, "grad_norm": 0.5411660617767071, "learning_rate": 6.220291596631417e-05, "loss": 0.425, "step": 418 }, { "epoch": 1.9126961483594864, "grad_norm": 0.4446094839845814, "learning_rate": 6.209668414574502e-05, "loss": 0.4217, "step": 419 }, { "epoch": 1.9172610556348073, "grad_norm": 0.38671232428761154, "learning_rate": 6.199022754706127e-05, "loss": 0.4288, "step": 420 }, { "epoch": 1.9218259629101284, "grad_norm": 0.3611456455331726, "learning_rate": 6.188354725319074e-05, "loss": 0.4217, "step": 421 }, { "epoch": 1.9263908701854495, "grad_norm": 0.327267880459524, "learning_rate": 6.177664434933676e-05, "loss": 0.4239, "step": 422 }, { "epoch": 1.9309557774607704, "grad_norm": 0.2734657156850823, "learning_rate": 6.166951992296716e-05, "loss": 0.424, "step": 423 }, { "epoch": 1.9355206847360913, "grad_norm": 0.30352197416491244, "learning_rate": 6.15621750638032e-05, "loss": 0.422, "step": 424 }, { "epoch": 1.9400855920114122, "grad_norm": 0.2974349992819165, "learning_rate": 6.145461086380848e-05, "loss": 0.4251, "step": 425 }, { "epoch": 1.9446504992867333, "grad_norm": 0.3823668291519408, "learning_rate": 6.134682841717792e-05, "loss": 0.4165, "step": 426 }, { "epoch": 1.9492154065620542, "grad_norm": 0.6312244786498952, "learning_rate": 6.123882882032639e-05, "loss": 0.4194, "step": 427 }, { "epoch": 1.9537803138373753, "grad_norm": 0.8700056231822155, "learning_rate": 6.113061317187789e-05, "loss": 0.4231, "step": 428 }, { "epoch": 1.9583452211126962, "grad_norm": 0.9887590587117958, "learning_rate": 6.1022182572654063e-05, "loss": 0.4228, "step": 429 }, { "epoch": 1.962910128388017, "grad_norm": 0.9884807393198494, "learning_rate": 6.0913538125663236e-05, "loss": 0.4306, "step": 430 }, { "epoch": 1.967475035663338, "grad_norm": 0.911102518039347, "learning_rate": 6.0804680936089025e-05, "loss": 0.4225, "step": 431 }, { "epoch": 1.972039942938659, "grad_norm": 0.7402245060126661, "learning_rate": 6.069561211127919e-05, "loss": 0.4222, "step": 432 }, { "epoch": 1.9766048502139801, "grad_norm": 0.5083536699969557, "learning_rate": 6.05863327607344e-05, "loss": 0.4219, "step": 433 }, { "epoch": 1.981169757489301, "grad_norm": 0.3496862590580212, "learning_rate": 6.0476843996096795e-05, "loss": 0.4203, "step": 434 }, { "epoch": 1.985734664764622, "grad_norm": 0.4037616186832233, "learning_rate": 6.0367146931138866e-05, "loss": 0.4184, "step": 435 }, { "epoch": 1.9902995720399428, "grad_norm": 0.4436594227881451, "learning_rate": 6.025724268175197e-05, "loss": 0.4208, "step": 436 }, { "epoch": 1.994864479315264, "grad_norm": 0.4512851557242651, "learning_rate": 6.0147132365935065e-05, "loss": 0.4165, "step": 437 }, { "epoch": 1.999429386590585, "grad_norm": 0.4765645074474141, "learning_rate": 6.003681710378335e-05, "loss": 0.4225, "step": 438 }, { "epoch": 2.003994293865906, "grad_norm": 0.9681866786412074, "learning_rate": 5.9926298017476774e-05, "loss": 0.7716, "step": 439 }, { "epoch": 2.0085592011412268, "grad_norm": 1.5630521704235794, "learning_rate": 5.981557623126876e-05, "loss": 0.4014, "step": 440 }, { "epoch": 2.0131241084165477, "grad_norm": 0.571055488243649, "learning_rate": 5.970465287147461e-05, "loss": 0.4024, "step": 441 }, { "epoch": 2.017689015691869, "grad_norm": 1.7926013030849834, "learning_rate": 5.959352906646018e-05, "loss": 0.4049, "step": 442 }, { "epoch": 2.02225392296719, "grad_norm": 0.8802074425647698, "learning_rate": 5.948220594663035e-05, "loss": 0.4054, "step": 443 }, { "epoch": 2.0268188302425107, "grad_norm": 1.8078241844652807, "learning_rate": 5.93706846444175e-05, "loss": 0.4158, "step": 444 }, { "epoch": 2.0313837375178316, "grad_norm": 1.3366588545623441, "learning_rate": 5.925896629427006e-05, "loss": 0.4088, "step": 445 }, { "epoch": 2.0359486447931525, "grad_norm": 1.2979771341051034, "learning_rate": 5.9147052032640886e-05, "loss": 0.4112, "step": 446 }, { "epoch": 2.0405135520684734, "grad_norm": 1.0266378699260492, "learning_rate": 5.9034942997975744e-05, "loss": 0.4105, "step": 447 }, { "epoch": 2.0450784593437947, "grad_norm": 1.0531734053882162, "learning_rate": 5.8922640330701734e-05, "loss": 0.4069, "step": 448 }, { "epoch": 2.0496433666191156, "grad_norm": 0.7366171898891467, "learning_rate": 5.8810145173215694e-05, "loss": 0.3995, "step": 449 }, { "epoch": 2.0542082738944365, "grad_norm": 0.8679822170363295, "learning_rate": 5.869745866987256e-05, "loss": 0.4064, "step": 450 }, { "epoch": 2.0587731811697574, "grad_norm": 0.6944746602819868, "learning_rate": 5.8584581966973696e-05, "loss": 0.403, "step": 451 }, { "epoch": 2.0633380884450783, "grad_norm": 0.5466028481958466, "learning_rate": 5.847151621275531e-05, "loss": 0.3997, "step": 452 }, { "epoch": 2.0679029957203996, "grad_norm": 0.6055399950709338, "learning_rate": 5.8358262557376725e-05, "loss": 0.3994, "step": 453 }, { "epoch": 2.0724679029957205, "grad_norm": 0.4848917512592404, "learning_rate": 5.824482215290865e-05, "loss": 0.404, "step": 454 }, { "epoch": 2.0770328102710414, "grad_norm": 0.498636327804515, "learning_rate": 5.813119615332154e-05, "loss": 0.3993, "step": 455 }, { "epoch": 2.0815977175463622, "grad_norm": 0.46905229993366143, "learning_rate": 5.801738571447378e-05, "loss": 0.4053, "step": 456 }, { "epoch": 2.086162624821683, "grad_norm": 0.4431759724923076, "learning_rate": 5.79033919941e-05, "loss": 0.3966, "step": 457 }, { "epoch": 2.0907275320970045, "grad_norm": 0.5210110584044579, "learning_rate": 5.7789216151799196e-05, "loss": 0.3918, "step": 458 }, { "epoch": 2.0952924393723253, "grad_norm": 0.43971729383879754, "learning_rate": 5.7674859349023064e-05, "loss": 0.4008, "step": 459 }, { "epoch": 2.0998573466476462, "grad_norm": 0.3602233291051038, "learning_rate": 5.756032274906405e-05, "loss": 0.3985, "step": 460 }, { "epoch": 2.104422253922967, "grad_norm": 0.3514674260539934, "learning_rate": 5.7445607517043646e-05, "loss": 0.3948, "step": 461 }, { "epoch": 2.108987161198288, "grad_norm": 0.3740970628030136, "learning_rate": 5.733071481990046e-05, "loss": 0.3969, "step": 462 }, { "epoch": 2.1135520684736093, "grad_norm": 0.3848411215975852, "learning_rate": 5.721564582637829e-05, "loss": 0.3997, "step": 463 }, { "epoch": 2.11811697574893, "grad_norm": 0.36362834255516524, "learning_rate": 5.710040170701443e-05, "loss": 0.3941, "step": 464 }, { "epoch": 2.122681883024251, "grad_norm": 0.387749019583618, "learning_rate": 5.6984983634127534e-05, "loss": 0.3964, "step": 465 }, { "epoch": 2.127246790299572, "grad_norm": 0.28828513667702704, "learning_rate": 5.686939278180585e-05, "loss": 0.3947, "step": 466 }, { "epoch": 2.131811697574893, "grad_norm": 0.30878428184767404, "learning_rate": 5.675363032589521e-05, "loss": 0.4029, "step": 467 }, { "epoch": 2.1363766048502137, "grad_norm": 0.3090749309313414, "learning_rate": 5.6637697443987044e-05, "loss": 0.3957, "step": 468 }, { "epoch": 2.140941512125535, "grad_norm": 0.3150439666682486, "learning_rate": 5.6521595315406505e-05, "loss": 0.3982, "step": 469 }, { "epoch": 2.145506419400856, "grad_norm": 0.30763731181006676, "learning_rate": 5.640532512120036e-05, "loss": 0.3978, "step": 470 }, { "epoch": 2.150071326676177, "grad_norm": 0.3156068121770724, "learning_rate": 5.6288888044125005e-05, "loss": 0.3989, "step": 471 }, { "epoch": 2.1546362339514977, "grad_norm": 0.3220879180231534, "learning_rate": 5.6172285268634503e-05, "loss": 0.3966, "step": 472 }, { "epoch": 2.159201141226819, "grad_norm": 0.2977772434873077, "learning_rate": 5.6055517980868434e-05, "loss": 0.3953, "step": 473 }, { "epoch": 2.16376604850214, "grad_norm": 0.29810536611214883, "learning_rate": 5.59385873686399e-05, "loss": 0.3918, "step": 474 }, { "epoch": 2.168330955777461, "grad_norm": 0.2852081639489458, "learning_rate": 5.582149462142341e-05, "loss": 0.3917, "step": 475 }, { "epoch": 2.1728958630527817, "grad_norm": 0.3081209690427737, "learning_rate": 5.570424093034279e-05, "loss": 0.3968, "step": 476 }, { "epoch": 2.1774607703281026, "grad_norm": 0.29194114681550415, "learning_rate": 5.558682748815907e-05, "loss": 0.3976, "step": 477 }, { "epoch": 2.1820256776034235, "grad_norm": 0.2933275523587884, "learning_rate": 5.546925548925831e-05, "loss": 0.396, "step": 478 }, { "epoch": 2.186590584878745, "grad_norm": 0.3010159866072722, "learning_rate": 5.5351526129639556e-05, "loss": 0.3947, "step": 479 }, { "epoch": 2.1911554921540657, "grad_norm": 0.22593376332738507, "learning_rate": 5.523364060690253e-05, "loss": 0.3947, "step": 480 }, { "epoch": 2.1957203994293866, "grad_norm": 0.263407864251361, "learning_rate": 5.511560012023558e-05, "loss": 0.3968, "step": 481 }, { "epoch": 2.2002853067047075, "grad_norm": 0.2621235557428232, "learning_rate": 5.499740587040337e-05, "loss": 0.3957, "step": 482 }, { "epoch": 2.2048502139800283, "grad_norm": 0.28672274408401716, "learning_rate": 5.487905905973474e-05, "loss": 0.3982, "step": 483 }, { "epoch": 2.2094151212553497, "grad_norm": 0.2961044553045987, "learning_rate": 5.476056089211047e-05, "loss": 0.3953, "step": 484 }, { "epoch": 2.2139800285306706, "grad_norm": 0.24362816536099371, "learning_rate": 5.464191257295099e-05, "loss": 0.3947, "step": 485 }, { "epoch": 2.2185449358059914, "grad_norm": 0.20568524425771714, "learning_rate": 5.4523115309204154e-05, "loss": 0.3904, "step": 486 }, { "epoch": 2.2231098430813123, "grad_norm": 0.23433795861015624, "learning_rate": 5.440417030933296e-05, "loss": 0.3887, "step": 487 }, { "epoch": 2.227674750356633, "grad_norm": 0.2637906019753822, "learning_rate": 5.4285078783303204e-05, "loss": 0.398, "step": 488 }, { "epoch": 2.2322396576319545, "grad_norm": 0.27336753680428566, "learning_rate": 5.41658419425713e-05, "loss": 0.4012, "step": 489 }, { "epoch": 2.2368045649072754, "grad_norm": 0.36009442351022874, "learning_rate": 5.404646100007179e-05, "loss": 0.3946, "step": 490 }, { "epoch": 2.2413694721825963, "grad_norm": 0.34745386802364503, "learning_rate": 5.3926937170205147e-05, "loss": 0.3988, "step": 491 }, { "epoch": 2.245934379457917, "grad_norm": 0.2560762047673561, "learning_rate": 5.3807271668825336e-05, "loss": 0.3981, "step": 492 }, { "epoch": 2.250499286733238, "grad_norm": 0.3039971848868009, "learning_rate": 5.368746571322746e-05, "loss": 0.3983, "step": 493 }, { "epoch": 2.2550641940085594, "grad_norm": 0.3132362113371044, "learning_rate": 5.356752052213543e-05, "loss": 0.3949, "step": 494 }, { "epoch": 2.2596291012838803, "grad_norm": 0.24114011641199656, "learning_rate": 5.344743731568947e-05, "loss": 0.398, "step": 495 }, { "epoch": 2.264194008559201, "grad_norm": 0.3047112832257029, "learning_rate": 5.3327217315433836e-05, "loss": 0.3942, "step": 496 }, { "epoch": 2.268758915834522, "grad_norm": 0.2902868163178195, "learning_rate": 5.320686174430426e-05, "loss": 0.3968, "step": 497 }, { "epoch": 2.273323823109843, "grad_norm": 0.21554384652324848, "learning_rate": 5.30863718266156e-05, "loss": 0.3945, "step": 498 }, { "epoch": 2.277888730385164, "grad_norm": 0.27255050697912414, "learning_rate": 5.296574878804931e-05, "loss": 0.3968, "step": 499 }, { "epoch": 2.282453637660485, "grad_norm": 0.24417856593881826, "learning_rate": 5.284499385564105e-05, "loss": 0.3943, "step": 500 }, { "epoch": 2.287018544935806, "grad_norm": 0.2765091011577604, "learning_rate": 5.272410825776817e-05, "loss": 0.3977, "step": 501 }, { "epoch": 2.291583452211127, "grad_norm": 0.25251367644483425, "learning_rate": 5.260309322413717e-05, "loss": 0.3965, "step": 502 }, { "epoch": 2.296148359486448, "grad_norm": 0.225515136380665, "learning_rate": 5.2481949985771296e-05, "loss": 0.397, "step": 503 }, { "epoch": 2.300713266761769, "grad_norm": 0.2463270131913551, "learning_rate": 5.23606797749979e-05, "loss": 0.3911, "step": 504 }, { "epoch": 2.30527817403709, "grad_norm": 0.2004757886081446, "learning_rate": 5.223928382543599e-05, "loss": 0.4003, "step": 505 }, { "epoch": 2.309843081312411, "grad_norm": 0.2455120366926149, "learning_rate": 5.211776337198362e-05, "loss": 0.3966, "step": 506 }, { "epoch": 2.314407988587732, "grad_norm": 0.22468493984434562, "learning_rate": 5.199611965080539e-05, "loss": 0.3936, "step": 507 }, { "epoch": 2.3189728958630527, "grad_norm": 0.2272194472097314, "learning_rate": 5.187435389931984e-05, "loss": 0.3995, "step": 508 }, { "epoch": 2.3235378031383735, "grad_norm": 0.24682570308387836, "learning_rate": 5.175246735618681e-05, "loss": 0.3936, "step": 509 }, { "epoch": 2.328102710413695, "grad_norm": 0.27271797206857185, "learning_rate": 5.163046126129496e-05, "loss": 0.3979, "step": 510 }, { "epoch": 2.3326676176890158, "grad_norm": 0.32336247263945955, "learning_rate": 5.1508336855749046e-05, "loss": 0.3984, "step": 511 }, { "epoch": 2.3372325249643366, "grad_norm": 0.3117192297000078, "learning_rate": 5.138609538185732e-05, "loss": 0.3957, "step": 512 }, { "epoch": 2.3417974322396575, "grad_norm": 0.24977202926389208, "learning_rate": 5.126373808311897e-05, "loss": 0.401, "step": 513 }, { "epoch": 2.3463623395149784, "grad_norm": 0.2724907116929327, "learning_rate": 5.114126620421135e-05, "loss": 0.3992, "step": 514 }, { "epoch": 2.3509272467902997, "grad_norm": 0.30820474754190696, "learning_rate": 5.101868099097741e-05, "loss": 0.3891, "step": 515 }, { "epoch": 2.3554921540656206, "grad_norm": 0.3185274771440913, "learning_rate": 5.0895983690413013e-05, "loss": 0.3901, "step": 516 }, { "epoch": 2.3600570613409415, "grad_norm": 0.2581547898350787, "learning_rate": 5.077317555065417e-05, "loss": 0.3996, "step": 517 }, { "epoch": 2.3646219686162624, "grad_norm": 0.3277207154296582, "learning_rate": 5.065025782096443e-05, "loss": 0.4004, "step": 518 }, { "epoch": 2.3691868758915833, "grad_norm": 0.3507469445057821, "learning_rate": 5.052723175172216e-05, "loss": 0.3961, "step": 519 }, { "epoch": 2.3737517831669046, "grad_norm": 0.31296981719792, "learning_rate": 5.04040985944078e-05, "loss": 0.3916, "step": 520 }, { "epoch": 2.3783166904422255, "grad_norm": 0.25011058332771036, "learning_rate": 5.0280859601591134e-05, "loss": 0.3946, "step": 521 }, { "epoch": 2.3828815977175464, "grad_norm": 0.2843447982406519, "learning_rate": 5.015751602691853e-05, "loss": 0.3951, "step": 522 }, { "epoch": 2.3874465049928673, "grad_norm": 0.2611424665922288, "learning_rate": 5.003406912510028e-05, "loss": 0.3905, "step": 523 }, { "epoch": 2.392011412268188, "grad_norm": 0.24619751471031087, "learning_rate": 4.99105201518977e-05, "loss": 0.395, "step": 524 }, { "epoch": 2.3965763195435095, "grad_norm": 0.2808716720105363, "learning_rate": 4.9786870364110496e-05, "loss": 0.3999, "step": 525 }, { "epoch": 2.4011412268188304, "grad_norm": 0.2353990460137413, "learning_rate": 4.9663121019563825e-05, "loss": 0.3998, "step": 526 }, { "epoch": 2.4057061340941512, "grad_norm": 0.2143855784356749, "learning_rate": 4.953927337709564e-05, "loss": 0.39, "step": 527 }, { "epoch": 2.410271041369472, "grad_norm": 0.30814345667504994, "learning_rate": 4.941532869654383e-05, "loss": 0.3946, "step": 528 }, { "epoch": 2.414835948644793, "grad_norm": 0.2524848431138124, "learning_rate": 4.929128823873338e-05, "loss": 0.3916, "step": 529 }, { "epoch": 2.419400855920114, "grad_norm": 0.2949956098139477, "learning_rate": 4.916715326546356e-05, "loss": 0.3973, "step": 530 }, { "epoch": 2.423965763195435, "grad_norm": 0.2866631443858236, "learning_rate": 4.9042925039495126e-05, "loss": 0.4008, "step": 531 }, { "epoch": 2.428530670470756, "grad_norm": 0.24434538853731672, "learning_rate": 4.8918604824537426e-05, "loss": 0.3925, "step": 532 }, { "epoch": 2.433095577746077, "grad_norm": 0.27376916436365306, "learning_rate": 4.879419388523557e-05, "loss": 0.396, "step": 533 }, { "epoch": 2.437660485021398, "grad_norm": 0.29192166428895333, "learning_rate": 4.866969348715755e-05, "loss": 0.3939, "step": 534 }, { "epoch": 2.442225392296719, "grad_norm": 0.22678026571928478, "learning_rate": 4.8545104896781396e-05, "loss": 0.3931, "step": 535 }, { "epoch": 2.44679029957204, "grad_norm": 0.2714093005509255, "learning_rate": 4.8420429381482254e-05, "loss": 0.3918, "step": 536 }, { "epoch": 2.451355206847361, "grad_norm": 0.3672921491066907, "learning_rate": 4.829566820951953e-05, "loss": 0.3989, "step": 537 }, { "epoch": 2.455920114122682, "grad_norm": 0.37866544196679924, "learning_rate": 4.817082265002395e-05, "loss": 0.3945, "step": 538 }, { "epoch": 2.4604850213980027, "grad_norm": 0.342928509500153, "learning_rate": 4.804589397298467e-05, "loss": 0.3926, "step": 539 }, { "epoch": 2.4650499286733236, "grad_norm": 0.30273238621233484, "learning_rate": 4.792088344923639e-05, "loss": 0.3964, "step": 540 }, { "epoch": 2.469614835948645, "grad_norm": 0.2137070264105446, "learning_rate": 4.779579235044635e-05, "loss": 0.4013, "step": 541 }, { "epoch": 2.474179743223966, "grad_norm": 0.3118277046924234, "learning_rate": 4.767062194910147e-05, "loss": 0.3927, "step": 542 }, { "epoch": 2.4787446504992867, "grad_norm": 0.34415527393891965, "learning_rate": 4.7545373518495376e-05, "loss": 0.3981, "step": 543 }, { "epoch": 2.4833095577746076, "grad_norm": 0.25521983324842423, "learning_rate": 4.7420048332715424e-05, "loss": 0.3993, "step": 544 }, { "epoch": 2.4878744650499285, "grad_norm": 0.2861375081450235, "learning_rate": 4.7294647666629764e-05, "loss": 0.3962, "step": 545 }, { "epoch": 2.49243937232525, "grad_norm": 0.35327944054617416, "learning_rate": 4.716917279587438e-05, "loss": 0.3929, "step": 546 }, { "epoch": 2.4970042796005707, "grad_norm": 0.3065127258311644, "learning_rate": 4.704362499684009e-05, "loss": 0.3917, "step": 547 }, { "epoch": 2.5015691868758916, "grad_norm": 0.2971240514783471, "learning_rate": 4.691800554665959e-05, "loss": 0.3951, "step": 548 }, { "epoch": 2.5061340941512125, "grad_norm": 0.2526840084770543, "learning_rate": 4.679231572319442e-05, "loss": 0.3965, "step": 549 }, { "epoch": 2.5106990014265333, "grad_norm": 0.27506914680614103, "learning_rate": 4.666655680502203e-05, "loss": 0.3909, "step": 550 }, { "epoch": 2.5152639087018542, "grad_norm": 0.30990634996692296, "learning_rate": 4.654073007142268e-05, "loss": 0.3991, "step": 551 }, { "epoch": 2.5198288159771756, "grad_norm": 0.2418594039084614, "learning_rate": 4.641483680236654e-05, "loss": 0.3942, "step": 552 }, { "epoch": 2.5243937232524964, "grad_norm": 0.2176242576204114, "learning_rate": 4.628887827850056e-05, "loss": 0.3925, "step": 553 }, { "epoch": 2.5289586305278173, "grad_norm": 0.21882885808826003, "learning_rate": 4.6162855781135534e-05, "loss": 0.3967, "step": 554 }, { "epoch": 2.533523537803138, "grad_norm": 0.2753010159681235, "learning_rate": 4.6036770592233e-05, "loss": 0.3988, "step": 555 }, { "epoch": 2.5380884450784595, "grad_norm": 0.27865187586221307, "learning_rate": 4.591062399439223e-05, "loss": 0.3892, "step": 556 }, { "epoch": 2.5426533523537804, "grad_norm": 0.25469855214873266, "learning_rate": 4.578441727083718e-05, "loss": 0.3975, "step": 557 }, { "epoch": 2.5472182596291013, "grad_norm": 0.20957376603739114, "learning_rate": 4.5658151705403416e-05, "loss": 0.4014, "step": 558 }, { "epoch": 2.551783166904422, "grad_norm": 0.2600853257379872, "learning_rate": 4.553182858252514e-05, "loss": 0.3971, "step": 559 }, { "epoch": 2.556348074179743, "grad_norm": 0.2776565181123319, "learning_rate": 4.540544918722195e-05, "loss": 0.4039, "step": 560 }, { "epoch": 2.560912981455064, "grad_norm": 0.23379879408515905, "learning_rate": 4.527901480508595e-05, "loss": 0.3945, "step": 561 }, { "epoch": 2.5654778887303853, "grad_norm": 0.24124737895746404, "learning_rate": 4.515252672226858e-05, "loss": 0.3945, "step": 562 }, { "epoch": 2.570042796005706, "grad_norm": 0.2917521663677071, "learning_rate": 4.5025986225467533e-05, "loss": 0.3934, "step": 563 }, { "epoch": 2.574607703281027, "grad_norm": 0.2530412971141297, "learning_rate": 4.4899394601913724e-05, "loss": 0.3937, "step": 564 }, { "epoch": 2.579172610556348, "grad_norm": 0.22637706164941146, "learning_rate": 4.477275313935807e-05, "loss": 0.3926, "step": 565 }, { "epoch": 2.5837375178316693, "grad_norm": 0.23838840534635813, "learning_rate": 4.464606312605858e-05, "loss": 0.3977, "step": 566 }, { "epoch": 2.58830242510699, "grad_norm": 0.24947138899802643, "learning_rate": 4.451932585076707e-05, "loss": 0.3959, "step": 567 }, { "epoch": 2.592867332382311, "grad_norm": 0.27260964086966344, "learning_rate": 4.439254260271615e-05, "loss": 0.3914, "step": 568 }, { "epoch": 2.597432239657632, "grad_norm": 0.23394296464858344, "learning_rate": 4.426571467160609e-05, "loss": 0.3987, "step": 569 }, { "epoch": 2.601997146932953, "grad_norm": 0.24289503390540143, "learning_rate": 4.413884334759169e-05, "loss": 0.3939, "step": 570 }, { "epoch": 2.6065620542082737, "grad_norm": 0.29117956540296497, "learning_rate": 4.401192992126918e-05, "loss": 0.3956, "step": 571 }, { "epoch": 2.611126961483595, "grad_norm": 0.2237089907238584, "learning_rate": 4.3884975683663076e-05, "loss": 0.3955, "step": 572 }, { "epoch": 2.615691868758916, "grad_norm": 0.24264576691075865, "learning_rate": 4.375798192621298e-05, "loss": 0.3928, "step": 573 }, { "epoch": 2.620256776034237, "grad_norm": 0.2751685982168088, "learning_rate": 4.363094994076063e-05, "loss": 0.3966, "step": 574 }, { "epoch": 2.6248216833095577, "grad_norm": 0.28747813965799607, "learning_rate": 4.350388101953652e-05, "loss": 0.3943, "step": 575 }, { "epoch": 2.629386590584879, "grad_norm": 0.2805974268000736, "learning_rate": 4.337677645514696e-05, "loss": 0.3937, "step": 576 }, { "epoch": 2.6339514978602, "grad_norm": 0.25668107207724133, "learning_rate": 4.3249637540560775e-05, "loss": 0.3992, "step": 577 }, { "epoch": 2.6385164051355208, "grad_norm": 0.22123626260157714, "learning_rate": 4.312246556909625e-05, "loss": 0.3905, "step": 578 }, { "epoch": 2.6430813124108417, "grad_norm": 0.2889764064202931, "learning_rate": 4.299526183440795e-05, "loss": 0.3939, "step": 579 }, { "epoch": 2.6476462196861625, "grad_norm": 0.2813860421214819, "learning_rate": 4.286802763047351e-05, "loss": 0.3915, "step": 580 }, { "epoch": 2.6522111269614834, "grad_norm": 0.19910247285499882, "learning_rate": 4.274076425158056e-05, "loss": 0.3948, "step": 581 }, { "epoch": 2.6567760342368043, "grad_norm": 0.29602208357801046, "learning_rate": 4.2613472992313475e-05, "loss": 0.394, "step": 582 }, { "epoch": 2.6613409415121256, "grad_norm": 0.3211859467751027, "learning_rate": 4.2486155147540275e-05, "loss": 0.3952, "step": 583 }, { "epoch": 2.6659058487874465, "grad_norm": 0.2957713814210968, "learning_rate": 4.235881201239938e-05, "loss": 0.3944, "step": 584 }, { "epoch": 2.6704707560627674, "grad_norm": 0.19210627965781088, "learning_rate": 4.22314448822865e-05, "loss": 0.3973, "step": 585 }, { "epoch": 2.6750356633380883, "grad_norm": 0.22185230850956422, "learning_rate": 4.210405505284146e-05, "loss": 0.3913, "step": 586 }, { "epoch": 2.6796005706134096, "grad_norm": 0.26397988476550216, "learning_rate": 4.197664381993495e-05, "loss": 0.3933, "step": 587 }, { "epoch": 2.6841654778887305, "grad_norm": 0.2616617955056205, "learning_rate": 4.1849212479655404e-05, "loss": 0.3978, "step": 588 }, { "epoch": 2.6887303851640514, "grad_norm": 0.2067735440913514, "learning_rate": 4.172176232829579e-05, "loss": 0.3918, "step": 589 }, { "epoch": 2.6932952924393723, "grad_norm": 0.2129097941200269, "learning_rate": 4.159429466234042e-05, "loss": 0.3934, "step": 590 }, { "epoch": 2.697860199714693, "grad_norm": 0.19320324303685169, "learning_rate": 4.146681077845184e-05, "loss": 0.4005, "step": 591 }, { "epoch": 2.702425106990014, "grad_norm": 0.21243002834135477, "learning_rate": 4.133931197345747e-05, "loss": 0.3914, "step": 592 }, { "epoch": 2.7069900142653354, "grad_norm": 0.21832438259968404, "learning_rate": 4.1211799544336604e-05, "loss": 0.3969, "step": 593 }, { "epoch": 2.7115549215406562, "grad_norm": 0.22033236523137045, "learning_rate": 4.108427478820707e-05, "loss": 0.3914, "step": 594 }, { "epoch": 2.716119828815977, "grad_norm": 0.22106513057663857, "learning_rate": 4.095673900231212e-05, "loss": 0.3951, "step": 595 }, { "epoch": 2.720684736091298, "grad_norm": 0.22871838319618967, "learning_rate": 4.0829193484007216e-05, "loss": 0.3965, "step": 596 }, { "epoch": 2.7252496433666193, "grad_norm": 0.22285026812146833, "learning_rate": 4.070163953074676e-05, "loss": 0.3896, "step": 597 }, { "epoch": 2.7298145506419402, "grad_norm": 0.21511862495234116, "learning_rate": 4.0574078440071056e-05, "loss": 0.3908, "step": 598 }, { "epoch": 2.734379457917261, "grad_norm": 0.21107167137669283, "learning_rate": 4.044651150959294e-05, "loss": 0.3917, "step": 599 }, { "epoch": 2.738944365192582, "grad_norm": 0.23870814693322587, "learning_rate": 4.031894003698467e-05, "loss": 0.3955, "step": 600 }, { "epoch": 2.743509272467903, "grad_norm": 0.22220033003757905, "learning_rate": 4.0191365319964724e-05, "loss": 0.3917, "step": 601 }, { "epoch": 2.7480741797432238, "grad_norm": 0.19453278202618698, "learning_rate": 4.006378865628455e-05, "loss": 0.3893, "step": 602 }, { "epoch": 2.752639087018545, "grad_norm": 0.23239542400687424, "learning_rate": 3.993621134371545e-05, "loss": 0.3933, "step": 603 }, { "epoch": 2.757203994293866, "grad_norm": 0.24428871107116487, "learning_rate": 3.980863468003529e-05, "loss": 0.3935, "step": 604 }, { "epoch": 2.761768901569187, "grad_norm": 0.19760601046120077, "learning_rate": 3.968105996301535e-05, "loss": 0.3918, "step": 605 }, { "epoch": 2.7663338088445077, "grad_norm": 0.2501742444848214, "learning_rate": 3.955348849040707e-05, "loss": 0.3897, "step": 606 }, { "epoch": 2.770898716119829, "grad_norm": 0.2319084390296337, "learning_rate": 3.942592155992895e-05, "loss": 0.3961, "step": 607 }, { "epoch": 2.77546362339515, "grad_norm": 0.2244356749086854, "learning_rate": 3.929836046925323e-05, "loss": 0.3943, "step": 608 }, { "epoch": 2.780028530670471, "grad_norm": 0.21979544227639491, "learning_rate": 3.91708065159928e-05, "loss": 0.3955, "step": 609 }, { "epoch": 2.7845934379457917, "grad_norm": 0.19742195901212328, "learning_rate": 3.904326099768789e-05, "loss": 0.3949, "step": 610 }, { "epoch": 2.7891583452211126, "grad_norm": 0.22234755877545048, "learning_rate": 3.8915725211792944e-05, "loss": 0.3953, "step": 611 }, { "epoch": 2.7937232524964335, "grad_norm": 0.2231966829166695, "learning_rate": 3.8788200455663416e-05, "loss": 0.3881, "step": 612 }, { "epoch": 2.7982881597717544, "grad_norm": 0.23610363359578615, "learning_rate": 3.8660688026542544e-05, "loss": 0.3953, "step": 613 }, { "epoch": 2.8028530670470757, "grad_norm": 0.24368628185935592, "learning_rate": 3.853318922154818e-05, "loss": 0.3913, "step": 614 }, { "epoch": 2.8074179743223966, "grad_norm": 0.22986532483466307, "learning_rate": 3.840570533765959e-05, "loss": 0.393, "step": 615 }, { "epoch": 2.8119828815977175, "grad_norm": 0.2384209501930521, "learning_rate": 3.827823767170423e-05, "loss": 0.391, "step": 616 }, { "epoch": 2.8165477888730384, "grad_norm": 0.22125793484405842, "learning_rate": 3.815078752034461e-05, "loss": 0.3946, "step": 617 }, { "epoch": 2.8211126961483597, "grad_norm": 0.2535177819158247, "learning_rate": 3.802335618006506e-05, "loss": 0.3915, "step": 618 }, { "epoch": 2.8256776034236806, "grad_norm": 0.2396475170274598, "learning_rate": 3.7895944947158535e-05, "loss": 0.3958, "step": 619 }, { "epoch": 2.8302425106990015, "grad_norm": 0.20749976321329167, "learning_rate": 3.77685551177135e-05, "loss": 0.3952, "step": 620 }, { "epoch": 2.8348074179743223, "grad_norm": 0.16508755912089354, "learning_rate": 3.764118798760064e-05, "loss": 0.3844, "step": 621 }, { "epoch": 2.8393723252496432, "grad_norm": 0.20726854721909735, "learning_rate": 3.7513844852459745e-05, "loss": 0.3945, "step": 622 }, { "epoch": 2.843937232524964, "grad_norm": 0.18560003219863108, "learning_rate": 3.738652700768653e-05, "loss": 0.3974, "step": 623 }, { "epoch": 2.8485021398002854, "grad_norm": 0.17587194547537272, "learning_rate": 3.725923574841945e-05, "loss": 0.3903, "step": 624 }, { "epoch": 2.8530670470756063, "grad_norm": 0.2133689614478487, "learning_rate": 3.7131972369526505e-05, "loss": 0.3939, "step": 625 }, { "epoch": 2.857631954350927, "grad_norm": 0.17416385677862986, "learning_rate": 3.700473816559207e-05, "loss": 0.3913, "step": 626 }, { "epoch": 2.862196861626248, "grad_norm": 0.17910642220727754, "learning_rate": 3.687753443090375e-05, "loss": 0.3991, "step": 627 }, { "epoch": 2.8667617689015694, "grad_norm": 0.18645917268183274, "learning_rate": 3.675036245943923e-05, "loss": 0.3933, "step": 628 }, { "epoch": 2.8713266761768903, "grad_norm": 0.22702514829934878, "learning_rate": 3.662322354485306e-05, "loss": 0.3927, "step": 629 }, { "epoch": 2.875891583452211, "grad_norm": 0.19751990109383138, "learning_rate": 3.6496118980463486e-05, "loss": 0.3913, "step": 630 }, { "epoch": 2.880456490727532, "grad_norm": 0.17351220103657103, "learning_rate": 3.6369050059239384e-05, "loss": 0.3918, "step": 631 }, { "epoch": 2.885021398002853, "grad_norm": 0.20221410813950186, "learning_rate": 3.624201807378703e-05, "loss": 0.3934, "step": 632 }, { "epoch": 2.889586305278174, "grad_norm": 0.15933130426619596, "learning_rate": 3.6115024316336944e-05, "loss": 0.3891, "step": 633 }, { "epoch": 2.894151212553495, "grad_norm": 0.18767258317171362, "learning_rate": 3.598807007873083e-05, "loss": 0.3935, "step": 634 }, { "epoch": 2.898716119828816, "grad_norm": 0.17072434668538097, "learning_rate": 3.586115665240832e-05, "loss": 0.3923, "step": 635 }, { "epoch": 2.903281027104137, "grad_norm": 0.19788995991405794, "learning_rate": 3.573428532839392e-05, "loss": 0.3924, "step": 636 }, { "epoch": 2.907845934379458, "grad_norm": 0.18823937945201608, "learning_rate": 3.560745739728387e-05, "loss": 0.389, "step": 637 }, { "epoch": 2.912410841654779, "grad_norm": 0.16176660376720053, "learning_rate": 3.548067414923294e-05, "loss": 0.3974, "step": 638 }, { "epoch": 2.9169757489301, "grad_norm": 0.15959896392390563, "learning_rate": 3.5353936873941435e-05, "loss": 0.3888, "step": 639 }, { "epoch": 2.921540656205421, "grad_norm": 0.161034937587383, "learning_rate": 3.522724686064194e-05, "loss": 0.3953, "step": 640 }, { "epoch": 2.926105563480742, "grad_norm": 0.18385006824388012, "learning_rate": 3.5100605398086296e-05, "loss": 0.3923, "step": 641 }, { "epoch": 2.9306704707560627, "grad_norm": 0.16430295552203472, "learning_rate": 3.497401377453247e-05, "loss": 0.3916, "step": 642 }, { "epoch": 2.9352353780313836, "grad_norm": 0.15722062849324564, "learning_rate": 3.484747327773142e-05, "loss": 0.3887, "step": 643 }, { "epoch": 2.9398002853067045, "grad_norm": 0.14393903212852108, "learning_rate": 3.472098519491406e-05, "loss": 0.3914, "step": 644 }, { "epoch": 2.944365192582026, "grad_norm": 0.17232972585909226, "learning_rate": 3.459455081277806e-05, "loss": 0.393, "step": 645 }, { "epoch": 2.9489300998573467, "grad_norm": 0.16872545932202826, "learning_rate": 3.446817141747487e-05, "loss": 0.394, "step": 646 }, { "epoch": 2.9534950071326675, "grad_norm": 0.20373345038244411, "learning_rate": 3.434184829459659e-05, "loss": 0.3955, "step": 647 }, { "epoch": 2.9580599144079884, "grad_norm": 0.16582792220621265, "learning_rate": 3.421558272916284e-05, "loss": 0.3897, "step": 648 }, { "epoch": 2.9626248216833098, "grad_norm": 0.17195177157795527, "learning_rate": 3.408937600560778e-05, "loss": 0.3931, "step": 649 }, { "epoch": 2.9671897289586306, "grad_norm": 0.17099363542536267, "learning_rate": 3.3963229407767014e-05, "loss": 0.3932, "step": 650 }, { "epoch": 2.9717546362339515, "grad_norm": 0.18406454501706795, "learning_rate": 3.3837144218864466e-05, "loss": 0.3926, "step": 651 }, { "epoch": 2.9763195435092724, "grad_norm": 0.17696057601736395, "learning_rate": 3.371112172149945e-05, "loss": 0.3951, "step": 652 }, { "epoch": 2.9808844507845933, "grad_norm": 0.20588927188905604, "learning_rate": 3.358516319763348e-05, "loss": 0.3908, "step": 653 }, { "epoch": 2.985449358059914, "grad_norm": 0.15024406311611282, "learning_rate": 3.3459269928577326e-05, "loss": 0.3965, "step": 654 }, { "epoch": 2.9900142653352355, "grad_norm": 0.18315742998535253, "learning_rate": 3.3333443194977985e-05, "loss": 0.3878, "step": 655 }, { "epoch": 2.9945791726105564, "grad_norm": 0.15190454402880918, "learning_rate": 3.32076842768056e-05, "loss": 0.4001, "step": 656 }, { "epoch": 2.9991440798858773, "grad_norm": 0.1802522348541421, "learning_rate": 3.3081994453340425e-05, "loss": 0.3912, "step": 657 }, { "epoch": 3.003708987161198, "grad_norm": 0.4371593688422328, "learning_rate": 3.295637500315992e-05, "loss": 0.7621, "step": 658 }, { "epoch": 3.008273894436519, "grad_norm": 0.44775433837572887, "learning_rate": 3.283082720412563e-05, "loss": 0.3662, "step": 659 }, { "epoch": 3.0128388017118404, "grad_norm": 0.3890902219171831, "learning_rate": 3.270535233337024e-05, "loss": 0.3743, "step": 660 }, { "epoch": 3.0174037089871613, "grad_norm": 0.3003030553223454, "learning_rate": 3.2579951667284596e-05, "loss": 0.3721, "step": 661 }, { "epoch": 3.021968616262482, "grad_norm": 0.3515656966921728, "learning_rate": 3.245462648150463e-05, "loss": 0.3684, "step": 662 }, { "epoch": 3.026533523537803, "grad_norm": 0.2934678186313399, "learning_rate": 3.232937805089854e-05, "loss": 0.3688, "step": 663 }, { "epoch": 3.031098430813124, "grad_norm": 0.2665389758970002, "learning_rate": 3.2204207649553665e-05, "loss": 0.3687, "step": 664 }, { "epoch": 3.0356633380884452, "grad_norm": 0.2886505257763211, "learning_rate": 3.2079116550763624e-05, "loss": 0.3715, "step": 665 }, { "epoch": 3.040228245363766, "grad_norm": 0.26410869047435404, "learning_rate": 3.195410602701535e-05, "loss": 0.3755, "step": 666 }, { "epoch": 3.044793152639087, "grad_norm": 0.33726378634419146, "learning_rate": 3.182917734997607e-05, "loss": 0.3715, "step": 667 }, { "epoch": 3.049358059914408, "grad_norm": 0.23119146133187443, "learning_rate": 3.170433179048048e-05, "loss": 0.3752, "step": 668 }, { "epoch": 3.0539229671897288, "grad_norm": 0.317803893708356, "learning_rate": 3.157957061851775e-05, "loss": 0.3668, "step": 669 }, { "epoch": 3.05848787446505, "grad_norm": 0.32056814826697005, "learning_rate": 3.1454895103218604e-05, "loss": 0.3684, "step": 670 }, { "epoch": 3.063052781740371, "grad_norm": 0.2227493640724446, "learning_rate": 3.133030651284246e-05, "loss": 0.3708, "step": 671 }, { "epoch": 3.067617689015692, "grad_norm": 0.3474416944410334, "learning_rate": 3.1205806114764455e-05, "loss": 0.37, "step": 672 }, { "epoch": 3.0721825962910128, "grad_norm": 0.23598864945070333, "learning_rate": 3.108139517546259e-05, "loss": 0.3679, "step": 673 }, { "epoch": 3.0767475035663336, "grad_norm": 0.26225028839730374, "learning_rate": 3.095707496050489e-05, "loss": 0.3711, "step": 674 }, { "epoch": 3.081312410841655, "grad_norm": 0.30541556508523, "learning_rate": 3.083284673453645e-05, "loss": 0.3705, "step": 675 }, { "epoch": 3.085877318116976, "grad_norm": 0.2375554525248402, "learning_rate": 3.070871176126664e-05, "loss": 0.3674, "step": 676 }, { "epoch": 3.0904422253922967, "grad_norm": 0.24938738146004832, "learning_rate": 3.058467130345619e-05, "loss": 0.3693, "step": 677 }, { "epoch": 3.0950071326676176, "grad_norm": 0.194861014641011, "learning_rate": 3.0460726622904362e-05, "loss": 0.3709, "step": 678 }, { "epoch": 3.0995720399429385, "grad_norm": 0.22427886537482727, "learning_rate": 3.033687898043619e-05, "loss": 0.3674, "step": 679 }, { "epoch": 3.10413694721826, "grad_norm": 0.1931234684152515, "learning_rate": 3.0213129635889527e-05, "loss": 0.3707, "step": 680 }, { "epoch": 3.1087018544935807, "grad_norm": 0.19750887437108353, "learning_rate": 3.0089479848102302e-05, "loss": 0.3726, "step": 681 }, { "epoch": 3.1132667617689016, "grad_norm": 0.22984346500906291, "learning_rate": 2.9965930874899734e-05, "loss": 0.3691, "step": 682 }, { "epoch": 3.1178316690442225, "grad_norm": 0.16695271136501227, "learning_rate": 2.984248397308149e-05, "loss": 0.3669, "step": 683 }, { "epoch": 3.1223965763195434, "grad_norm": 0.17893989774867847, "learning_rate": 2.971914039840888e-05, "loss": 0.369, "step": 684 }, { "epoch": 3.1269614835948643, "grad_norm": 0.17571871130738057, "learning_rate": 2.9595901405592215e-05, "loss": 0.3716, "step": 685 }, { "epoch": 3.1315263908701856, "grad_norm": 0.1594921785853266, "learning_rate": 2.947276824827784e-05, "loss": 0.3712, "step": 686 }, { "epoch": 3.1360912981455065, "grad_norm": 0.17980710605526548, "learning_rate": 2.9349742179035575e-05, "loss": 0.3656, "step": 687 }, { "epoch": 3.1406562054208274, "grad_norm": 0.18019296867903034, "learning_rate": 2.9226824449345854e-05, "loss": 0.3711, "step": 688 }, { "epoch": 3.1452211126961482, "grad_norm": 0.20009151820526971, "learning_rate": 2.910401630958699e-05, "loss": 0.3696, "step": 689 }, { "epoch": 3.1497860199714696, "grad_norm": 0.19469556991313045, "learning_rate": 2.898131900902259e-05, "loss": 0.3664, "step": 690 }, { "epoch": 3.1543509272467904, "grad_norm": 0.1658297168263364, "learning_rate": 2.8858733795788666e-05, "loss": 0.3698, "step": 691 }, { "epoch": 3.1589158345221113, "grad_norm": 0.1770728229288905, "learning_rate": 2.873626191688104e-05, "loss": 0.3707, "step": 692 }, { "epoch": 3.163480741797432, "grad_norm": 0.158629076656555, "learning_rate": 2.8613904618142698e-05, "loss": 0.369, "step": 693 }, { "epoch": 3.168045649072753, "grad_norm": 0.15694838864187477, "learning_rate": 2.8491663144250964e-05, "loss": 0.3714, "step": 694 }, { "epoch": 3.172610556348074, "grad_norm": 0.17453143005622082, "learning_rate": 2.836953873870505e-05, "loss": 0.3724, "step": 695 }, { "epoch": 3.1771754636233953, "grad_norm": 0.1700537102783032, "learning_rate": 2.824753264381319e-05, "loss": 0.3692, "step": 696 }, { "epoch": 3.181740370898716, "grad_norm": 0.16759539668413778, "learning_rate": 2.812564610068017e-05, "loss": 0.371, "step": 697 }, { "epoch": 3.186305278174037, "grad_norm": 0.16867962773238068, "learning_rate": 2.800388034919461e-05, "loss": 0.3662, "step": 698 }, { "epoch": 3.190870185449358, "grad_norm": 0.19731656299531658, "learning_rate": 2.788223662801639e-05, "loss": 0.3731, "step": 699 }, { "epoch": 3.195435092724679, "grad_norm": 0.14390885288139355, "learning_rate": 2.776071617456402e-05, "loss": 0.3685, "step": 700 }, { "epoch": 3.2, "grad_norm": 0.17429954735264463, "learning_rate": 2.7639320225002108e-05, "loss": 0.3692, "step": 701 }, { "epoch": 3.204564907275321, "grad_norm": 0.16193213783821153, "learning_rate": 2.7518050014228707e-05, "loss": 0.3694, "step": 702 }, { "epoch": 3.209129814550642, "grad_norm": 0.17943687683108553, "learning_rate": 2.739690677586284e-05, "loss": 0.3735, "step": 703 }, { "epoch": 3.213694721825963, "grad_norm": 0.16115261882412688, "learning_rate": 2.7275891742231847e-05, "loss": 0.3688, "step": 704 }, { "epoch": 3.2182596291012837, "grad_norm": 0.15989318235835606, "learning_rate": 2.7155006144358958e-05, "loss": 0.3669, "step": 705 }, { "epoch": 3.222824536376605, "grad_norm": 0.1833740301603784, "learning_rate": 2.70342512119507e-05, "loss": 0.3681, "step": 706 }, { "epoch": 3.227389443651926, "grad_norm": 0.1701603517000583, "learning_rate": 2.691362817338442e-05, "loss": 0.3732, "step": 707 }, { "epoch": 3.231954350927247, "grad_norm": 0.17981663756006744, "learning_rate": 2.6793138255695743e-05, "loss": 0.3669, "step": 708 }, { "epoch": 3.2365192582025677, "grad_norm": 0.1611749828625592, "learning_rate": 2.6672782684566167e-05, "loss": 0.368, "step": 709 }, { "epoch": 3.2410841654778886, "grad_norm": 0.1704405344311701, "learning_rate": 2.6552562684310532e-05, "loss": 0.3746, "step": 710 }, { "epoch": 3.24564907275321, "grad_norm": 0.15452048356052153, "learning_rate": 2.6432479477864588e-05, "loss": 0.3668, "step": 711 }, { "epoch": 3.250213980028531, "grad_norm": 0.1636048167954058, "learning_rate": 2.6312534286772558e-05, "loss": 0.3665, "step": 712 }, { "epoch": 3.2547788873038517, "grad_norm": 0.15805813062246202, "learning_rate": 2.619272833117468e-05, "loss": 0.3671, "step": 713 }, { "epoch": 3.2593437945791726, "grad_norm": 0.15722251462834871, "learning_rate": 2.6073062829794863e-05, "loss": 0.3702, "step": 714 }, { "epoch": 3.2639087018544934, "grad_norm": 0.1515655825894355, "learning_rate": 2.5953538999928217e-05, "loss": 0.3698, "step": 715 }, { "epoch": 3.2684736091298143, "grad_norm": 0.15294949505967717, "learning_rate": 2.5834158057428704e-05, "loss": 0.3659, "step": 716 }, { "epoch": 3.2730385164051357, "grad_norm": 0.17430429089098687, "learning_rate": 2.5714921216696806e-05, "loss": 0.3658, "step": 717 }, { "epoch": 3.2776034236804565, "grad_norm": 0.15283446685063015, "learning_rate": 2.559582969066706e-05, "loss": 0.3655, "step": 718 }, { "epoch": 3.2821683309557774, "grad_norm": 0.15900008730134152, "learning_rate": 2.5476884690795853e-05, "loss": 0.3686, "step": 719 }, { "epoch": 3.2867332382310983, "grad_norm": 0.17376312137687447, "learning_rate": 2.5358087427049016e-05, "loss": 0.3679, "step": 720 }, { "epoch": 3.2912981455064196, "grad_norm": 0.16367071666419647, "learning_rate": 2.523943910788953e-05, "loss": 0.3652, "step": 721 }, { "epoch": 3.2958630527817405, "grad_norm": 0.1493453159768574, "learning_rate": 2.5120940940265276e-05, "loss": 0.3712, "step": 722 }, { "epoch": 3.3004279600570614, "grad_norm": 0.1600651037670884, "learning_rate": 2.500259412959665e-05, "loss": 0.368, "step": 723 }, { "epoch": 3.3049928673323823, "grad_norm": 0.15510229863064848, "learning_rate": 2.4884399879764437e-05, "loss": 0.3714, "step": 724 }, { "epoch": 3.309557774607703, "grad_norm": 0.1371776463153879, "learning_rate": 2.4766359393097476e-05, "loss": 0.3723, "step": 725 }, { "epoch": 3.314122681883024, "grad_norm": 0.17941656580285384, "learning_rate": 2.464847387036045e-05, "loss": 0.3702, "step": 726 }, { "epoch": 3.3186875891583454, "grad_norm": 0.13405327935015152, "learning_rate": 2.4530744510741703e-05, "loss": 0.3715, "step": 727 }, { "epoch": 3.3232524964336663, "grad_norm": 0.16167817588999067, "learning_rate": 2.4413172511840958e-05, "loss": 0.3693, "step": 728 }, { "epoch": 3.327817403708987, "grad_norm": 0.1509260817331625, "learning_rate": 2.429575906965722e-05, "loss": 0.3599, "step": 729 }, { "epoch": 3.332382310984308, "grad_norm": 0.144720817062596, "learning_rate": 2.4178505378576605e-05, "loss": 0.3708, "step": 730 }, { "epoch": 3.336947218259629, "grad_norm": 0.1424755317935734, "learning_rate": 2.4061412631360116e-05, "loss": 0.3647, "step": 731 }, { "epoch": 3.3415121255349503, "grad_norm": 0.1582677626986222, "learning_rate": 2.394448201913158e-05, "loss": 0.3656, "step": 732 }, { "epoch": 3.346077032810271, "grad_norm": 0.13727256015572376, "learning_rate": 2.3827714731365513e-05, "loss": 0.3669, "step": 733 }, { "epoch": 3.350641940085592, "grad_norm": 0.1482716204121623, "learning_rate": 2.3711111955875018e-05, "loss": 0.3661, "step": 734 }, { "epoch": 3.355206847360913, "grad_norm": 0.1429391445826853, "learning_rate": 2.3594674878799656e-05, "loss": 0.3702, "step": 735 }, { "epoch": 3.359771754636234, "grad_norm": 0.15144931311040605, "learning_rate": 2.347840468459351e-05, "loss": 0.3722, "step": 736 }, { "epoch": 3.364336661911555, "grad_norm": 0.1584841215631846, "learning_rate": 2.336230255601296e-05, "loss": 0.3651, "step": 737 }, { "epoch": 3.368901569186876, "grad_norm": 0.15860593207749524, "learning_rate": 2.324636967410481e-05, "loss": 0.3706, "step": 738 }, { "epoch": 3.373466476462197, "grad_norm": 0.16524287919377015, "learning_rate": 2.3130607218194153e-05, "loss": 0.3667, "step": 739 }, { "epoch": 3.3780313837375178, "grad_norm": 0.1553010070756002, "learning_rate": 2.3015016365872462e-05, "loss": 0.3694, "step": 740 }, { "epoch": 3.3825962910128387, "grad_norm": 0.19177517583446688, "learning_rate": 2.289959829298558e-05, "loss": 0.3684, "step": 741 }, { "epoch": 3.38716119828816, "grad_norm": 0.13228111850501834, "learning_rate": 2.2784354173621726e-05, "loss": 0.3695, "step": 742 }, { "epoch": 3.391726105563481, "grad_norm": 0.18742867451573136, "learning_rate": 2.266928518009957e-05, "loss": 0.3663, "step": 743 }, { "epoch": 3.3962910128388017, "grad_norm": 0.14121057196985098, "learning_rate": 2.2554392482956357e-05, "loss": 0.362, "step": 744 }, { "epoch": 3.4008559201141226, "grad_norm": 0.16109339289265964, "learning_rate": 2.243967725093595e-05, "loss": 0.3644, "step": 745 }, { "epoch": 3.4054208273894435, "grad_norm": 0.13332750913355695, "learning_rate": 2.2325140650976957e-05, "loss": 0.3696, "step": 746 }, { "epoch": 3.4099857346647644, "grad_norm": 0.14371453147838442, "learning_rate": 2.221078384820082e-05, "loss": 0.3672, "step": 747 }, { "epoch": 3.4145506419400857, "grad_norm": 0.12959264833893533, "learning_rate": 2.209660800590002e-05, "loss": 0.369, "step": 748 }, { "epoch": 3.4191155492154066, "grad_norm": 0.19007915393126823, "learning_rate": 2.1982614285526214e-05, "loss": 0.3695, "step": 749 }, { "epoch": 3.4236804564907275, "grad_norm": 0.1374069769687416, "learning_rate": 2.1868803846678475e-05, "loss": 0.364, "step": 750 }, { "epoch": 3.4282453637660484, "grad_norm": 0.16184614734626082, "learning_rate": 2.1755177847091357e-05, "loss": 0.3701, "step": 751 }, { "epoch": 3.4328102710413697, "grad_norm": 0.12874829449890968, "learning_rate": 2.1641737442623295e-05, "loss": 0.3702, "step": 752 }, { "epoch": 3.4373751783166906, "grad_norm": 0.15265797312556922, "learning_rate": 2.1528483787244695e-05, "loss": 0.3631, "step": 753 }, { "epoch": 3.4419400855920115, "grad_norm": 0.13901496827521181, "learning_rate": 2.1415418033026303e-05, "loss": 0.3698, "step": 754 }, { "epoch": 3.4465049928673324, "grad_norm": 0.14408559598989043, "learning_rate": 2.1302541330127456e-05, "loss": 0.3722, "step": 755 }, { "epoch": 3.4510699001426532, "grad_norm": 0.1553274863431962, "learning_rate": 2.1189854826784306e-05, "loss": 0.373, "step": 756 }, { "epoch": 3.455634807417974, "grad_norm": 0.13896160154548598, "learning_rate": 2.107735966929828e-05, "loss": 0.3674, "step": 757 }, { "epoch": 3.4601997146932955, "grad_norm": 0.14190658414204307, "learning_rate": 2.096505700202427e-05, "loss": 0.3713, "step": 758 }, { "epoch": 3.4647646219686163, "grad_norm": 0.13297816575448404, "learning_rate": 2.0852947967359124e-05, "loss": 0.3659, "step": 759 }, { "epoch": 3.4693295292439372, "grad_norm": 0.13293797385160067, "learning_rate": 2.0741033705729946e-05, "loss": 0.3688, "step": 760 }, { "epoch": 3.473894436519258, "grad_norm": 0.13476039599236855, "learning_rate": 2.0629315355582493e-05, "loss": 0.3734, "step": 761 }, { "epoch": 3.478459343794579, "grad_norm": 0.12322657009585795, "learning_rate": 2.0517794053369668e-05, "loss": 0.3674, "step": 762 }, { "epoch": 3.4830242510699003, "grad_norm": 0.11867881868833804, "learning_rate": 2.040647093353983e-05, "loss": 0.3656, "step": 763 }, { "epoch": 3.487589158345221, "grad_norm": 0.11899054622753805, "learning_rate": 2.02953471285254e-05, "loss": 0.3636, "step": 764 }, { "epoch": 3.492154065620542, "grad_norm": 0.12418349656227172, "learning_rate": 2.018442376873126e-05, "loss": 0.3717, "step": 765 }, { "epoch": 3.496718972895863, "grad_norm": 0.11310966967990665, "learning_rate": 2.007370198252324e-05, "loss": 0.3611, "step": 766 }, { "epoch": 3.501283880171184, "grad_norm": 0.12055741426438335, "learning_rate": 1.9963182896216667e-05, "loss": 0.3674, "step": 767 }, { "epoch": 3.5058487874465047, "grad_norm": 0.12618096595715805, "learning_rate": 1.985286763406494e-05, "loss": 0.3725, "step": 768 }, { "epoch": 3.510413694721826, "grad_norm": 0.11538183128314908, "learning_rate": 1.974275731824804e-05, "loss": 0.3703, "step": 769 }, { "epoch": 3.514978601997147, "grad_norm": 0.12599278064501612, "learning_rate": 1.9632853068861147e-05, "loss": 0.3628, "step": 770 }, { "epoch": 3.519543509272468, "grad_norm": 0.1139458312287587, "learning_rate": 1.9523156003903215e-05, "loss": 0.3664, "step": 771 }, { "epoch": 3.5241084165477887, "grad_norm": 0.14077407102475697, "learning_rate": 1.9413667239265615e-05, "loss": 0.3652, "step": 772 }, { "epoch": 3.52867332382311, "grad_norm": 0.13032016684712433, "learning_rate": 1.9304387888720804e-05, "loss": 0.3685, "step": 773 }, { "epoch": 3.533238231098431, "grad_norm": 0.14031858267414743, "learning_rate": 1.919531906391099e-05, "loss": 0.3733, "step": 774 }, { "epoch": 3.537803138373752, "grad_norm": 0.1486352640869334, "learning_rate": 1.9086461874336777e-05, "loss": 0.3713, "step": 775 }, { "epoch": 3.5423680456490727, "grad_norm": 0.1357215608169487, "learning_rate": 1.8977817427345946e-05, "loss": 0.3617, "step": 776 }, { "epoch": 3.5469329529243936, "grad_norm": 0.13543444529267776, "learning_rate": 1.8869386828122125e-05, "loss": 0.3723, "step": 777 }, { "epoch": 3.5514978601997145, "grad_norm": 0.14518880548921387, "learning_rate": 1.8761171179673604e-05, "loss": 0.3646, "step": 778 }, { "epoch": 3.556062767475036, "grad_norm": 0.1530615652429703, "learning_rate": 1.8653171582822104e-05, "loss": 0.3676, "step": 779 }, { "epoch": 3.5606276747503567, "grad_norm": 0.14320914259229844, "learning_rate": 1.854538913619151e-05, "loss": 0.3708, "step": 780 }, { "epoch": 3.5651925820256776, "grad_norm": 0.1371728530944999, "learning_rate": 1.8437824936196823e-05, "loss": 0.3749, "step": 781 }, { "epoch": 3.5697574893009985, "grad_norm": 0.15231861361219254, "learning_rate": 1.8330480077032858e-05, "loss": 0.3672, "step": 782 }, { "epoch": 3.57432239657632, "grad_norm": 0.11059078549485153, "learning_rate": 1.822335565066325e-05, "loss": 0.3677, "step": 783 }, { "epoch": 3.5788873038516407, "grad_norm": 0.1576919764066523, "learning_rate": 1.8116452746809275e-05, "loss": 0.367, "step": 784 }, { "epoch": 3.5834522111269616, "grad_norm": 0.12879507206191718, "learning_rate": 1.800977245293875e-05, "loss": 0.368, "step": 785 }, { "epoch": 3.5880171184022824, "grad_norm": 0.12629882573611204, "learning_rate": 1.7903315854254994e-05, "loss": 0.3654, "step": 786 }, { "epoch": 3.5925820256776033, "grad_norm": 0.13427520014143732, "learning_rate": 1.779708403368582e-05, "loss": 0.3713, "step": 787 }, { "epoch": 3.597146932952924, "grad_norm": 0.1176497941246944, "learning_rate": 1.7691078071872477e-05, "loss": 0.3711, "step": 788 }, { "epoch": 3.601711840228245, "grad_norm": 0.13036521061876197, "learning_rate": 1.7585299047158688e-05, "loss": 0.3703, "step": 789 }, { "epoch": 3.6062767475035664, "grad_norm": 0.12201908677886805, "learning_rate": 1.7479748035579625e-05, "loss": 0.3664, "step": 790 }, { "epoch": 3.6108416547788873, "grad_norm": 0.11159322212904035, "learning_rate": 1.7374426110851e-05, "loss": 0.3726, "step": 791 }, { "epoch": 3.615406562054208, "grad_norm": 0.12179562963863867, "learning_rate": 1.726933434435819e-05, "loss": 0.3598, "step": 792 }, { "epoch": 3.6199714693295295, "grad_norm": 0.12279846137825234, "learning_rate": 1.716447380514526e-05, "loss": 0.3643, "step": 793 }, { "epoch": 3.6245363766048504, "grad_norm": 0.12287836059833575, "learning_rate": 1.7059845559904115e-05, "loss": 0.3702, "step": 794 }, { "epoch": 3.6291012838801713, "grad_norm": 0.11039673465914712, "learning_rate": 1.695545067296368e-05, "loss": 0.3675, "step": 795 }, { "epoch": 3.633666191155492, "grad_norm": 0.12022240785762481, "learning_rate": 1.6851290206279e-05, "loss": 0.3695, "step": 796 }, { "epoch": 3.638231098430813, "grad_norm": 0.11435555984531806, "learning_rate": 1.674736521942053e-05, "loss": 0.365, "step": 797 }, { "epoch": 3.642796005706134, "grad_norm": 0.131712471568275, "learning_rate": 1.664367676956331e-05, "loss": 0.3713, "step": 798 }, { "epoch": 3.647360912981455, "grad_norm": 0.11477057659282204, "learning_rate": 1.6540225911476172e-05, "loss": 0.3648, "step": 799 }, { "epoch": 3.651925820256776, "grad_norm": 0.11030623092197417, "learning_rate": 1.643701369751109e-05, "loss": 0.3646, "step": 800 }, { "epoch": 3.656490727532097, "grad_norm": 0.13051313133439071, "learning_rate": 1.6334041177592403e-05, "loss": 0.3711, "step": 801 }, { "epoch": 3.661055634807418, "grad_norm": 0.12099586891245266, "learning_rate": 1.623130939920619e-05, "loss": 0.3665, "step": 802 }, { "epoch": 3.665620542082739, "grad_norm": 0.12353194281368282, "learning_rate": 1.6128819407389606e-05, "loss": 0.3698, "step": 803 }, { "epoch": 3.67018544935806, "grad_norm": 0.12746501918260686, "learning_rate": 1.602657224472018e-05, "loss": 0.3651, "step": 804 }, { "epoch": 3.674750356633381, "grad_norm": 0.10793911581481992, "learning_rate": 1.5924568951305328e-05, "loss": 0.3651, "step": 805 }, { "epoch": 3.679315263908702, "grad_norm": 0.13767950934095166, "learning_rate": 1.5822810564771663e-05, "loss": 0.3681, "step": 806 }, { "epoch": 3.683880171184023, "grad_norm": 0.11149734780728035, "learning_rate": 1.5721298120254514e-05, "loss": 0.372, "step": 807 }, { "epoch": 3.6884450784593437, "grad_norm": 0.1187146085257134, "learning_rate": 1.562003265038738e-05, "loss": 0.3698, "step": 808 }, { "epoch": 3.6930099857346645, "grad_norm": 0.12127988390555491, "learning_rate": 1.551901518529138e-05, "loss": 0.3692, "step": 809 }, { "epoch": 3.697574893009986, "grad_norm": 0.11702704275441794, "learning_rate": 1.541824675256482e-05, "loss": 0.3661, "step": 810 }, { "epoch": 3.7021398002853068, "grad_norm": 0.11306685620844899, "learning_rate": 1.531772837727274e-05, "loss": 0.3695, "step": 811 }, { "epoch": 3.7067047075606276, "grad_norm": 0.11312484841891161, "learning_rate": 1.5217461081936478e-05, "loss": 0.3671, "step": 812 }, { "epoch": 3.7112696148359485, "grad_norm": 0.11428174495717341, "learning_rate": 1.5117445886523272e-05, "loss": 0.3674, "step": 813 }, { "epoch": 3.71583452211127, "grad_norm": 0.10987603615873456, "learning_rate": 1.501768380843585e-05, "loss": 0.3673, "step": 814 }, { "epoch": 3.7203994293865907, "grad_norm": 0.11178040039287562, "learning_rate": 1.4918175862502104e-05, "loss": 0.3719, "step": 815 }, { "epoch": 3.7249643366619116, "grad_norm": 0.10859792201293215, "learning_rate": 1.4818923060964814e-05, "loss": 0.3615, "step": 816 }, { "epoch": 3.7295292439372325, "grad_norm": 0.13172517052119273, "learning_rate": 1.471992641347129e-05, "loss": 0.3694, "step": 817 }, { "epoch": 3.7340941512125534, "grad_norm": 0.12757152194459273, "learning_rate": 1.4621186927063095e-05, "loss": 0.3649, "step": 818 }, { "epoch": 3.7386590584878743, "grad_norm": 0.1154895484553408, "learning_rate": 1.4522705606165865e-05, "loss": 0.3694, "step": 819 }, { "epoch": 3.743223965763195, "grad_norm": 0.11273080460771058, "learning_rate": 1.4424483452579012e-05, "loss": 0.3625, "step": 820 }, { "epoch": 3.7477888730385165, "grad_norm": 0.13138142426919347, "learning_rate": 1.4326521465465604e-05, "loss": 0.366, "step": 821 }, { "epoch": 3.7523537803138374, "grad_norm": 0.11883743670221694, "learning_rate": 1.4228820641342172e-05, "loss": 0.3682, "step": 822 }, { "epoch": 3.7569186875891583, "grad_norm": 0.10968178906111543, "learning_rate": 1.4131381974068533e-05, "loss": 0.3655, "step": 823 }, { "epoch": 3.7614835948644796, "grad_norm": 0.1326313322060334, "learning_rate": 1.4034206454837768e-05, "loss": 0.3715, "step": 824 }, { "epoch": 3.7660485021398005, "grad_norm": 0.12319894462182598, "learning_rate": 1.3937295072166061e-05, "loss": 0.3739, "step": 825 }, { "epoch": 3.7706134094151214, "grad_norm": 0.11752644369102541, "learning_rate": 1.3840648811882646e-05, "loss": 0.3641, "step": 826 }, { "epoch": 3.7751783166904422, "grad_norm": 0.10779720073463613, "learning_rate": 1.3744268657119886e-05, "loss": 0.3672, "step": 827 }, { "epoch": 3.779743223965763, "grad_norm": 0.12396985602516226, "learning_rate": 1.3648155588303097e-05, "loss": 0.3682, "step": 828 }, { "epoch": 3.784308131241084, "grad_norm": 0.13035541174831453, "learning_rate": 1.3552310583140744e-05, "loss": 0.3695, "step": 829 }, { "epoch": 3.788873038516405, "grad_norm": 0.11977978636195129, "learning_rate": 1.3456734616614369e-05, "loss": 0.3693, "step": 830 }, { "epoch": 3.793437945791726, "grad_norm": 0.10612949145458676, "learning_rate": 1.3361428660968713e-05, "loss": 0.3654, "step": 831 }, { "epoch": 3.798002853067047, "grad_norm": 0.12185371327777074, "learning_rate": 1.3266393685701919e-05, "loss": 0.3644, "step": 832 }, { "epoch": 3.802567760342368, "grad_norm": 0.10533358955019909, "learning_rate": 1.3171630657555499e-05, "loss": 0.3688, "step": 833 }, { "epoch": 3.807132667617689, "grad_norm": 0.1124577538423527, "learning_rate": 1.3077140540504614e-05, "loss": 0.3638, "step": 834 }, { "epoch": 3.81169757489301, "grad_norm": 0.11642170801308592, "learning_rate": 1.2982924295748274e-05, "loss": 0.3684, "step": 835 }, { "epoch": 3.816262482168331, "grad_norm": 0.10695378952599915, "learning_rate": 1.2888982881699472e-05, "loss": 0.3691, "step": 836 }, { "epoch": 3.820827389443652, "grad_norm": 0.13400187716659637, "learning_rate": 1.2795317253975537e-05, "loss": 0.3691, "step": 837 }, { "epoch": 3.825392296718973, "grad_norm": 0.09951593298438159, "learning_rate": 1.270192836538836e-05, "loss": 0.3697, "step": 838 }, { "epoch": 3.8299572039942937, "grad_norm": 0.10935426565797807, "learning_rate": 1.2608817165934681e-05, "loss": 0.3674, "step": 839 }, { "epoch": 3.8345221112696146, "grad_norm": 0.11667620915560488, "learning_rate": 1.2515984602786487e-05, "loss": 0.3662, "step": 840 }, { "epoch": 3.839087018544936, "grad_norm": 0.10968676357221958, "learning_rate": 1.2423431620281306e-05, "loss": 0.3618, "step": 841 }, { "epoch": 3.843651925820257, "grad_norm": 0.12006579571295849, "learning_rate": 1.2331159159912667e-05, "loss": 0.3703, "step": 842 }, { "epoch": 3.8482168330955777, "grad_norm": 0.11231326039006076, "learning_rate": 1.22391681603205e-05, "loss": 0.3674, "step": 843 }, { "epoch": 3.8527817403708986, "grad_norm": 0.11239420460642152, "learning_rate": 1.2147459557281543e-05, "loss": 0.3731, "step": 844 }, { "epoch": 3.85734664764622, "grad_norm": 0.10105192704809327, "learning_rate": 1.2056034283699866e-05, "loss": 0.3634, "step": 845 }, { "epoch": 3.861911554921541, "grad_norm": 0.1324600771108394, "learning_rate": 1.1964893269597408e-05, "loss": 0.3698, "step": 846 }, { "epoch": 3.8664764621968617, "grad_norm": 0.10492225515470834, "learning_rate": 1.1874037442104464e-05, "loss": 0.3656, "step": 847 }, { "epoch": 3.8710413694721826, "grad_norm": 0.1049947478937808, "learning_rate": 1.1783467725450288e-05, "loss": 0.3673, "step": 848 }, { "epoch": 3.8756062767475035, "grad_norm": 0.12343105623506188, "learning_rate": 1.1693185040953647e-05, "loss": 0.3692, "step": 849 }, { "epoch": 3.8801711840228243, "grad_norm": 0.10907582190242261, "learning_rate": 1.1603190307013485e-05, "loss": 0.3689, "step": 850 }, { "epoch": 3.8847360912981452, "grad_norm": 0.1048746000015905, "learning_rate": 1.1513484439099592e-05, "loss": 0.3688, "step": 851 }, { "epoch": 3.8893009985734666, "grad_norm": 0.110532809391589, "learning_rate": 1.1424068349743282e-05, "loss": 0.3707, "step": 852 }, { "epoch": 3.8938659058487874, "grad_norm": 0.1082453458710121, "learning_rate": 1.133494294852806e-05, "loss": 0.3684, "step": 853 }, { "epoch": 3.8984308131241083, "grad_norm": 0.10345660407308066, "learning_rate": 1.1246109142080463e-05, "loss": 0.3678, "step": 854 }, { "epoch": 3.9029957203994297, "grad_norm": 0.10898385130369857, "learning_rate": 1.1157567834060732e-05, "loss": 0.3703, "step": 855 }, { "epoch": 3.9075606276747505, "grad_norm": 0.10868802232496799, "learning_rate": 1.1069319925153716e-05, "loss": 0.3672, "step": 856 }, { "epoch": 3.9121255349500714, "grad_norm": 0.1001172075462793, "learning_rate": 1.098136631305966e-05, "loss": 0.3647, "step": 857 }, { "epoch": 3.9166904422253923, "grad_norm": 0.10002086132257204, "learning_rate": 1.0893707892485046e-05, "loss": 0.3687, "step": 858 }, { "epoch": 3.921255349500713, "grad_norm": 0.11817525489059527, "learning_rate": 1.080634555513358e-05, "loss": 0.3681, "step": 859 }, { "epoch": 3.925820256776034, "grad_norm": 0.10005386509374838, "learning_rate": 1.0719280189697012e-05, "loss": 0.371, "step": 860 }, { "epoch": 3.930385164051355, "grad_norm": 0.09735019950539485, "learning_rate": 1.0632512681846188e-05, "loss": 0.3647, "step": 861 }, { "epoch": 3.9349500713266763, "grad_norm": 0.1096708423704249, "learning_rate": 1.0546043914222004e-05, "loss": 0.3689, "step": 862 }, { "epoch": 3.939514978601997, "grad_norm": 0.10947940483538458, "learning_rate": 1.045987476642639e-05, "loss": 0.3653, "step": 863 }, { "epoch": 3.944079885877318, "grad_norm": 0.10334503635714308, "learning_rate": 1.0374006115013446e-05, "loss": 0.3705, "step": 864 }, { "epoch": 3.948644793152639, "grad_norm": 0.09613211541479365, "learning_rate": 1.0288438833480434e-05, "loss": 0.3653, "step": 865 }, { "epoch": 3.9532097004279603, "grad_norm": 0.11391043223040556, "learning_rate": 1.0203173792258964e-05, "loss": 0.3709, "step": 866 }, { "epoch": 3.957774607703281, "grad_norm": 0.1112000931274841, "learning_rate": 1.0118211858706126e-05, "loss": 0.3695, "step": 867 }, { "epoch": 3.962339514978602, "grad_norm": 0.09623531613971865, "learning_rate": 1.0033553897095611e-05, "loss": 0.3673, "step": 868 }, { "epoch": 3.966904422253923, "grad_norm": 0.12054451116095541, "learning_rate": 9.949200768608978e-06, "loss": 0.3662, "step": 869 }, { "epoch": 3.971469329529244, "grad_norm": 0.10469643561676546, "learning_rate": 9.865153331326888e-06, "loss": 0.3722, "step": 870 }, { "epoch": 3.9760342368045647, "grad_norm": 0.10369572994121254, "learning_rate": 9.781412440220364e-06, "loss": 0.3658, "step": 871 }, { "epoch": 3.980599144079886, "grad_norm": 0.11265124183687726, "learning_rate": 9.697978947142083e-06, "loss": 0.366, "step": 872 }, { "epoch": 3.985164051355207, "grad_norm": 0.09909112429172705, "learning_rate": 9.61485370081773e-06, "loss": 0.3655, "step": 873 }, { "epoch": 3.989728958630528, "grad_norm": 0.10458100196949884, "learning_rate": 9.532037546837328e-06, "loss": 0.3669, "step": 874 }, { "epoch": 3.9942938659058487, "grad_norm": 0.10060786891352515, "learning_rate": 9.4495313276467e-06, "loss": 0.3684, "step": 875 }, { "epoch": 3.99885877318117, "grad_norm": 0.11243962749211547, "learning_rate": 9.367335882538859e-06, "loss": 0.3651, "step": 876 }, { "epoch": 4.003423680456491, "grad_norm": 0.37014150627969, "learning_rate": 9.285452047645447e-06, "loss": 0.6984, "step": 877 }, { "epoch": 4.007988587731812, "grad_norm": 0.15671543024887574, "learning_rate": 9.20388065592829e-06, "loss": 0.3516, "step": 878 }, { "epoch": 4.012553495007133, "grad_norm": 0.16714819900057928, "learning_rate": 9.122622537170858e-06, "loss": 0.3535, "step": 879 }, { "epoch": 4.0171184022824535, "grad_norm": 0.15835660325889345, "learning_rate": 9.041678517969878e-06, "loss": 0.3516, "step": 880 }, { "epoch": 4.021683309557774, "grad_norm": 0.1420672767585784, "learning_rate": 8.961049421726927e-06, "loss": 0.3484, "step": 881 }, { "epoch": 4.026248216833095, "grad_norm": 0.13783878715714176, "learning_rate": 8.880736068639972e-06, "loss": 0.3476, "step": 882 }, { "epoch": 4.030813124108416, "grad_norm": 0.13731172171411218, "learning_rate": 8.800739275695162e-06, "loss": 0.3559, "step": 883 }, { "epoch": 4.035378031383738, "grad_norm": 0.14178993491605563, "learning_rate": 8.721059856658374e-06, "loss": 0.3505, "step": 884 }, { "epoch": 4.039942938659059, "grad_norm": 0.12951830366700642, "learning_rate": 8.641698622067056e-06, "loss": 0.3483, "step": 885 }, { "epoch": 4.04450784593438, "grad_norm": 0.13767174076249047, "learning_rate": 8.56265637922192e-06, "loss": 0.3516, "step": 886 }, { "epoch": 4.049072753209701, "grad_norm": 0.13591547948547417, "learning_rate": 8.483933932178714e-06, "loss": 0.3569, "step": 887 }, { "epoch": 4.0536376604850215, "grad_norm": 0.12191863460297671, "learning_rate": 8.405532081740104e-06, "loss": 0.3479, "step": 888 }, { "epoch": 4.058202567760342, "grad_norm": 0.1248887971732098, "learning_rate": 8.327451625447462e-06, "loss": 0.3494, "step": 889 }, { "epoch": 4.062767475035663, "grad_norm": 0.13804862882003327, "learning_rate": 8.24969335757281e-06, "loss": 0.3508, "step": 890 }, { "epoch": 4.067332382310984, "grad_norm": 0.11333955902964997, "learning_rate": 8.17225806911071e-06, "loss": 0.3515, "step": 891 }, { "epoch": 4.071897289586305, "grad_norm": 0.11550369078052372, "learning_rate": 8.095146547770202e-06, "loss": 0.3523, "step": 892 }, { "epoch": 4.076462196861626, "grad_norm": 0.12066342905233718, "learning_rate": 8.018359577966822e-06, "loss": 0.351, "step": 893 }, { "epoch": 4.081027104136947, "grad_norm": 0.12020962194954402, "learning_rate": 7.941897940814613e-06, "loss": 0.3511, "step": 894 }, { "epoch": 4.085592011412269, "grad_norm": 0.10735766251661075, "learning_rate": 7.865762414118197e-06, "loss": 0.3515, "step": 895 }, { "epoch": 4.0901569186875895, "grad_norm": 0.10152101132548039, "learning_rate": 7.7899537723648e-06, "loss": 0.3494, "step": 896 }, { "epoch": 4.09472182596291, "grad_norm": 0.11175247171368977, "learning_rate": 7.71447278671646e-06, "loss": 0.3434, "step": 897 }, { "epoch": 4.099286733238231, "grad_norm": 0.11008323120104475, "learning_rate": 7.639320225002106e-06, "loss": 0.3562, "step": 898 }, { "epoch": 4.103851640513552, "grad_norm": 0.10079393628102, "learning_rate": 7.564496851709799e-06, "loss": 0.3462, "step": 899 }, { "epoch": 4.108416547788873, "grad_norm": 0.10224032651804278, "learning_rate": 7.490003427978947e-06, "loss": 0.352, "step": 900 }, { "epoch": 4.112981455064194, "grad_norm": 0.10528872332896734, "learning_rate": 7.415840711592515e-06, "loss": 0.3583, "step": 901 }, { "epoch": 4.117546362339515, "grad_norm": 0.09810153639928686, "learning_rate": 7.342009456969394e-06, "loss": 0.3507, "step": 902 }, { "epoch": 4.122111269614836, "grad_norm": 0.09377946822299223, "learning_rate": 7.26851041515666e-06, "loss": 0.3478, "step": 903 }, { "epoch": 4.1266761768901565, "grad_norm": 0.09941074294315165, "learning_rate": 7.1953443338219635e-06, "loss": 0.3474, "step": 904 }, { "epoch": 4.131241084165478, "grad_norm": 0.09923689941405467, "learning_rate": 7.12251195724595e-06, "loss": 0.3497, "step": 905 }, { "epoch": 4.135805991440799, "grad_norm": 0.09632085335191928, "learning_rate": 7.0500140263146085e-06, "loss": 0.3502, "step": 906 }, { "epoch": 4.14037089871612, "grad_norm": 0.09318553780432773, "learning_rate": 6.977851278511831e-06, "loss": 0.3519, "step": 907 }, { "epoch": 4.144935805991441, "grad_norm": 0.1055662018772662, "learning_rate": 6.9060244479118325e-06, "loss": 0.3447, "step": 908 }, { "epoch": 4.149500713266762, "grad_norm": 0.09794765647352846, "learning_rate": 6.8345342651717415e-06, "loss": 0.3521, "step": 909 }, { "epoch": 4.154065620542083, "grad_norm": 0.10262064175384165, "learning_rate": 6.763381457524137e-06, "loss": 0.3467, "step": 910 }, { "epoch": 4.158630527817404, "grad_norm": 0.0924225272063823, "learning_rate": 6.692566748769645e-06, "loss": 0.348, "step": 911 }, { "epoch": 4.1631954350927245, "grad_norm": 0.09396615551983964, "learning_rate": 6.622090859269579e-06, "loss": 0.3485, "step": 912 }, { "epoch": 4.167760342368045, "grad_norm": 0.1085909553244327, "learning_rate": 6.5519545059386495e-06, "loss": 0.3503, "step": 913 }, { "epoch": 4.172325249643366, "grad_norm": 0.10045646940751854, "learning_rate": 6.482158402237622e-06, "loss": 0.3515, "step": 914 }, { "epoch": 4.176890156918688, "grad_norm": 0.10030605637198417, "learning_rate": 6.412703258166089e-06, "loss": 0.3513, "step": 915 }, { "epoch": 4.181455064194009, "grad_norm": 0.08816024777952751, "learning_rate": 6.343589780255226e-06, "loss": 0.3463, "step": 916 }, { "epoch": 4.18601997146933, "grad_norm": 0.10373572668229089, "learning_rate": 6.274818671560612e-06, "loss": 0.3521, "step": 917 }, { "epoch": 4.190584878744651, "grad_norm": 0.09661236890820955, "learning_rate": 6.2063906316550944e-06, "loss": 0.3535, "step": 918 }, { "epoch": 4.195149786019972, "grad_norm": 0.09307921133377361, "learning_rate": 6.138306356621666e-06, "loss": 0.352, "step": 919 }, { "epoch": 4.1997146932952925, "grad_norm": 0.09643011644388448, "learning_rate": 6.0705665390463545e-06, "loss": 0.3495, "step": 920 }, { "epoch": 4.204279600570613, "grad_norm": 0.09380570498729787, "learning_rate": 6.003171868011226e-06, "loss": 0.351, "step": 921 }, { "epoch": 4.208844507845934, "grad_norm": 0.09110098326246356, "learning_rate": 5.9361230290873175e-06, "loss": 0.3501, "step": 922 }, { "epoch": 4.213409415121255, "grad_norm": 0.10109563700718154, "learning_rate": 5.869420704327722e-06, "loss": 0.3523, "step": 923 }, { "epoch": 4.217974322396576, "grad_norm": 0.08878313714056194, "learning_rate": 5.803065572260633e-06, "loss": 0.3482, "step": 924 }, { "epoch": 4.222539229671897, "grad_norm": 0.08818694187121523, "learning_rate": 5.737058307882391e-06, "loss": 0.3528, "step": 925 }, { "epoch": 4.227104136947219, "grad_norm": 0.09295440449783292, "learning_rate": 5.671399582650705e-06, "loss": 0.3461, "step": 926 }, { "epoch": 4.2316690442225395, "grad_norm": 0.09685863269312232, "learning_rate": 5.606090064477738e-06, "loss": 0.3503, "step": 927 }, { "epoch": 4.23623395149786, "grad_norm": 0.09009726502122058, "learning_rate": 5.541130417723359e-06, "loss": 0.3439, "step": 928 }, { "epoch": 4.240798858773181, "grad_norm": 0.08660914537757003, "learning_rate": 5.476521303188414e-06, "loss": 0.353, "step": 929 }, { "epoch": 4.245363766048502, "grad_norm": 0.09086935202462741, "learning_rate": 5.4122633781079135e-06, "loss": 0.3523, "step": 930 }, { "epoch": 4.249928673323823, "grad_norm": 0.09963483067768689, "learning_rate": 5.348357296144437e-06, "loss": 0.3528, "step": 931 }, { "epoch": 4.254493580599144, "grad_norm": 0.08898518315403384, "learning_rate": 5.2848037073814255e-06, "loss": 0.3492, "step": 932 }, { "epoch": 4.259058487874465, "grad_norm": 0.09983473387703683, "learning_rate": 5.221603258316577e-06, "loss": 0.3537, "step": 933 }, { "epoch": 4.263623395149786, "grad_norm": 0.09231469544481466, "learning_rate": 5.158756591855336e-06, "loss": 0.3505, "step": 934 }, { "epoch": 4.268188302425107, "grad_norm": 0.08932201492732685, "learning_rate": 5.0962643473042536e-06, "loss": 0.3506, "step": 935 }, { "epoch": 4.2727532097004275, "grad_norm": 0.09165674817192716, "learning_rate": 5.034127160364528e-06, "loss": 0.3542, "step": 936 }, { "epoch": 4.277318116975749, "grad_norm": 0.09295144892311318, "learning_rate": 4.972345663125575e-06, "loss": 0.3458, "step": 937 }, { "epoch": 4.28188302425107, "grad_norm": 0.09773667025776654, "learning_rate": 4.910920484058519e-06, "loss": 0.3489, "step": 938 }, { "epoch": 4.286447931526391, "grad_norm": 0.09478260876178425, "learning_rate": 4.849852248009899e-06, "loss": 0.3518, "step": 939 }, { "epoch": 4.291012838801712, "grad_norm": 0.08633818761277473, "learning_rate": 4.789141576195207e-06, "loss": 0.3481, "step": 940 }, { "epoch": 4.295577746077033, "grad_norm": 0.09290651843039628, "learning_rate": 4.72878908619264e-06, "loss": 0.3459, "step": 941 }, { "epoch": 4.300142653352354, "grad_norm": 0.0901405739554656, "learning_rate": 4.668795391936805e-06, "loss": 0.3438, "step": 942 }, { "epoch": 4.304707560627675, "grad_norm": 0.09095933811292778, "learning_rate": 4.609161103712447e-06, "loss": 0.35, "step": 943 }, { "epoch": 4.3092724679029955, "grad_norm": 0.08514226082856571, "learning_rate": 4.54988682814828e-06, "loss": 0.3514, "step": 944 }, { "epoch": 4.313837375178316, "grad_norm": 0.09103184637713459, "learning_rate": 4.490973168210788e-06, "loss": 0.351, "step": 945 }, { "epoch": 4.318402282453638, "grad_norm": 0.08771070966673426, "learning_rate": 4.43242072319809e-06, "loss": 0.3514, "step": 946 }, { "epoch": 4.322967189728959, "grad_norm": 0.08785642021314082, "learning_rate": 4.374230088733855e-06, "loss": 0.349, "step": 947 }, { "epoch": 4.32753209700428, "grad_norm": 0.09441411098621398, "learning_rate": 4.3164018567612495e-06, "loss": 0.3505, "step": 948 }, { "epoch": 4.332097004279601, "grad_norm": 0.08194712702951042, "learning_rate": 4.2589366155369125e-06, "loss": 0.3487, "step": 949 }, { "epoch": 4.336661911554922, "grad_norm": 0.08937141345166866, "learning_rate": 4.201834949624957e-06, "loss": 0.3523, "step": 950 }, { "epoch": 4.3412268188302425, "grad_norm": 0.08875313880651403, "learning_rate": 4.145097439891026e-06, "loss": 0.3529, "step": 951 }, { "epoch": 4.345791726105563, "grad_norm": 0.08216164995036938, "learning_rate": 4.088724663496391e-06, "loss": 0.3487, "step": 952 }, { "epoch": 4.350356633380884, "grad_norm": 0.0844163738277272, "learning_rate": 4.032717193892097e-06, "loss": 0.3536, "step": 953 }, { "epoch": 4.354921540656205, "grad_norm": 0.07958475981719833, "learning_rate": 3.977075600813112e-06, "loss": 0.3524, "step": 954 }, { "epoch": 4.359486447931526, "grad_norm": 0.08312472608098892, "learning_rate": 3.921800450272497e-06, "loss": 0.3564, "step": 955 }, { "epoch": 4.364051355206847, "grad_norm": 0.09090031843488945, "learning_rate": 3.866892304555729e-06, "loss": 0.3503, "step": 956 }, { "epoch": 4.368616262482169, "grad_norm": 0.08646905004538202, "learning_rate": 3.8123517222149064e-06, "loss": 0.3539, "step": 957 }, { "epoch": 4.37318116975749, "grad_norm": 0.08630929584464324, "learning_rate": 3.7581792580630995e-06, "loss": 0.3508, "step": 958 }, { "epoch": 4.3777460770328105, "grad_norm": 0.08420773366472539, "learning_rate": 3.7043754631687168e-06, "loss": 0.3506, "step": 959 }, { "epoch": 4.382310984308131, "grad_norm": 0.08626254558605037, "learning_rate": 3.650940884849865e-06, "loss": 0.3496, "step": 960 }, { "epoch": 4.386875891583452, "grad_norm": 0.08202547191435747, "learning_rate": 3.5978760666688283e-06, "loss": 0.3463, "step": 961 }, { "epoch": 4.391440798858773, "grad_norm": 0.08400830894322005, "learning_rate": 3.545181548426482e-06, "loss": 0.3534, "step": 962 }, { "epoch": 4.396005706134094, "grad_norm": 0.08174812448321671, "learning_rate": 3.4928578661568513e-06, "loss": 0.3505, "step": 963 }, { "epoch": 4.400570613409415, "grad_norm": 0.08855138380230874, "learning_rate": 3.4409055521216472e-06, "loss": 0.3484, "step": 964 }, { "epoch": 4.405135520684736, "grad_norm": 0.08806266323171609, "learning_rate": 3.3893251348048107e-06, "loss": 0.3517, "step": 965 }, { "epoch": 4.409700427960057, "grad_norm": 0.08598387207772194, "learning_rate": 3.3381171389072155e-06, "loss": 0.3464, "step": 966 }, { "epoch": 4.414265335235378, "grad_norm": 0.08257564200380783, "learning_rate": 3.287282085341237e-06, "loss": 0.3434, "step": 967 }, { "epoch": 4.418830242510699, "grad_norm": 0.08050286636135041, "learning_rate": 3.236820491225543e-06, "loss": 0.3507, "step": 968 }, { "epoch": 4.42339514978602, "grad_norm": 0.0804766651869741, "learning_rate": 3.1867328698797784e-06, "loss": 0.3559, "step": 969 }, { "epoch": 4.427960057061341, "grad_norm": 0.08321920927722377, "learning_rate": 3.1370197308193464e-06, "loss": 0.3495, "step": 970 }, { "epoch": 4.432524964336662, "grad_norm": 0.08340309487334537, "learning_rate": 3.08768157975023e-06, "loss": 0.3485, "step": 971 }, { "epoch": 4.437089871611983, "grad_norm": 0.08628533559205914, "learning_rate": 3.0387189185638877e-06, "loss": 0.3465, "step": 972 }, { "epoch": 4.441654778887304, "grad_norm": 0.0839367124944577, "learning_rate": 2.99013224533208e-06, "loss": 0.3514, "step": 973 }, { "epoch": 4.446219686162625, "grad_norm": 0.08431513019450357, "learning_rate": 2.9419220543018647e-06, "loss": 0.35, "step": 974 }, { "epoch": 4.4507845934379455, "grad_norm": 0.08138210314265082, "learning_rate": 2.894088835890512e-06, "loss": 0.3503, "step": 975 }, { "epoch": 4.455349500713266, "grad_norm": 0.08209216743333449, "learning_rate": 2.846633076680565e-06, "loss": 0.3501, "step": 976 }, { "epoch": 4.459914407988588, "grad_norm": 0.07980157930120314, "learning_rate": 2.7995552594148613e-06, "loss": 0.3477, "step": 977 }, { "epoch": 4.464479315263909, "grad_norm": 0.08454274847735264, "learning_rate": 2.7528558629916457e-06, "loss": 0.3508, "step": 978 }, { "epoch": 4.46904422253923, "grad_norm": 0.07827269612143122, "learning_rate": 2.706535362459657e-06, "loss": 0.3541, "step": 979 }, { "epoch": 4.473609129814551, "grad_norm": 0.08066212537936188, "learning_rate": 2.6605942290133515e-06, "loss": 0.3468, "step": 980 }, { "epoch": 4.478174037089872, "grad_norm": 0.08038509522306664, "learning_rate": 2.615032929988055e-06, "loss": 0.3493, "step": 981 }, { "epoch": 4.482738944365193, "grad_norm": 0.08121370144409773, "learning_rate": 2.569851928855256e-06, "loss": 0.3486, "step": 982 }, { "epoch": 4.4873038516405135, "grad_norm": 0.07856168793082442, "learning_rate": 2.525051685217865e-06, "loss": 0.3507, "step": 983 }, { "epoch": 4.491868758915834, "grad_norm": 0.07958873197319534, "learning_rate": 2.4806326548055238e-06, "loss": 0.3493, "step": 984 }, { "epoch": 4.496433666191155, "grad_norm": 0.08171369971311804, "learning_rate": 2.436595289470023e-06, "loss": 0.3508, "step": 985 }, { "epoch": 4.500998573466476, "grad_norm": 0.08120306757647351, "learning_rate": 2.3929400371806377e-06, "loss": 0.3521, "step": 986 }, { "epoch": 4.505563480741797, "grad_norm": 0.07637699134693174, "learning_rate": 2.3496673420196326e-06, "loss": 0.3466, "step": 987 }, { "epoch": 4.510128388017119, "grad_norm": 0.08223264882483422, "learning_rate": 2.306777644177709e-06, "loss": 0.3507, "step": 988 }, { "epoch": 4.51469329529244, "grad_norm": 0.07915711031881265, "learning_rate": 2.2642713799495207e-06, "loss": 0.3554, "step": 989 }, { "epoch": 4.519258202567761, "grad_norm": 0.08144647282074215, "learning_rate": 2.222148981729273e-06, "loss": 0.3529, "step": 990 }, { "epoch": 4.5238231098430814, "grad_norm": 0.08053252018883734, "learning_rate": 2.1804108780062805e-06, "loss": 0.3464, "step": 991 }, { "epoch": 4.528388017118402, "grad_norm": 0.08178510466321848, "learning_rate": 2.139057493360643e-06, "loss": 0.3477, "step": 992 }, { "epoch": 4.532952924393723, "grad_norm": 0.08178556560050178, "learning_rate": 2.098089248458912e-06, "loss": 0.3485, "step": 993 }, { "epoch": 4.537517831669044, "grad_norm": 0.08220979435959991, "learning_rate": 2.0575065600498067e-06, "loss": 0.3517, "step": 994 }, { "epoch": 4.542082738944365, "grad_norm": 0.07762389992501054, "learning_rate": 2.0173098409599757e-06, "loss": 0.3504, "step": 995 }, { "epoch": 4.546647646219686, "grad_norm": 0.07845697347936922, "learning_rate": 1.977499500089808e-06, "loss": 0.3473, "step": 996 }, { "epoch": 4.551212553495007, "grad_norm": 0.08243028759784551, "learning_rate": 1.9380759424092722e-06, "loss": 0.3488, "step": 997 }, { "epoch": 4.555777460770328, "grad_norm": 0.08482588178810972, "learning_rate": 1.899039568953782e-06, "loss": 0.3485, "step": 998 }, { "epoch": 4.560342368045649, "grad_norm": 0.08930393433101652, "learning_rate": 1.8603907768201335e-06, "loss": 0.3477, "step": 999 }, { "epoch": 4.56490727532097, "grad_norm": 0.07646082270023902, "learning_rate": 1.8221299591624531e-06, "loss": 0.3541, "step": 1000 }, { "epoch": 4.569472182596291, "grad_norm": 0.08228826529784623, "learning_rate": 1.7842575051882117e-06, "loss": 0.3455, "step": 1001 }, { "epoch": 4.574037089871612, "grad_norm": 0.0826320638970819, "learning_rate": 1.7467738001542534e-06, "loss": 0.3541, "step": 1002 }, { "epoch": 4.578601997146933, "grad_norm": 0.07749734600577586, "learning_rate": 1.7096792253628747e-06, "loss": 0.3508, "step": 1003 }, { "epoch": 4.583166904422254, "grad_norm": 0.07849844115759268, "learning_rate": 1.6729741581579695e-06, "loss": 0.3483, "step": 1004 }, { "epoch": 4.587731811697575, "grad_norm": 0.08061411035349854, "learning_rate": 1.6366589719211478e-06, "loss": 0.3459, "step": 1005 }, { "epoch": 4.592296718972896, "grad_norm": 0.07707354315110565, "learning_rate": 1.6007340360679835e-06, "loss": 0.3472, "step": 1006 }, { "epoch": 4.5968616262482165, "grad_norm": 0.07564998473974219, "learning_rate": 1.56519971604423e-06, "loss": 0.3536, "step": 1007 }, { "epoch": 4.601426533523538, "grad_norm": 0.07973487309962904, "learning_rate": 1.5300563733220997e-06, "loss": 0.3524, "step": 1008 }, { "epoch": 4.605991440798859, "grad_norm": 0.0798405736175932, "learning_rate": 1.4953043653966125e-06, "loss": 0.3437, "step": 1009 }, { "epoch": 4.61055634807418, "grad_norm": 0.07661822938491461, "learning_rate": 1.4609440457819201e-06, "loss": 0.3505, "step": 1010 }, { "epoch": 4.615121255349501, "grad_norm": 0.0803917140814825, "learning_rate": 1.4269757640077474e-06, "loss": 0.3473, "step": 1011 }, { "epoch": 4.619686162624822, "grad_norm": 0.08142790095124294, "learning_rate": 1.393399865615832e-06, "loss": 0.356, "step": 1012 }, { "epoch": 4.624251069900143, "grad_norm": 0.07941212240240511, "learning_rate": 1.3602166921563709e-06, "loss": 0.3469, "step": 1013 }, { "epoch": 4.628815977175464, "grad_norm": 0.07567140453271615, "learning_rate": 1.3274265811845877e-06, "loss": 0.3565, "step": 1014 }, { "epoch": 4.633380884450784, "grad_norm": 0.07693089631655367, "learning_rate": 1.2950298662572914e-06, "loss": 0.3511, "step": 1015 }, { "epoch": 4.637945791726105, "grad_norm": 0.07649326760766532, "learning_rate": 1.2630268769294695e-06, "loss": 0.3488, "step": 1016 }, { "epoch": 4.642510699001426, "grad_norm": 0.07907653808032476, "learning_rate": 1.2314179387509451e-06, "loss": 0.3498, "step": 1017 }, { "epoch": 4.647075606276747, "grad_norm": 0.08218341042555163, "learning_rate": 1.2002033732630624e-06, "loss": 0.3456, "step": 1018 }, { "epoch": 4.651640513552069, "grad_norm": 0.078114066869552, "learning_rate": 1.169383497995411e-06, "loss": 0.3519, "step": 1019 }, { "epoch": 4.65620542082739, "grad_norm": 0.08045459715620887, "learning_rate": 1.1389586264626141e-06, "loss": 0.3509, "step": 1020 }, { "epoch": 4.660770328102711, "grad_norm": 0.07667337101962392, "learning_rate": 1.108929068161122e-06, "loss": 0.3497, "step": 1021 }, { "epoch": 4.6653352353780315, "grad_norm": 0.08000069107101468, "learning_rate": 1.0792951285660601e-06, "loss": 0.3496, "step": 1022 }, { "epoch": 4.669900142653352, "grad_norm": 0.07789612908736167, "learning_rate": 1.0500571091281375e-06, "loss": 0.3493, "step": 1023 }, { "epoch": 4.674465049928673, "grad_norm": 0.08267558856337517, "learning_rate": 1.0212153072705732e-06, "loss": 0.3479, "step": 1024 }, { "epoch": 4.679029957203994, "grad_norm": 0.07420363546139669, "learning_rate": 9.927700163860642e-07, "loss": 0.3533, "step": 1025 }, { "epoch": 4.683594864479315, "grad_norm": 0.07447011872925986, "learning_rate": 9.647215258338138e-07, "loss": 0.3546, "step": 1026 }, { "epoch": 4.688159771754636, "grad_norm": 0.07716250968421168, "learning_rate": 9.370701209365784e-07, "loss": 0.3482, "step": 1027 }, { "epoch": 4.692724679029957, "grad_norm": 0.07514426209617693, "learning_rate": 9.098160829777724e-07, "loss": 0.3455, "step": 1028 }, { "epoch": 4.697289586305278, "grad_norm": 0.079548149699182, "learning_rate": 8.829596891985859e-07, "loss": 0.3511, "step": 1029 }, { "epoch": 4.7018544935805995, "grad_norm": 0.07934376344480484, "learning_rate": 8.565012127951955e-07, "loss": 0.3519, "step": 1030 }, { "epoch": 4.70641940085592, "grad_norm": 0.07581124441915535, "learning_rate": 8.304409229159804e-07, "loss": 0.346, "step": 1031 }, { "epoch": 4.710984308131241, "grad_norm": 0.07394023460690179, "learning_rate": 8.047790846587467e-07, "loss": 0.3533, "step": 1032 }, { "epoch": 4.715549215406562, "grad_norm": 0.07412942941301379, "learning_rate": 7.7951595906808e-07, "loss": 0.3525, "step": 1033 }, { "epoch": 4.720114122681883, "grad_norm": 0.07682082219491011, "learning_rate": 7.546518031326644e-07, "loss": 0.3515, "step": 1034 }, { "epoch": 4.724679029957204, "grad_norm": 0.07823025700829145, "learning_rate": 7.301868697826608e-07, "loss": 0.3492, "step": 1035 }, { "epoch": 4.729243937232525, "grad_norm": 0.0766178375837658, "learning_rate": 7.061214078871725e-07, "loss": 0.3519, "step": 1036 }, { "epoch": 4.733808844507846, "grad_norm": 0.0760779964551882, "learning_rate": 6.824556622516599e-07, "loss": 0.3512, "step": 1037 }, { "epoch": 4.7383737517831666, "grad_norm": 0.07486994881455176, "learning_rate": 6.591898736154801e-07, "loss": 0.3508, "step": 1038 }, { "epoch": 4.742938659058488, "grad_norm": 0.0759371546012345, "learning_rate": 6.363242786494539e-07, "loss": 0.3489, "step": 1039 }, { "epoch": 4.747503566333809, "grad_norm": 0.0749018727178018, "learning_rate": 6.138591099534141e-07, "loss": 0.3493, "step": 1040 }, { "epoch": 4.75206847360913, "grad_norm": 0.0753265709026964, "learning_rate": 5.917945960538918e-07, "loss": 0.3466, "step": 1041 }, { "epoch": 4.756633380884451, "grad_norm": 0.07840244976592427, "learning_rate": 5.701309614017447e-07, "loss": 0.3505, "step": 1042 }, { "epoch": 4.761198288159772, "grad_norm": 0.07383075199179212, "learning_rate": 5.488684263698929e-07, "loss": 0.3536, "step": 1043 }, { "epoch": 4.765763195435093, "grad_norm": 0.11122241090091968, "learning_rate": 5.280072072510933e-07, "loss": 0.3461, "step": 1044 }, { "epoch": 4.770328102710414, "grad_norm": 0.0762041664722857, "learning_rate": 5.075475162557109e-07, "loss": 0.3506, "step": 1045 }, { "epoch": 4.7748930099857345, "grad_norm": 0.07751822838032038, "learning_rate": 4.874895615095776e-07, "loss": 0.3492, "step": 1046 }, { "epoch": 4.779457917261055, "grad_norm": 0.0719584207148954, "learning_rate": 4.6783354705187466e-07, "loss": 0.3466, "step": 1047 }, { "epoch": 4.784022824536376, "grad_norm": 0.07737673543131036, "learning_rate": 4.485796728330449e-07, "loss": 0.3541, "step": 1048 }, { "epoch": 4.788587731811697, "grad_norm": 0.07610517221913907, "learning_rate": 4.29728134712768e-07, "loss": 0.3515, "step": 1049 }, { "epoch": 4.793152639087019, "grad_norm": 0.07503429819011145, "learning_rate": 4.11279124457975e-07, "loss": 0.3499, "step": 1050 }, { "epoch": 4.79771754636234, "grad_norm": 0.07627319972442853, "learning_rate": 3.9323282974088164e-07, "loss": 0.3487, "step": 1051 }, { "epoch": 4.802282453637661, "grad_norm": 0.07282031998275167, "learning_rate": 3.7558943413709583e-07, "loss": 0.3465, "step": 1052 }, { "epoch": 4.806847360912982, "grad_norm": 0.07525573928294778, "learning_rate": 3.5834911712373076e-07, "loss": 0.3488, "step": 1053 }, { "epoch": 4.8114122681883025, "grad_norm": 0.07245362406747542, "learning_rate": 3.4151205407759736e-07, "loss": 0.3515, "step": 1054 }, { "epoch": 4.815977175463623, "grad_norm": 0.0732430683885161, "learning_rate": 3.2507841627341e-07, "loss": 0.3531, "step": 1055 }, { "epoch": 4.820542082738944, "grad_norm": 0.0745846212106986, "learning_rate": 3.090483708820502e-07, "loss": 0.3497, "step": 1056 }, { "epoch": 4.825106990014265, "grad_norm": 0.07347343844152147, "learning_rate": 2.934220809688526e-07, "loss": 0.3518, "step": 1057 }, { "epoch": 4.829671897289586, "grad_norm": 0.07558186917350596, "learning_rate": 2.7819970549197937e-07, "loss": 0.3511, "step": 1058 }, { "epoch": 4.834236804564907, "grad_norm": 0.07275618304565004, "learning_rate": 2.63381399300755e-07, "loss": 0.3472, "step": 1059 }, { "epoch": 4.838801711840228, "grad_norm": 0.07287512649205556, "learning_rate": 2.489673131341297e-07, "loss": 0.3462, "step": 1060 }, { "epoch": 4.8433666191155496, "grad_norm": 0.07194032638043274, "learning_rate": 2.349575936191384e-07, "loss": 0.3476, "step": 1061 }, { "epoch": 4.84793152639087, "grad_norm": 0.07547182977690173, "learning_rate": 2.2135238326938646e-07, "loss": 0.3471, "step": 1062 }, { "epoch": 4.852496433666191, "grad_norm": 0.07389752198521447, "learning_rate": 2.0815182048362858e-07, "loss": 0.3511, "step": 1063 }, { "epoch": 4.857061340941512, "grad_norm": 0.07948970245950392, "learning_rate": 1.953560395443521e-07, "loss": 0.3515, "step": 1064 }, { "epoch": 4.861626248216833, "grad_norm": 0.07428844813444034, "learning_rate": 1.829651706164004e-07, "loss": 0.3481, "step": 1065 }, { "epoch": 4.866191155492154, "grad_norm": 0.07304235969500948, "learning_rate": 1.7097933974566272e-07, "loss": 0.3553, "step": 1066 }, { "epoch": 4.870756062767475, "grad_norm": 0.07616979320061058, "learning_rate": 1.5939866885778198e-07, "loss": 0.3532, "step": 1067 }, { "epoch": 4.875320970042796, "grad_norm": 0.07825400387440254, "learning_rate": 1.4822327575692464e-07, "loss": 0.3479, "step": 1068 }, { "epoch": 4.879885877318117, "grad_norm": 0.08117745285619632, "learning_rate": 1.374532741245682e-07, "loss": 0.3512, "step": 1069 }, { "epoch": 4.884450784593438, "grad_norm": 0.07977853089608672, "learning_rate": 1.2708877351835569e-07, "loss": 0.3485, "step": 1070 }, { "epoch": 4.889015691868759, "grad_norm": 0.07367012787142511, "learning_rate": 1.1712987937098519e-07, "loss": 0.3518, "step": 1071 }, { "epoch": 4.89358059914408, "grad_norm": 0.07383920187031665, "learning_rate": 1.0757669298912199e-07, "loss": 0.3478, "step": 1072 }, { "epoch": 4.898145506419401, "grad_norm": 0.07294907888019937, "learning_rate": 9.842931155238156e-08, "loss": 0.3528, "step": 1073 }, { "epoch": 4.902710413694722, "grad_norm": 0.07451910212338729, "learning_rate": 8.96878281123259e-08, "loss": 0.349, "step": 1074 }, { "epoch": 4.907275320970043, "grad_norm": 0.07200140543046635, "learning_rate": 8.135233159154431e-08, "loss": 0.3485, "step": 1075 }, { "epoch": 4.911840228245364, "grad_norm": 0.07562528202585733, "learning_rate": 7.342290678272079e-08, "loss": 0.3498, "step": 1076 }, { "epoch": 4.916405135520685, "grad_norm": 0.07457317771382703, "learning_rate": 6.58996343477769e-08, "loss": 0.3537, "step": 1077 }, { "epoch": 4.9209700427960055, "grad_norm": 0.07363984134436599, "learning_rate": 5.878259081707249e-08, "loss": 0.3543, "step": 1078 }, { "epoch": 4.925534950071326, "grad_norm": 0.07252929837329791, "learning_rate": 5.2071848588601815e-08, "loss": 0.3473, "step": 1079 }, { "epoch": 4.930099857346647, "grad_norm": 0.07518239735309995, "learning_rate": 4.576747592726083e-08, "loss": 0.3503, "step": 1080 }, { "epoch": 4.934664764621969, "grad_norm": 0.07799050676264847, "learning_rate": 3.9869536964167734e-08, "loss": 0.3548, "step": 1081 }, { "epoch": 4.93922967189729, "grad_norm": 0.075339876329051, "learning_rate": 3.437809169600126e-08, "loss": 0.3502, "step": 1082 }, { "epoch": 4.943794579172611, "grad_norm": 0.07479700944498346, "learning_rate": 2.9293195984383405e-08, "loss": 0.3468, "step": 1083 }, { "epoch": 4.948359486447932, "grad_norm": 0.07210974118530611, "learning_rate": 2.461490155532875e-08, "loss": 0.3507, "step": 1084 }, { "epoch": 4.9529243937232525, "grad_norm": 0.07652036354657446, "learning_rate": 2.03432559986938e-08, "loss": 0.3476, "step": 1085 }, { "epoch": 4.957489300998573, "grad_norm": 0.07127307671646077, "learning_rate": 1.6478302767719555e-08, "loss": 0.3504, "step": 1086 }, { "epoch": 4.962054208273894, "grad_norm": 0.07366450733389644, "learning_rate": 1.3020081178574117e-08, "loss": 0.3488, "step": 1087 }, { "epoch": 4.966619115549215, "grad_norm": 0.07323536126136222, "learning_rate": 9.968626409948556e-09, "loss": 0.353, "step": 1088 }, { "epoch": 4.971184022824536, "grad_norm": 0.07340553491000143, "learning_rate": 7.323969502710526e-09, "loss": 0.3501, "step": 1089 }, { "epoch": 4.975748930099857, "grad_norm": 0.07588823565965415, "learning_rate": 5.0861373595889605e-09, "loss": 0.349, "step": 1090 }, { "epoch": 4.980313837375178, "grad_norm": 0.07676438153761625, "learning_rate": 3.255152744885415e-09, "loss": 0.3526, "step": 1091 }, { "epoch": 4.9848787446505, "grad_norm": 0.07181335005694361, "learning_rate": 1.831034284260902e-09, "loss": 0.3477, "step": 1092 }, { "epoch": 4.9894436519258205, "grad_norm": 0.07171739407306418, "learning_rate": 8.137964645316132e-10, "loss": 0.3434, "step": 1093 }, { "epoch": 4.994008559201141, "grad_norm": 0.07293042506861117, "learning_rate": 2.0344963353124969e-10, "loss": 0.3491, "step": 1094 }, { "epoch": 4.998573466476462, "grad_norm": 0.07498693212494786, "learning_rate": 0.0, "loss": 0.3471, "step": 1095 }, { "epoch": 4.998573466476462, "step": 1095, "total_flos": 2.8177610658514207e+19, "train_loss": 0.41344334662777105, "train_runtime": 239120.7765, "train_samples_per_second": 2.345, "train_steps_per_second": 0.005 } ], "logging_steps": 1.0, "max_steps": 1095, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8177610658514207e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }