{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993247805536799, "eval_steps": 500, "global_step": 370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 21.415120223600994, "learning_rate": 4.1666666666666665e-05, "loss": 8.6562, "step": 1 }, { "epoch": 0.01, "grad_norm": 13.773433414832953, "learning_rate": 8.333333333333333e-05, "loss": 8.7148, "step": 2 }, { "epoch": 0.01, "grad_norm": 5.8226925278473605, "learning_rate": 0.000125, "loss": 6.1523, "step": 3 }, { "epoch": 0.01, "grad_norm": 6.913578305289376, "learning_rate": 0.00016666666666666666, "loss": 4.8105, "step": 4 }, { "epoch": 0.01, "grad_norm": 7.564457947295859, "learning_rate": 0.00020833333333333335, "loss": 4.7148, "step": 5 }, { "epoch": 0.02, "grad_norm": 3.4572492282918157, "learning_rate": 0.00025, "loss": 4.3516, "step": 6 }, { "epoch": 0.02, "grad_norm": 2.1297987720642504, "learning_rate": 0.0002916666666666667, "loss": 3.9824, "step": 7 }, { "epoch": 0.02, "grad_norm": 1.1136160564583724, "learning_rate": 0.0003333333333333333, "loss": 3.7812, "step": 8 }, { "epoch": 0.02, "grad_norm": 0.8904819206285229, "learning_rate": 0.000375, "loss": 3.7344, "step": 9 }, { "epoch": 0.03, "grad_norm": 2.11038283184139, "learning_rate": 0.0004166666666666667, "loss": 3.7041, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.802508097218576, "learning_rate": 0.0004583333333333333, "loss": 3.6377, "step": 11 }, { "epoch": 0.03, "grad_norm": 0.4105103213318609, "learning_rate": 0.0005, "loss": 3.5898, "step": 12 }, { "epoch": 0.04, "grad_norm": 0.28889375510308085, "learning_rate": 0.0004999903741094759, "loss": 3.5059, "step": 13 }, { "epoch": 0.04, "grad_norm": 0.1903491115661804, "learning_rate": 0.0004999614971791658, "loss": 3.5312, "step": 14 }, { "epoch": 0.04, "grad_norm": 0.18406971686632234, "learning_rate": 0.0004999133714327992, "loss": 3.5264, "step": 15 }, { "epoch": 0.04, "grad_norm": 0.19187377072404146, "learning_rate": 0.0004998460005764011, "loss": 3.4863, "step": 16 }, { "epoch": 0.05, "grad_norm": 0.18894966967720278, "learning_rate": 0.0004997593897980076, "loss": 3.5098, "step": 17 }, { "epoch": 0.05, "grad_norm": 0.16990770049278048, "learning_rate": 0.0004996535457672657, "loss": 3.46, "step": 18 }, { "epoch": 0.05, "grad_norm": 0.14341359539921433, "learning_rate": 0.0004995284766349196, "loss": 3.4102, "step": 19 }, { "epoch": 0.05, "grad_norm": 0.13853022483240476, "learning_rate": 0.0004993841920321838, "loss": 3.3721, "step": 20 }, { "epoch": 0.06, "grad_norm": 0.14882163330247777, "learning_rate": 0.0004992207030700005, "loss": 3.4062, "step": 21 }, { "epoch": 0.06, "grad_norm": 0.12082174803609903, "learning_rate": 0.0004990380223381846, "loss": 3.3682, "step": 22 }, { "epoch": 0.06, "grad_norm": 0.11072980253911602, "learning_rate": 0.0004988361639044537, "loss": 3.335, "step": 23 }, { "epoch": 0.06, "grad_norm": 0.10689352906545974, "learning_rate": 0.0004986151433133455, "loss": 3.3447, "step": 24 }, { "epoch": 0.07, "grad_norm": 0.10490544779612036, "learning_rate": 0.0004983749775850199, "loss": 3.3252, "step": 25 }, { "epoch": 0.07, "grad_norm": 0.10182457932796082, "learning_rate": 0.0004981156852139491, "loss": 3.3037, "step": 26 }, { "epoch": 0.07, "grad_norm": 0.1056648298216298, "learning_rate": 0.0004978372861674928, "loss": 3.3164, "step": 27 }, { "epoch": 0.08, "grad_norm": 0.08205846396464361, "learning_rate": 0.0004975398018843612, "loss": 3.2939, "step": 28 }, { "epoch": 0.08, "grad_norm": 0.08913381428371997, "learning_rate": 0.0004972232552729632, "loss": 3.2949, "step": 29 }, { "epoch": 0.08, "grad_norm": 0.07539237478136081, "learning_rate": 0.000496887670709643, "loss": 3.2461, "step": 30 }, { "epoch": 0.08, "grad_norm": 0.07628363158131596, "learning_rate": 0.0004965330740368029, "loss": 3.2578, "step": 31 }, { "epoch": 0.09, "grad_norm": 0.07982185655162036, "learning_rate": 0.0004961594925609129, "loss": 3.2451, "step": 32 }, { "epoch": 0.09, "grad_norm": 0.08695759890018029, "learning_rate": 0.000495766955050408, "loss": 3.2607, "step": 33 }, { "epoch": 0.09, "grad_norm": 0.06301153004824052, "learning_rate": 0.0004953554917334731, "loss": 3.2061, "step": 34 }, { "epoch": 0.09, "grad_norm": 0.06474729877299198, "learning_rate": 0.0004949251342957151, "loss": 3.25, "step": 35 }, { "epoch": 0.1, "grad_norm": 0.07883442367683367, "learning_rate": 0.0004944759158777223, "loss": 3.2236, "step": 36 }, { "epoch": 0.1, "grad_norm": 0.07015055529327788, "learning_rate": 0.0004940078710725135, "loss": 3.2227, "step": 37 }, { "epoch": 0.1, "grad_norm": 0.07119572625376963, "learning_rate": 0.0004935210359228729, "loss": 3.2031, "step": 38 }, { "epoch": 0.11, "grad_norm": 0.06367666239627473, "learning_rate": 0.0004930154479185756, "loss": 3.252, "step": 39 }, { "epoch": 0.11, "grad_norm": 0.06300151867440586, "learning_rate": 0.0004924911459934997, "loss": 3.1777, "step": 40 }, { "epoch": 0.11, "grad_norm": 0.07472850979326924, "learning_rate": 0.0004919481705226285, "loss": 3.2314, "step": 41 }, { "epoch": 0.11, "grad_norm": 0.08114730974850456, "learning_rate": 0.0004913865633189419, "loss": 3.2119, "step": 42 }, { "epoch": 0.12, "grad_norm": 0.05895132991107217, "learning_rate": 0.0004908063676301953, "loss": 3.2188, "step": 43 }, { "epoch": 0.12, "grad_norm": 0.05010637153257128, "learning_rate": 0.0004902076281355902, "loss": 3.1689, "step": 44 }, { "epoch": 0.12, "grad_norm": 0.06316741853726789, "learning_rate": 0.0004895903909423332, "loss": 3.1904, "step": 45 }, { "epoch": 0.12, "grad_norm": 0.069290214948921, "learning_rate": 0.0004889547035820856, "loss": 3.1689, "step": 46 }, { "epoch": 0.13, "grad_norm": 0.06229058085155359, "learning_rate": 0.0004883006150073028, "loss": 3.1514, "step": 47 }, { "epoch": 0.13, "grad_norm": 0.05042088665341411, "learning_rate": 0.000487628175587465, "loss": 3.1816, "step": 48 }, { "epoch": 0.13, "grad_norm": 0.0545027205027528, "learning_rate": 0.00048693743710519807, "loss": 3.1875, "step": 49 }, { "epoch": 0.14, "grad_norm": 0.06583086092960536, "learning_rate": 0.00048622845275228614, "loss": 3.1816, "step": 50 }, { "epoch": 0.14, "grad_norm": 0.06171597888295892, "learning_rate": 0.00048550127712557515, "loss": 3.1621, "step": 51 }, { "epoch": 0.14, "grad_norm": 0.05936904165187056, "learning_rate": 0.0004847559662227691, "loss": 3.1689, "step": 52 }, { "epoch": 0.14, "grad_norm": 0.053617959255968184, "learning_rate": 0.0004839925774381172, "loss": 3.1826, "step": 53 }, { "epoch": 0.15, "grad_norm": 0.0584940883576519, "learning_rate": 0.00048321116955799426, "loss": 3.1445, "step": 54 }, { "epoch": 0.15, "grad_norm": 0.05534676948352036, "learning_rate": 0.0004824118027563742, "loss": 3.1367, "step": 55 }, { "epoch": 0.15, "grad_norm": 0.05079053816193199, "learning_rate": 0.00048159453859019535, "loss": 3.1221, "step": 56 }, { "epoch": 0.15, "grad_norm": 0.051247127284195455, "learning_rate": 0.000480759439994621, "loss": 3.1533, "step": 57 }, { "epoch": 0.16, "grad_norm": 0.05840283966287256, "learning_rate": 0.00047990657127819234, "loss": 3.1592, "step": 58 }, { "epoch": 0.16, "grad_norm": 0.06275081599011795, "learning_rate": 0.0004790359981178766, "loss": 3.1309, "step": 59 }, { "epoch": 0.16, "grad_norm": 0.061480429010200704, "learning_rate": 0.0004781477875540091, "loss": 3.1484, "step": 60 }, { "epoch": 0.16, "grad_norm": 0.06296237796551907, "learning_rate": 0.0004772420079851313, "loss": 3.1338, "step": 61 }, { "epoch": 0.17, "grad_norm": 0.05939876669569374, "learning_rate": 0.00047631872916272265, "loss": 3.1758, "step": 62 }, { "epoch": 0.17, "grad_norm": 0.05697500202970297, "learning_rate": 0.00047537802218583036, "loss": 3.1631, "step": 63 }, { "epoch": 0.17, "grad_norm": 0.05235594093980006, "learning_rate": 0.00047441995949559323, "loss": 3.1592, "step": 64 }, { "epoch": 0.18, "grad_norm": 0.05136699721805155, "learning_rate": 0.0004734446148696641, "loss": 3.1348, "step": 65 }, { "epoch": 0.18, "grad_norm": 0.056572106168923254, "learning_rate": 0.00047245206341652734, "loss": 3.1475, "step": 66 }, { "epoch": 0.18, "grad_norm": 0.0680276012617714, "learning_rate": 0.0004714423815697162, "loss": 3.1221, "step": 67 }, { "epoch": 0.18, "grad_norm": 0.058203239195036796, "learning_rate": 0.0004704156470819261, "loss": 3.1387, "step": 68 }, { "epoch": 0.19, "grad_norm": 0.05654758029632964, "learning_rate": 0.00046937193901902703, "loss": 3.1152, "step": 69 }, { "epoch": 0.19, "grad_norm": 0.07079606102138672, "learning_rate": 0.0004683113377539756, "loss": 3.0811, "step": 70 }, { "epoch": 0.19, "grad_norm": 0.06311116636541605, "learning_rate": 0.00046723392496062513, "loss": 3.1299, "step": 71 }, { "epoch": 0.19, "grad_norm": 0.04930899192235485, "learning_rate": 0.0004661397836074362, "loss": 3.0898, "step": 72 }, { "epoch": 0.2, "grad_norm": 0.07242568044263648, "learning_rate": 0.0004650289979510883, "loss": 3.1328, "step": 73 }, { "epoch": 0.2, "grad_norm": 0.06815171525302943, "learning_rate": 0.00046390165352998996, "loss": 3.0713, "step": 74 }, { "epoch": 0.2, "grad_norm": 0.06138422128904437, "learning_rate": 0.0004627578371576933, "loss": 3.0703, "step": 75 }, { "epoch": 0.21, "grad_norm": 0.059644963475315685, "learning_rate": 0.0004615976369162077, "loss": 3.0898, "step": 76 }, { "epoch": 0.21, "grad_norm": 0.06496043006168827, "learning_rate": 0.00046042114214921723, "loss": 3.0986, "step": 77 }, { "epoch": 0.21, "grad_norm": 0.0634821846883854, "learning_rate": 0.00045922844345520046, "loss": 3.1025, "step": 78 }, { "epoch": 0.21, "grad_norm": 0.06014574410746669, "learning_rate": 0.000458019632680454, "loss": 3.1055, "step": 79 }, { "epoch": 0.22, "grad_norm": 0.06497852199883221, "learning_rate": 0.00045679480291201916, "loss": 3.0781, "step": 80 }, { "epoch": 0.22, "grad_norm": 0.07113303653242763, "learning_rate": 0.00045555404847051406, "loss": 3.0742, "step": 81 }, { "epoch": 0.22, "grad_norm": 0.06890895616862315, "learning_rate": 0.0004542974649028701, "loss": 3.0605, "step": 82 }, { "epoch": 0.22, "grad_norm": 0.06609605331363849, "learning_rate": 0.00045302514897497414, "loss": 3.0859, "step": 83 }, { "epoch": 0.23, "grad_norm": 0.06783983704880324, "learning_rate": 0.0004517371986642168, "loss": 3.0693, "step": 84 }, { "epoch": 0.23, "grad_norm": 0.07647246769844311, "learning_rate": 0.0004504337131519477, "loss": 3.0918, "step": 85 }, { "epoch": 0.23, "grad_norm": 0.07603324518310302, "learning_rate": 0.00044911479281583747, "loss": 3.0547, "step": 86 }, { "epoch": 0.23, "grad_norm": 0.08035265561784409, "learning_rate": 0.0004477805392221483, "loss": 3.0547, "step": 87 }, { "epoch": 0.24, "grad_norm": 0.07818094146866135, "learning_rate": 0.00044643105511791236, "loss": 3.0938, "step": 88 }, { "epoch": 0.24, "grad_norm": 0.08705128277730868, "learning_rate": 0.0004450664444230197, "loss": 3.0811, "step": 89 }, { "epoch": 0.24, "grad_norm": 0.08927687330837643, "learning_rate": 0.00044368681222221553, "loss": 3.0566, "step": 90 }, { "epoch": 0.25, "grad_norm": 0.08533504666800651, "learning_rate": 0.00044229226475700815, "loss": 3.0889, "step": 91 }, { "epoch": 0.25, "grad_norm": 0.08432813563363031, "learning_rate": 0.0004408829094174872, "loss": 3.0312, "step": 92 }, { "epoch": 0.25, "grad_norm": 0.09460079385850353, "learning_rate": 0.0004394588547340546, "loss": 3.041, "step": 93 }, { "epoch": 0.25, "grad_norm": 0.12021954624176756, "learning_rate": 0.00043802021036906604, "loss": 3.0293, "step": 94 }, { "epoch": 0.26, "grad_norm": 0.09741352747719122, "learning_rate": 0.00043656708710838687, "loss": 3.0381, "step": 95 }, { "epoch": 0.26, "grad_norm": 0.1005763508223488, "learning_rate": 0.00043509959685286047, "loss": 3.043, "step": 96 }, { "epoch": 0.26, "grad_norm": 0.11242499386359363, "learning_rate": 0.00043361785260969126, "loss": 3.0674, "step": 97 }, { "epoch": 0.26, "grad_norm": 0.10975775405702991, "learning_rate": 0.0004321219684837422, "loss": 3.0312, "step": 98 }, { "epoch": 0.27, "grad_norm": 0.11737592618628766, "learning_rate": 0.00043061205966874784, "loss": 3.0547, "step": 99 }, { "epoch": 0.27, "grad_norm": 0.13051148309222815, "learning_rate": 0.00042908824243844383, "loss": 3.0225, "step": 100 }, { "epoch": 0.27, "grad_norm": 0.1293140100108899, "learning_rate": 0.00042755063413761304, "loss": 3.0352, "step": 101 }, { "epoch": 0.28, "grad_norm": 0.11252955969439887, "learning_rate": 0.0004259993531730486, "loss": 3.0303, "step": 102 }, { "epoch": 0.28, "grad_norm": 0.1328900419617491, "learning_rate": 0.00042443451900443666, "loss": 3.0146, "step": 103 }, { "epoch": 0.28, "grad_norm": 0.12693126792922263, "learning_rate": 0.0004228562521351562, "loss": 3.0244, "step": 104 }, { "epoch": 0.28, "grad_norm": 0.1251814736102281, "learning_rate": 0.0004212646741030001, "loss": 3.0264, "step": 105 }, { "epoch": 0.29, "grad_norm": 0.13416576734949723, "learning_rate": 0.0004196599074708156, "loss": 3.0166, "step": 106 }, { "epoch": 0.29, "grad_norm": 0.12552446417030455, "learning_rate": 0.0004180420758170659, "loss": 3.0332, "step": 107 }, { "epoch": 0.29, "grad_norm": 0.13647763499691762, "learning_rate": 0.00041641130372631417, "loss": 3.0049, "step": 108 }, { "epoch": 0.29, "grad_norm": 0.12497753797378944, "learning_rate": 0.0004147677167796294, "loss": 2.9766, "step": 109 }, { "epoch": 0.3, "grad_norm": 0.16181335617568174, "learning_rate": 0.00041311144154491567, "loss": 3.0078, "step": 110 }, { "epoch": 0.3, "grad_norm": 0.1231265891953706, "learning_rate": 0.0004114426055671655, "loss": 3.0371, "step": 111 }, { "epoch": 0.3, "grad_norm": 0.1500040260456989, "learning_rate": 0.0004097613373586385, "loss": 3.0137, "step": 112 }, { "epoch": 0.31, "grad_norm": 0.14493053467762143, "learning_rate": 0.0004080677663889644, "loss": 3.0059, "step": 113 }, { "epoch": 0.31, "grad_norm": 0.13498422720011877, "learning_rate": 0.00040636202307517313, "loss": 2.9834, "step": 114 }, { "epoch": 0.31, "grad_norm": 0.13758614368752517, "learning_rate": 0.0004046442387716519, "loss": 2.9883, "step": 115 }, { "epoch": 0.31, "grad_norm": 0.1547910759873873, "learning_rate": 0.0004029145457600298, "loss": 3.0146, "step": 116 }, { "epoch": 0.32, "grad_norm": 0.1114547142061138, "learning_rate": 0.0004011730772389916, "loss": 2.9502, "step": 117 }, { "epoch": 0.32, "grad_norm": 0.158851085984572, "learning_rate": 0.0003994199673140199, "loss": 2.9707, "step": 118 }, { "epoch": 0.32, "grad_norm": 0.15035124478352418, "learning_rate": 0.0003976553509870683, "loss": 2.959, "step": 119 }, { "epoch": 0.32, "grad_norm": 0.18562089234260126, "learning_rate": 0.00039587936414616556, "loss": 2.9893, "step": 120 }, { "epoch": 0.33, "grad_norm": 0.1500050006066447, "learning_rate": 0.00039409214355495093, "loss": 2.9912, "step": 121 }, { "epoch": 0.33, "grad_norm": 0.15173547245397362, "learning_rate": 0.00039229382684214233, "loss": 2.9971, "step": 122 }, { "epoch": 0.33, "grad_norm": 0.17021016573674075, "learning_rate": 0.0003904845524909382, "loss": 2.9766, "step": 123 }, { "epoch": 0.33, "grad_norm": 0.16160325963256653, "learning_rate": 0.0003886644598283533, "loss": 2.9463, "step": 124 }, { "epoch": 0.34, "grad_norm": 0.16620594275883835, "learning_rate": 0.00038683368901448934, "loss": 2.9951, "step": 125 }, { "epoch": 0.34, "grad_norm": 0.18996567972484682, "learning_rate": 0.00038499238103174166, "loss": 2.9492, "step": 126 }, { "epoch": 0.34, "grad_norm": 0.17307569767495753, "learning_rate": 0.00038314067767394286, "loss": 2.9805, "step": 127 }, { "epoch": 0.35, "grad_norm": 0.19685088775869003, "learning_rate": 0.0003812787215354433, "loss": 2.9424, "step": 128 }, { "epoch": 0.35, "grad_norm": 0.16189642626507808, "learning_rate": 0.00037940665600013057, "loss": 2.9512, "step": 129 }, { "epoch": 0.35, "grad_norm": 0.17612947304733076, "learning_rate": 0.00037752462523038793, "loss": 2.9639, "step": 130 }, { "epoch": 0.35, "grad_norm": 0.18179176164130736, "learning_rate": 0.0003756327741559925, "loss": 2.9688, "step": 131 }, { "epoch": 0.36, "grad_norm": 0.16591267900078252, "learning_rate": 0.00037373124846295505, "loss": 2.9482, "step": 132 }, { "epoch": 0.36, "grad_norm": 0.18798955074723858, "learning_rate": 0.00037182019458230066, "loss": 2.9395, "step": 133 }, { "epoch": 0.36, "grad_norm": 0.17543591147515358, "learning_rate": 0.000369899759678793, "loss": 2.9404, "step": 134 }, { "epoch": 0.36, "grad_norm": 0.18490619032970695, "learning_rate": 0.0003679700916396009, "loss": 2.9736, "step": 135 }, { "epoch": 0.37, "grad_norm": 0.1804894673337707, "learning_rate": 0.00036603133906291095, "loss": 2.9561, "step": 136 }, { "epoch": 0.37, "grad_norm": 0.18154491542277257, "learning_rate": 0.0003640836512464837, "loss": 2.9609, "step": 137 }, { "epoch": 0.37, "grad_norm": 0.15353929430307406, "learning_rate": 0.00036212717817615635, "loss": 2.9355, "step": 138 }, { "epoch": 0.38, "grad_norm": 0.21621902507983504, "learning_rate": 0.000360162070514294, "loss": 2.957, "step": 139 }, { "epoch": 0.38, "grad_norm": 0.1829941353882955, "learning_rate": 0.0003581884795881861, "loss": 2.9521, "step": 140 }, { "epoch": 0.38, "grad_norm": 0.18716180371474261, "learning_rate": 0.0003562065573783945, "loss": 2.9434, "step": 141 }, { "epoch": 0.38, "grad_norm": 0.2275322416310555, "learning_rate": 0.0003542164565070486, "loss": 2.9365, "step": 142 }, { "epoch": 0.39, "grad_norm": 0.19125954171713724, "learning_rate": 0.00035221833022609366, "loss": 2.9121, "step": 143 }, { "epoch": 0.39, "grad_norm": 0.23978782914992927, "learning_rate": 0.0003502123324054882, "loss": 2.9473, "step": 144 }, { "epoch": 0.39, "grad_norm": 0.18956138126965327, "learning_rate": 0.0003481986175213556, "loss": 2.9531, "step": 145 }, { "epoch": 0.39, "grad_norm": 0.20333907762683046, "learning_rate": 0.0003461773406440879, "loss": 2.9229, "step": 146 }, { "epoch": 0.4, "grad_norm": 0.21650596272683872, "learning_rate": 0.0003441486574264048, "loss": 2.9639, "step": 147 }, { "epoch": 0.4, "grad_norm": 0.20068147784646773, "learning_rate": 0.00034211272409136667, "loss": 2.9395, "step": 148 }, { "epoch": 0.4, "grad_norm": 0.20053669008991853, "learning_rate": 0.00034006969742034465, "loss": 2.9541, "step": 149 }, { "epoch": 0.41, "grad_norm": 0.19123221320191458, "learning_rate": 0.00033801973474094745, "loss": 2.9668, "step": 150 }, { "epoch": 0.41, "grad_norm": 0.20055890299268714, "learning_rate": 0.00033596299391490565, "loss": 2.918, "step": 151 }, { "epoch": 0.41, "grad_norm": 0.21961462852023858, "learning_rate": 0.0003338996333259155, "loss": 2.8887, "step": 152 }, { "epoch": 0.41, "grad_norm": 0.19923388725160776, "learning_rate": 0.000331829811867442, "loss": 2.915, "step": 153 }, { "epoch": 0.42, "grad_norm": 0.2202467020445118, "learning_rate": 0.0003297536889304835, "loss": 2.9072, "step": 154 }, { "epoch": 0.42, "grad_norm": 0.1896310412062489, "learning_rate": 0.0003276714243912966, "loss": 2.9219, "step": 155 }, { "epoch": 0.42, "grad_norm": 0.24063275517469532, "learning_rate": 0.0003255831785990854, "loss": 2.9268, "step": 156 }, { "epoch": 0.42, "grad_norm": 0.2093352162026971, "learning_rate": 0.000323489112363653, "loss": 2.9102, "step": 157 }, { "epoch": 0.43, "grad_norm": 0.19084865959587785, "learning_rate": 0.000321389386943018, "loss": 2.916, "step": 158 }, { "epoch": 0.43, "grad_norm": 0.24890451630400445, "learning_rate": 0.0003192841640309966, "loss": 2.9492, "step": 159 }, { "epoch": 0.43, "grad_norm": 0.18002995253459828, "learning_rate": 0.0003171736057447511, "loss": 2.9111, "step": 160 }, { "epoch": 0.43, "grad_norm": 0.22447733600257028, "learning_rate": 0.00031505787461230553, "loss": 2.8936, "step": 161 }, { "epoch": 0.44, "grad_norm": 0.2467901168622779, "learning_rate": 0.00031293713356003, "loss": 2.9316, "step": 162 }, { "epoch": 0.44, "grad_norm": 0.20550840515489216, "learning_rate": 0.00031081154590009404, "loss": 2.9453, "step": 163 }, { "epoch": 0.44, "grad_norm": 0.21166602849404573, "learning_rate": 0.0003086812753178907, "loss": 2.916, "step": 164 }, { "epoch": 0.45, "grad_norm": 0.3602799328889876, "learning_rate": 0.0003065464858594311, "loss": 2.9102, "step": 165 }, { "epoch": 0.45, "grad_norm": 0.18298535380931488, "learning_rate": 0.00030440734191871234, "loss": 2.9043, "step": 166 }, { "epoch": 0.45, "grad_norm": 0.2572700716773119, "learning_rate": 0.00030226400822505736, "loss": 2.9033, "step": 167 }, { "epoch": 0.45, "grad_norm": 0.25835419996657133, "learning_rate": 0.00030011664983043025, "loss": 2.9062, "step": 168 }, { "epoch": 0.46, "grad_norm": 0.17683277759979313, "learning_rate": 0.0002979654320967255, "loss": 2.916, "step": 169 }, { "epoch": 0.46, "grad_norm": 0.2894378726624145, "learning_rate": 0.0002958105206830343, "loss": 2.9053, "step": 170 }, { "epoch": 0.46, "grad_norm": 0.23685763825987682, "learning_rate": 0.0002936520815328876, "loss": 2.8857, "step": 171 }, { "epoch": 0.46, "grad_norm": 0.2008573100452373, "learning_rate": 0.000291490280861477, "loss": 2.9297, "step": 172 }, { "epoch": 0.47, "grad_norm": 0.26444755166927053, "learning_rate": 0.00028932528514285534, "loss": 2.9316, "step": 173 }, { "epoch": 0.47, "grad_norm": 0.24379542789804157, "learning_rate": 0.00028715726109711665, "loss": 2.9131, "step": 174 }, { "epoch": 0.47, "grad_norm": 0.20673216784596224, "learning_rate": 0.00028498637567755817, "loss": 2.9082, "step": 175 }, { "epoch": 0.48, "grad_norm": 0.21944969615374402, "learning_rate": 0.0002828127960578227, "loss": 2.9199, "step": 176 }, { "epoch": 0.48, "grad_norm": 0.20478834301866145, "learning_rate": 0.00028063668961902623, "loss": 2.9033, "step": 177 }, { "epoch": 0.48, "grad_norm": 0.21481876173786985, "learning_rate": 0.00027845822393686724, "loss": 2.8809, "step": 178 }, { "epoch": 0.48, "grad_norm": 0.22811931240082148, "learning_rate": 0.00027627756676872344, "loss": 2.8916, "step": 179 }, { "epoch": 0.49, "grad_norm": 0.25707311204201183, "learning_rate": 0.00027409488604073184, "loss": 2.9258, "step": 180 }, { "epoch": 0.49, "grad_norm": 0.22655991900141323, "learning_rate": 0.0002719103498348586, "loss": 2.8936, "step": 181 }, { "epoch": 0.49, "grad_norm": 0.2462832484307767, "learning_rate": 0.00026972412637595445, "loss": 2.8789, "step": 182 }, { "epoch": 0.49, "grad_norm": 0.16995673551677068, "learning_rate": 0.00026753638401880094, "loss": 2.876, "step": 183 }, { "epoch": 0.5, "grad_norm": 0.2536950503045075, "learning_rate": 0.0002653472912351454, "loss": 2.9189, "step": 184 }, { "epoch": 0.5, "grad_norm": 0.20069386810955497, "learning_rate": 0.0002631570166007276, "loss": 2.8838, "step": 185 }, { "epoch": 0.5, "grad_norm": 0.17449546614470834, "learning_rate": 0.00026096572878229855, "loss": 2.8936, "step": 186 }, { "epoch": 0.51, "grad_norm": 0.20654114988441505, "learning_rate": 0.00025877359652463124, "loss": 2.8965, "step": 187 }, { "epoch": 0.51, "grad_norm": 0.19593843356526874, "learning_rate": 0.0002565807886375267, "loss": 2.8984, "step": 188 }, { "epoch": 0.51, "grad_norm": 0.20292351001271516, "learning_rate": 0.00025438747398281433, "loss": 2.9004, "step": 189 }, { "epoch": 0.51, "grad_norm": 0.2043498399445655, "learning_rate": 0.00025219382146134806, "loss": 2.8516, "step": 190 }, { "epoch": 0.52, "grad_norm": 0.1661636250320572, "learning_rate": 0.00025, "loss": 2.8545, "step": 191 }, { "epoch": 0.52, "grad_norm": 0.19022198317016406, "learning_rate": 0.00024780617853865195, "loss": 2.9092, "step": 192 }, { "epoch": 0.52, "grad_norm": 0.22844383852438707, "learning_rate": 0.0002456125260171857, "loss": 2.9111, "step": 193 }, { "epoch": 0.52, "grad_norm": 0.17676802172340947, "learning_rate": 0.00024341921136247338, "loss": 2.8857, "step": 194 }, { "epoch": 0.53, "grad_norm": 0.17406205780039047, "learning_rate": 0.00024122640347536885, "loss": 2.8896, "step": 195 }, { "epoch": 0.53, "grad_norm": 0.21596180880748264, "learning_rate": 0.00023903427121770148, "loss": 2.8643, "step": 196 }, { "epoch": 0.53, "grad_norm": 0.2302727016974318, "learning_rate": 0.0002368429833992724, "loss": 2.8936, "step": 197 }, { "epoch": 0.53, "grad_norm": 0.193768899062196, "learning_rate": 0.00023465270876485474, "loss": 2.8496, "step": 198 }, { "epoch": 0.54, "grad_norm": 0.1784013570789268, "learning_rate": 0.0002324636159811991, "loss": 2.8379, "step": 199 }, { "epoch": 0.54, "grad_norm": 0.20783430292995747, "learning_rate": 0.00023027587362404558, "loss": 2.8799, "step": 200 }, { "epoch": 0.54, "grad_norm": 0.20944704580582277, "learning_rate": 0.00022808965016514143, "loss": 2.8965, "step": 201 }, { "epoch": 0.55, "grad_norm": 0.21496187154467147, "learning_rate": 0.00022590511395926811, "loss": 2.8672, "step": 202 }, { "epoch": 0.55, "grad_norm": 0.21633737213344018, "learning_rate": 0.00022372243323127663, "loss": 2.875, "step": 203 }, { "epoch": 0.55, "grad_norm": 0.19865551825102143, "learning_rate": 0.00022154177606313277, "loss": 2.8672, "step": 204 }, { "epoch": 0.55, "grad_norm": 0.18963996378916614, "learning_rate": 0.00021936331038097386, "loss": 2.9023, "step": 205 }, { "epoch": 0.56, "grad_norm": 0.20425004554589565, "learning_rate": 0.00021718720394217725, "loss": 2.8574, "step": 206 }, { "epoch": 0.56, "grad_norm": 0.2555973427913448, "learning_rate": 0.00021501362432244193, "loss": 2.8701, "step": 207 }, { "epoch": 0.56, "grad_norm": 0.1953962081104273, "learning_rate": 0.00021284273890288336, "loss": 2.8779, "step": 208 }, { "epoch": 0.56, "grad_norm": 0.2248900981224502, "learning_rate": 0.00021067471485714472, "loss": 2.8398, "step": 209 }, { "epoch": 0.57, "grad_norm": 0.26993244613823164, "learning_rate": 0.00020850971913852307, "loss": 2.8613, "step": 210 }, { "epoch": 0.57, "grad_norm": 0.2022265920119534, "learning_rate": 0.00020634791846711243, "loss": 2.8574, "step": 211 }, { "epoch": 0.57, "grad_norm": 0.24664106252564721, "learning_rate": 0.00020418947931696575, "loss": 2.8564, "step": 212 }, { "epoch": 0.58, "grad_norm": 0.1967992968536126, "learning_rate": 0.00020203456790327453, "loss": 2.8555, "step": 213 }, { "epoch": 0.58, "grad_norm": 0.22710937800094108, "learning_rate": 0.0001998833501695698, "loss": 2.8594, "step": 214 }, { "epoch": 0.58, "grad_norm": 0.2609592105615878, "learning_rate": 0.00019773599177494265, "loss": 2.8682, "step": 215 }, { "epoch": 0.58, "grad_norm": 0.20377916758575193, "learning_rate": 0.00019559265808128778, "loss": 2.8496, "step": 216 }, { "epoch": 0.59, "grad_norm": 0.2298131342082367, "learning_rate": 0.00019345351414056894, "loss": 2.8398, "step": 217 }, { "epoch": 0.59, "grad_norm": 0.234771024880693, "learning_rate": 0.00019131872468210938, "loss": 2.8467, "step": 218 }, { "epoch": 0.59, "grad_norm": 0.23844110318408976, "learning_rate": 0.00018918845409990592, "loss": 2.917, "step": 219 }, { "epoch": 0.59, "grad_norm": 0.19036660800238525, "learning_rate": 0.00018706286643996997, "loss": 2.8564, "step": 220 }, { "epoch": 0.6, "grad_norm": 0.20622165918252797, "learning_rate": 0.0001849421253876945, "loss": 2.8848, "step": 221 }, { "epoch": 0.6, "grad_norm": 0.1883236349273076, "learning_rate": 0.000182826394255249, "loss": 2.8643, "step": 222 }, { "epoch": 0.6, "grad_norm": 0.22937231306412564, "learning_rate": 0.00018071583596900343, "loss": 2.8623, "step": 223 }, { "epoch": 0.6, "grad_norm": 0.21538020857939882, "learning_rate": 0.00017861061305698205, "loss": 2.8438, "step": 224 }, { "epoch": 0.61, "grad_norm": 0.1918870218278195, "learning_rate": 0.0001765108876363471, "loss": 2.8467, "step": 225 }, { "epoch": 0.61, "grad_norm": 0.21091693539511583, "learning_rate": 0.00017441682140091464, "loss": 2.8281, "step": 226 }, { "epoch": 0.61, "grad_norm": 0.30476893084177187, "learning_rate": 0.0001723285756087034, "loss": 2.873, "step": 227 }, { "epoch": 0.62, "grad_norm": 0.17816221080916358, "learning_rate": 0.0001702463110695166, "loss": 2.8525, "step": 228 }, { "epoch": 0.62, "grad_norm": 0.195493340516346, "learning_rate": 0.00016817018813255796, "loss": 2.8359, "step": 229 }, { "epoch": 0.62, "grad_norm": 0.23565534271641125, "learning_rate": 0.00016610036667408462, "loss": 2.8535, "step": 230 }, { "epoch": 0.62, "grad_norm": 0.2033674101509759, "learning_rate": 0.00016403700608509438, "loss": 2.832, "step": 231 }, { "epoch": 0.63, "grad_norm": 0.1643046553266504, "learning_rate": 0.00016198026525905256, "loss": 2.8525, "step": 232 }, { "epoch": 0.63, "grad_norm": 0.18680871871908325, "learning_rate": 0.00015993030257965536, "loss": 2.8496, "step": 233 }, { "epoch": 0.63, "grad_norm": 0.21879472472240535, "learning_rate": 0.00015788727590863345, "loss": 2.8457, "step": 234 }, { "epoch": 0.63, "grad_norm": 0.189459681090247, "learning_rate": 0.00015585134257359523, "loss": 2.8252, "step": 235 }, { "epoch": 0.64, "grad_norm": 0.20698851721297937, "learning_rate": 0.00015382265935591212, "loss": 2.8545, "step": 236 }, { "epoch": 0.64, "grad_norm": 0.21230475441950047, "learning_rate": 0.00015180138247864445, "loss": 2.8682, "step": 237 }, { "epoch": 0.64, "grad_norm": 0.2170131840363288, "learning_rate": 0.00014978766759451187, "loss": 2.835, "step": 238 }, { "epoch": 0.65, "grad_norm": 0.1747044380388941, "learning_rate": 0.00014778166977390643, "loss": 2.8193, "step": 239 }, { "epoch": 0.65, "grad_norm": 0.2195091552174841, "learning_rate": 0.00014578354349295138, "loss": 2.8428, "step": 240 }, { "epoch": 0.65, "grad_norm": 0.17839712623841394, "learning_rate": 0.0001437934426216056, "loss": 2.8496, "step": 241 }, { "epoch": 0.65, "grad_norm": 0.2218268901944312, "learning_rate": 0.0001418115204118139, "loss": 2.8379, "step": 242 }, { "epoch": 0.66, "grad_norm": 0.22291238199967986, "learning_rate": 0.00013983792948570606, "loss": 2.8418, "step": 243 }, { "epoch": 0.66, "grad_norm": 0.2314705382253311, "learning_rate": 0.00013787282182384363, "loss": 2.835, "step": 244 }, { "epoch": 0.66, "grad_norm": 0.2062566807258234, "learning_rate": 0.0001359163487535164, "loss": 2.8174, "step": 245 }, { "epoch": 0.66, "grad_norm": 0.20906998538613936, "learning_rate": 0.00013396866093708898, "loss": 2.8184, "step": 246 }, { "epoch": 0.67, "grad_norm": 0.2687448764467888, "learning_rate": 0.0001320299083603992, "loss": 2.835, "step": 247 }, { "epoch": 0.67, "grad_norm": 0.1990426441612401, "learning_rate": 0.00013010024032120718, "loss": 2.877, "step": 248 }, { "epoch": 0.67, "grad_norm": 0.20269787106945644, "learning_rate": 0.00012817980541769935, "loss": 2.8564, "step": 249 }, { "epoch": 0.68, "grad_norm": 0.19805581453093807, "learning_rate": 0.000126268751537045, "loss": 2.8447, "step": 250 }, { "epoch": 0.68, "grad_norm": 0.204837890537741, "learning_rate": 0.00012436722584400751, "loss": 2.8418, "step": 251 }, { "epoch": 0.68, "grad_norm": 0.169050938877304, "learning_rate": 0.00012247537476961214, "loss": 2.8311, "step": 252 }, { "epoch": 0.68, "grad_norm": 0.21928573521752834, "learning_rate": 0.00012059334399986949, "loss": 2.8066, "step": 253 }, { "epoch": 0.69, "grad_norm": 0.18055284433976476, "learning_rate": 0.00011872127846455672, "loss": 2.8125, "step": 254 }, { "epoch": 0.69, "grad_norm": 0.20659092646221383, "learning_rate": 0.00011685932232605718, "loss": 2.8066, "step": 255 }, { "epoch": 0.69, "grad_norm": 0.20299552954457484, "learning_rate": 0.00011500761896825842, "loss": 2.8398, "step": 256 }, { "epoch": 0.69, "grad_norm": 0.21714725029231918, "learning_rate": 0.00011316631098551067, "loss": 2.832, "step": 257 }, { "epoch": 0.7, "grad_norm": 0.21963153956688958, "learning_rate": 0.00011133554017164671, "loss": 2.8193, "step": 258 }, { "epoch": 0.7, "grad_norm": 0.19726411127227286, "learning_rate": 0.00010951544750906181, "loss": 2.8213, "step": 259 }, { "epoch": 0.7, "grad_norm": 0.21949146356088167, "learning_rate": 0.00010770617315785766, "loss": 2.8418, "step": 260 }, { "epoch": 0.7, "grad_norm": 0.2126404293875657, "learning_rate": 0.00010590785644504919, "loss": 2.833, "step": 261 }, { "epoch": 0.71, "grad_norm": 0.2071663035323768, "learning_rate": 0.00010412063585383452, "loss": 2.8174, "step": 262 }, { "epoch": 0.71, "grad_norm": 0.222005661427221, "learning_rate": 0.00010234464901293173, "loss": 2.8076, "step": 263 }, { "epoch": 0.71, "grad_norm": 0.2233059412032165, "learning_rate": 0.00010058003268598015, "loss": 2.8389, "step": 264 }, { "epoch": 0.72, "grad_norm": 0.20234119110348878, "learning_rate": 9.882692276100841e-05, "loss": 2.8242, "step": 265 }, { "epoch": 0.72, "grad_norm": 0.22629912766899482, "learning_rate": 9.70854542399702e-05, "loss": 2.8232, "step": 266 }, { "epoch": 0.72, "grad_norm": 0.19123353391124664, "learning_rate": 9.535576122834819e-05, "loss": 2.8418, "step": 267 }, { "epoch": 0.72, "grad_norm": 0.20149824977959827, "learning_rate": 9.363797692482695e-05, "loss": 2.8115, "step": 268 }, { "epoch": 0.73, "grad_norm": 0.17961635360231348, "learning_rate": 9.193223361103556e-05, "loss": 2.8291, "step": 269 }, { "epoch": 0.73, "grad_norm": 0.2121856691428259, "learning_rate": 9.023866264136154e-05, "loss": 2.8457, "step": 270 }, { "epoch": 0.73, "grad_norm": 0.18851239383885068, "learning_rate": 8.85573944328345e-05, "loss": 2.7979, "step": 271 }, { "epoch": 0.73, "grad_norm": 0.17521924172534067, "learning_rate": 8.688855845508442e-05, "loss": 2.8086, "step": 272 }, { "epoch": 0.74, "grad_norm": 0.18640071329178245, "learning_rate": 8.523228322037063e-05, "loss": 2.8369, "step": 273 }, { "epoch": 0.74, "grad_norm": 0.16959390971032334, "learning_rate": 8.358869627368584e-05, "loss": 2.8398, "step": 274 }, { "epoch": 0.74, "grad_norm": 0.21111788917507904, "learning_rate": 8.195792418293416e-05, "loss": 2.7852, "step": 275 }, { "epoch": 0.75, "grad_norm": 0.17179812719144402, "learning_rate": 8.03400925291845e-05, "loss": 2.8115, "step": 276 }, { "epoch": 0.75, "grad_norm": 0.20491937385204168, "learning_rate": 7.873532589699988e-05, "loss": 2.791, "step": 277 }, { "epoch": 0.75, "grad_norm": 0.16357238878820704, "learning_rate": 7.71437478648438e-05, "loss": 2.8291, "step": 278 }, { "epoch": 0.75, "grad_norm": 0.1835990750271033, "learning_rate": 7.556548099556338e-05, "loss": 2.7998, "step": 279 }, { "epoch": 0.76, "grad_norm": 0.1907507274896681, "learning_rate": 7.40006468269514e-05, "loss": 2.8213, "step": 280 }, { "epoch": 0.76, "grad_norm": 0.18564884527181547, "learning_rate": 7.244936586238704e-05, "loss": 2.8301, "step": 281 }, { "epoch": 0.76, "grad_norm": 0.17700032167669388, "learning_rate": 7.091175756155618e-05, "loss": 2.8115, "step": 282 }, { "epoch": 0.76, "grad_norm": 0.16847559163220463, "learning_rate": 6.938794033125225e-05, "loss": 2.833, "step": 283 }, { "epoch": 0.77, "grad_norm": 0.18723560047102772, "learning_rate": 6.787803151625791e-05, "loss": 2.8281, "step": 284 }, { "epoch": 0.77, "grad_norm": 0.18633675001733044, "learning_rate": 6.63821473903087e-05, "loss": 2.8184, "step": 285 }, { "epoch": 0.77, "grad_norm": 0.1822056204472567, "learning_rate": 6.490040314713952e-05, "loss": 2.834, "step": 286 }, { "epoch": 0.78, "grad_norm": 0.19871618333174051, "learning_rate": 6.343291289161316e-05, "loss": 2.8174, "step": 287 }, { "epoch": 0.78, "grad_norm": 0.20546173536543255, "learning_rate": 6.197978963093401e-05, "loss": 2.8242, "step": 288 }, { "epoch": 0.78, "grad_norm": 0.19374612820702158, "learning_rate": 6.054114526594548e-05, "loss": 2.835, "step": 289 }, { "epoch": 0.78, "grad_norm": 0.18924589658433175, "learning_rate": 5.911709058251283e-05, "loss": 2.7988, "step": 290 }, { "epoch": 0.79, "grad_norm": 0.22644302618589385, "learning_rate": 5.770773524299192e-05, "loss": 2.8525, "step": 291 }, { "epoch": 0.79, "grad_norm": 0.18156812551515011, "learning_rate": 5.631318777778452e-05, "loss": 2.8047, "step": 292 }, { "epoch": 0.79, "grad_norm": 0.19435635715822475, "learning_rate": 5.493355557698029e-05, "loss": 2.8203, "step": 293 }, { "epoch": 0.79, "grad_norm": 0.20414237163125332, "learning_rate": 5.3568944882087604e-05, "loss": 2.8369, "step": 294 }, { "epoch": 0.8, "grad_norm": 0.18567184675458911, "learning_rate": 5.2219460777851695e-05, "loss": 2.8398, "step": 295 }, { "epoch": 0.8, "grad_norm": 0.1783969629345075, "learning_rate": 5.088520718416254e-05, "loss": 2.8057, "step": 296 }, { "epoch": 0.8, "grad_norm": 0.18247845423738968, "learning_rate": 4.9566286848052363e-05, "loss": 2.835, "step": 297 }, { "epoch": 0.8, "grad_norm": 0.18785911029833752, "learning_rate": 4.826280133578323e-05, "loss": 2.8242, "step": 298 }, { "epoch": 0.81, "grad_norm": 0.18953403976774194, "learning_rate": 4.697485102502588e-05, "loss": 2.8252, "step": 299 }, { "epoch": 0.81, "grad_norm": 0.18190067624900982, "learning_rate": 4.5702535097129906e-05, "loss": 2.835, "step": 300 }, { "epoch": 0.81, "grad_norm": 0.20251912717942616, "learning_rate": 4.444595152948605e-05, "loss": 2.8242, "step": 301 }, { "epoch": 0.82, "grad_norm": 0.18960132345131622, "learning_rate": 4.32051970879809e-05, "loss": 2.8418, "step": 302 }, { "epoch": 0.82, "grad_norm": 0.20472286168933038, "learning_rate": 4.198036731954605e-05, "loss": 2.79, "step": 303 }, { "epoch": 0.82, "grad_norm": 0.1885255817179192, "learning_rate": 4.077155654479955e-05, "loss": 2.8076, "step": 304 }, { "epoch": 0.82, "grad_norm": 0.17197377072124265, "learning_rate": 3.957885785078277e-05, "loss": 2.8125, "step": 305 }, { "epoch": 0.83, "grad_norm": 0.17243813839090957, "learning_rate": 3.840236308379233e-05, "loss": 2.7998, "step": 306 }, { "epoch": 0.83, "grad_norm": 0.187479341506351, "learning_rate": 3.7242162842306724e-05, "loss": 2.8076, "step": 307 }, { "epoch": 0.83, "grad_norm": 0.19619960697754343, "learning_rate": 3.609834647001001e-05, "loss": 2.8125, "step": 308 }, { "epoch": 0.83, "grad_norm": 0.1969308587109652, "learning_rate": 3.497100204891177e-05, "loss": 2.7891, "step": 309 }, { "epoch": 0.84, "grad_norm": 0.1856744385771311, "learning_rate": 3.386021639256373e-05, "loss": 2.8076, "step": 310 }, { "epoch": 0.84, "grad_norm": 0.19472326340578114, "learning_rate": 3.2766075039374926e-05, "loss": 2.8135, "step": 311 }, { "epoch": 0.84, "grad_norm": 0.16977810395348839, "learning_rate": 3.168866224602443e-05, "loss": 2.8008, "step": 312 }, { "epoch": 0.85, "grad_norm": 0.16737273719653672, "learning_rate": 3.0628060980972956e-05, "loss": 2.8096, "step": 313 }, { "epoch": 0.85, "grad_norm": 0.18706543053983507, "learning_rate": 2.958435291807396e-05, "loss": 2.8223, "step": 314 }, { "epoch": 0.85, "grad_norm": 0.20138299122195422, "learning_rate": 2.8557618430283845e-05, "loss": 2.8145, "step": 315 }, { "epoch": 0.85, "grad_norm": 0.1825577738379651, "learning_rate": 2.7547936583472682e-05, "loss": 2.8135, "step": 316 }, { "epoch": 0.86, "grad_norm": 0.1681442717695743, "learning_rate": 2.655538513033598e-05, "loss": 2.8174, "step": 317 }, { "epoch": 0.86, "grad_norm": 0.1886334015299787, "learning_rate": 2.5580040504406736e-05, "loss": 2.8105, "step": 318 }, { "epoch": 0.86, "grad_norm": 0.18472284304096284, "learning_rate": 2.4621977814169668e-05, "loss": 2.8076, "step": 319 }, { "epoch": 0.86, "grad_norm": 0.16272769219517882, "learning_rate": 2.3681270837277365e-05, "loss": 2.8174, "step": 320 }, { "epoch": 0.87, "grad_norm": 0.19968898028239582, "learning_rate": 2.2757992014868806e-05, "loss": 2.8242, "step": 321 }, { "epoch": 0.87, "grad_norm": 0.17043048449106335, "learning_rate": 2.1852212445990894e-05, "loss": 2.8135, "step": 322 }, { "epoch": 0.87, "grad_norm": 0.1644894369897433, "learning_rate": 2.0964001882123466e-05, "loss": 2.7998, "step": 323 }, { "epoch": 0.88, "grad_norm": 0.16881143272811544, "learning_rate": 2.0093428721807694e-05, "loss": 2.8252, "step": 324 }, { "epoch": 0.88, "grad_norm": 0.16555621080410832, "learning_rate": 1.924056000537905e-05, "loss": 2.7939, "step": 325 }, { "epoch": 0.88, "grad_norm": 0.16784278084821697, "learning_rate": 1.8405461409804676e-05, "loss": 2.8242, "step": 326 }, { "epoch": 0.88, "grad_norm": 0.1641280362316012, "learning_rate": 1.7588197243625816e-05, "loss": 2.8223, "step": 327 }, { "epoch": 0.89, "grad_norm": 0.17345327239016983, "learning_rate": 1.6788830442005742e-05, "loss": 2.835, "step": 328 }, { "epoch": 0.89, "grad_norm": 0.1678603553402573, "learning_rate": 1.600742256188287e-05, "loss": 2.8623, "step": 329 }, { "epoch": 0.89, "grad_norm": 0.17995413273432045, "learning_rate": 1.5244033777230909e-05, "loss": 2.7979, "step": 330 }, { "epoch": 0.89, "grad_norm": 0.17259350839299145, "learning_rate": 1.4498722874424859e-05, "loss": 2.8105, "step": 331 }, { "epoch": 0.9, "grad_norm": 0.1741787680290243, "learning_rate": 1.3771547247713956e-05, "loss": 2.7979, "step": 332 }, { "epoch": 0.9, "grad_norm": 0.1654260393599542, "learning_rate": 1.3062562894801955e-05, "loss": 2.7998, "step": 333 }, { "epoch": 0.9, "grad_norm": 0.15975325006455904, "learning_rate": 1.237182441253501e-05, "loss": 2.8076, "step": 334 }, { "epoch": 0.9, "grad_norm": 0.1831835291092947, "learning_rate": 1.169938499269721e-05, "loss": 2.833, "step": 335 }, { "epoch": 0.91, "grad_norm": 0.17054992749801087, "learning_rate": 1.1045296417914408e-05, "loss": 2.8125, "step": 336 }, { "epoch": 0.91, "grad_norm": 0.1914560652459072, "learning_rate": 1.040960905766683e-05, "loss": 2.8174, "step": 337 }, { "epoch": 0.91, "grad_norm": 0.16125789541336907, "learning_rate": 9.792371864409844e-06, "loss": 2.8145, "step": 338 }, { "epoch": 0.92, "grad_norm": 0.18343996045358885, "learning_rate": 9.19363236980475e-06, "loss": 2.8105, "step": 339 }, { "epoch": 0.92, "grad_norm": 0.19474642229296382, "learning_rate": 8.613436681058128e-06, "loss": 2.7979, "step": 340 }, { "epoch": 0.92, "grad_norm": 0.1541708717690255, "learning_rate": 8.051829477371458e-06, "loss": 2.8076, "step": 341 }, { "epoch": 0.92, "grad_norm": 0.15274058388659703, "learning_rate": 7.508854006500393e-06, "loss": 2.8057, "step": 342 }, { "epoch": 0.93, "grad_norm": 0.1778349122094768, "learning_rate": 6.984552081424456e-06, "loss": 2.8252, "step": 343 }, { "epoch": 0.93, "grad_norm": 0.18535400361130824, "learning_rate": 6.4789640771270864e-06, "loss": 2.8506, "step": 344 }, { "epoch": 0.93, "grad_norm": 0.17517598706340834, "learning_rate": 5.992128927486573e-06, "loss": 2.8047, "step": 345 }, { "epoch": 0.93, "grad_norm": 0.1594767650558485, "learning_rate": 5.524084122277756e-06, "loss": 2.7871, "step": 346 }, { "epoch": 0.94, "grad_norm": 0.17674478758195836, "learning_rate": 5.074865704284975e-06, "loss": 2.8242, "step": 347 }, { "epoch": 0.94, "grad_norm": 0.1795205762704063, "learning_rate": 4.644508266526848e-06, "loss": 2.8037, "step": 348 }, { "epoch": 0.94, "grad_norm": 0.16428283866890742, "learning_rate": 4.23304494959198e-06, "loss": 2.7852, "step": 349 }, { "epoch": 0.95, "grad_norm": 0.1637667775135852, "learning_rate": 3.840507439087093e-06, "loss": 2.8252, "step": 350 }, { "epoch": 0.95, "grad_norm": 0.1904128210839155, "learning_rate": 3.4669259631970873e-06, "loss": 2.8047, "step": 351 }, { "epoch": 0.95, "grad_norm": 0.17893501780149162, "learning_rate": 3.1123292903569854e-06, "loss": 2.8115, "step": 352 }, { "epoch": 0.95, "grad_norm": 0.17860019081470324, "learning_rate": 2.7767447270368453e-06, "loss": 2.8184, "step": 353 }, { "epoch": 0.96, "grad_norm": 0.16537005007226285, "learning_rate": 2.4601981156388566e-06, "loss": 2.7988, "step": 354 }, { "epoch": 0.96, "grad_norm": 0.1482593636447568, "learning_rate": 2.162713832507157e-06, "loss": 2.7979, "step": 355 }, { "epoch": 0.96, "grad_norm": 0.17394817869790966, "learning_rate": 1.8843147860509445e-06, "loss": 2.7988, "step": 356 }, { "epoch": 0.96, "grad_norm": 0.1728142685313597, "learning_rate": 1.625022414980165e-06, "loss": 2.792, "step": 357 }, { "epoch": 0.97, "grad_norm": 0.17603370515257444, "learning_rate": 1.3848566866545842e-06, "loss": 2.7998, "step": 358 }, { "epoch": 0.97, "grad_norm": 0.16591885986335803, "learning_rate": 1.1638360955463222e-06, "loss": 2.8135, "step": 359 }, { "epoch": 0.97, "grad_norm": 0.17699991948505095, "learning_rate": 9.619776618154386e-07, "loss": 2.8076, "step": 360 }, { "epoch": 0.98, "grad_norm": 0.1690937483022535, "learning_rate": 7.792969299994512e-07, "loss": 2.8438, "step": 361 }, { "epoch": 0.98, "grad_norm": 0.18098457876096985, "learning_rate": 6.158079678161566e-07, "loss": 2.8408, "step": 362 }, { "epoch": 0.98, "grad_norm": 0.18213680457305967, "learning_rate": 4.7152336508038407e-07, "loss": 2.8311, "step": 363 }, { "epoch": 0.98, "grad_norm": 0.16488850320672738, "learning_rate": 3.464542327344111e-07, "loss": 2.7949, "step": 364 }, { "epoch": 0.99, "grad_norm": 0.1606825195560946, "learning_rate": 2.406102019924805e-07, "loss": 2.8154, "step": 365 }, { "epoch": 0.99, "grad_norm": 0.1743249393559749, "learning_rate": 1.5399942359897723e-07, "loss": 2.8174, "step": 366 }, { "epoch": 0.99, "grad_norm": 0.1890660779935612, "learning_rate": 8.662856720090261e-08, "loss": 2.8359, "step": 367 }, { "epoch": 0.99, "grad_norm": 0.17906338951545656, "learning_rate": 3.8502820834174226e-08, "loss": 2.7842, "step": 368 }, { "epoch": 1.0, "grad_norm": 0.189945263964402, "learning_rate": 9.625890524084246e-09, "loss": 2.8037, "step": 369 }, { "epoch": 1.0, "grad_norm": 0.16386427942822598, "learning_rate": 0.0, "loss": 2.8037, "step": 370 }, { "epoch": 1.0, "step": 370, "total_flos": 2.715544613577294e+18, "train_loss": 3.01240234375, "train_runtime": 27753.3023, "train_samples_per_second": 27.316, "train_steps_per_second": 0.013 } ], "logging_steps": 1.0, "max_steps": 370, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 2.715544613577294e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }