diff --git "a/checkpoint-7000/trainer_state.json" "b/checkpoint-7000/trainer_state.json" --- "a/checkpoint-7000/trainer_state.json" +++ "b/checkpoint-7000/trainer_state.json" @@ -11,11208 +11,11208 @@ "log_history": [ { "epoch": 0.0013582342954159593, - "grad_norm": 2.606816053390503, + "grad_norm": 2.6726086139678955, "learning_rate": 4.999998381340316e-05, - "loss": 3.5266, - "num_input_tokens_seen": 40960, + "loss": 3.4614, + "num_input_tokens_seen": 41696, "step": 5 }, { "epoch": 0.0027164685908319186, - "grad_norm": 1.7451986074447632, + "grad_norm": 2.4156033992767334, "learning_rate": 4.999991805538943e-05, - "loss": 2.4357, - "num_input_tokens_seen": 85344, + "loss": 2.428, + "num_input_tokens_seen": 86288, "step": 10 }, { "epoch": 0.0040747028862478775, - "grad_norm": 2.419678211212158, + "grad_norm": 2.0973830223083496, "learning_rate": 4.999980171442944e-05, - "loss": 2.0181, - "num_input_tokens_seen": 118864, + "loss": 1.9754, + "num_input_tokens_seen": 119888, "step": 15 }, { "epoch": 0.005432937181663837, - "grad_norm": 1.4648441076278687, + "grad_norm": 1.3947581052780151, "learning_rate": 4.999963479075859e-05, - "loss": 1.7434, - "num_input_tokens_seen": 155888, + "loss": 1.6776, + "num_input_tokens_seen": 157616, "step": 20 }, { "epoch": 0.006791171477079796, - "grad_norm": 1.6045117378234863, + "grad_norm": 1.5086157321929932, "learning_rate": 4.999941728471462e-05, - "loss": 1.3569, - "num_input_tokens_seen": 200576, + "loss": 1.3285, + "num_input_tokens_seen": 202592, "step": 25 }, { "epoch": 0.008149405772495755, - "grad_norm": 1.615235447883606, + "grad_norm": 1.5990394353866577, "learning_rate": 4.999914919673763e-05, - "loss": 1.1605, - "num_input_tokens_seen": 240160, + "loss": 1.139, + "num_input_tokens_seen": 242576, "step": 30 }, { "epoch": 0.009507640067911714, - "grad_norm": 1.691972017288208, + "grad_norm": 1.596232295036316, "learning_rate": 4.999883052737003e-05, - "loss": 1.2038, - "num_input_tokens_seen": 273984, + "loss": 1.1718, + "num_input_tokens_seen": 276912, "step": 35 }, { "epoch": 0.010865874363327675, - "grad_norm": 1.8017555475234985, + "grad_norm": 1.7790018320083618, "learning_rate": 4.999846127725661e-05, - "loss": 0.9813, - "num_input_tokens_seen": 307904, + "loss": 0.9303, + "num_input_tokens_seen": 311232, "step": 40 }, { "epoch": 0.012224108658743633, - "grad_norm": 1.4876317977905273, + "grad_norm": 1.4563114643096924, "learning_rate": 4.999804144714446e-05, - "loss": 0.8857, - "num_input_tokens_seen": 345312, + "loss": 0.8536, + "num_input_tokens_seen": 348464, "step": 45 }, { "epoch": 0.013582342954159592, - "grad_norm": 1.6874938011169434, + "grad_norm": 1.6613279581069946, "learning_rate": 4.999757103788305e-05, - "loss": 0.7422, - "num_input_tokens_seen": 381936, + "loss": 0.7033, + "num_input_tokens_seen": 385856, "step": 50 }, { "epoch": 0.014940577249575551, - "grad_norm": 1.6182847023010254, + "grad_norm": 1.6354061365127563, "learning_rate": 4.999705005042417e-05, - "loss": 0.6914, - "num_input_tokens_seen": 416704, + "loss": 0.6725, + "num_input_tokens_seen": 421264, "step": 55 }, { "epoch": 0.01629881154499151, - "grad_norm": 1.5605204105377197, + "grad_norm": 1.879230260848999, "learning_rate": 4.999647848582194e-05, - "loss": 0.6277, - "num_input_tokens_seen": 455920, + "loss": 0.6176, + "num_input_tokens_seen": 460544, "step": 60 }, { "epoch": 0.01765704584040747, - "grad_norm": 2.5119259357452393, + "grad_norm": 2.0756216049194336, "learning_rate": 4.9995856345232814e-05, - "loss": 0.5642, - "num_input_tokens_seen": 488768, + "loss": 0.535, + "num_input_tokens_seen": 493632, "step": 65 }, { "epoch": 0.019015280135823428, - "grad_norm": 2.1353073120117188, + "grad_norm": 2.511418104171753, "learning_rate": 4.9995183629915606e-05, - "loss": 0.4256, - "num_input_tokens_seen": 523120, + "loss": 0.4311, + "num_input_tokens_seen": 528384, "step": 70 }, { "epoch": 0.02037351443123939, - "grad_norm": 1.7133029699325562, + "grad_norm": 1.5566009283065796, "learning_rate": 4.999446034123141e-05, - "loss": 0.4022, - "num_input_tokens_seen": 557408, + "loss": 0.388, + "num_input_tokens_seen": 562848, "step": 75 }, { "epoch": 0.02173174872665535, - "grad_norm": 1.813982367515564, + "grad_norm": 1.6081215143203735, "learning_rate": 4.99936864806437e-05, - "loss": 0.37, - "num_input_tokens_seen": 592544, + "loss": 0.3677, + "num_input_tokens_seen": 598128, "step": 80 }, { "epoch": 0.023089983022071308, - "grad_norm": 1.61416757106781, + "grad_norm": 1.6574671268463135, "learning_rate": 4.999286204971823e-05, - "loss": 0.3527, - "num_input_tokens_seen": 626096, + "loss": 0.3417, + "num_input_tokens_seen": 631920, "step": 85 }, { "epoch": 0.024448217317487267, - "grad_norm": 1.4342936277389526, + "grad_norm": 1.4221380949020386, "learning_rate": 4.99919870501231e-05, - "loss": 0.3161, - "num_input_tokens_seen": 660896, + "loss": 0.3047, + "num_input_tokens_seen": 667632, "step": 90 }, { "epoch": 0.025806451612903226, - "grad_norm": 1.4241148233413696, + "grad_norm": 1.443021297454834, "learning_rate": 4.999106148362871e-05, - "loss": 0.3546, - "num_input_tokens_seen": 698672, + "loss": 0.3368, + "num_input_tokens_seen": 705600, "step": 95 }, { "epoch": 0.027164685908319185, - "grad_norm": 1.3029650449752808, + "grad_norm": 1.3677445650100708, "learning_rate": 4.999008535210779e-05, - "loss": 0.2578, - "num_input_tokens_seen": 739616, + "loss": 0.2455, + "num_input_tokens_seen": 746880, "step": 100 }, { "epoch": 0.028522920203735144, - "grad_norm": 2.0215160846710205, + "grad_norm": 1.9557136297225952, "learning_rate": 4.998905865753536e-05, - "loss": 0.3829, - "num_input_tokens_seen": 776336, + "loss": 0.3735, + "num_input_tokens_seen": 783888, "step": 105 }, { "epoch": 0.029881154499151102, - "grad_norm": 1.4769304990768433, + "grad_norm": 1.343113899230957, "learning_rate": 4.998798140198877e-05, - "loss": 0.2823, - "num_input_tokens_seen": 819152, + "loss": 0.2601, + "num_input_tokens_seen": 826992, "step": 110 }, { "epoch": 0.03123938879456706, - "grad_norm": 1.9062002897262573, + "grad_norm": 1.6134370565414429, "learning_rate": 4.9986853587647646e-05, - "loss": 0.2595, - "num_input_tokens_seen": 858960, + "loss": 0.2492, + "num_input_tokens_seen": 866336, "step": 115 }, { "epoch": 0.03259762308998302, - "grad_norm": 1.2287719249725342, + "grad_norm": 1.2145836353302002, "learning_rate": 4.998567521679393e-05, - "loss": 0.2371, - "num_input_tokens_seen": 894560, + "loss": 0.2244, + "num_input_tokens_seen": 902256, "step": 120 }, { "epoch": 0.03395585738539898, - "grad_norm": 1.337752342224121, + "grad_norm": 1.2940493822097778, "learning_rate": 4.9984446291811844e-05, - "loss": 0.2071, - "num_input_tokens_seen": 932736, + "loss": 0.2024, + "num_input_tokens_seen": 940224, "step": 125 }, { "epoch": 0.03531409168081494, - "grad_norm": 1.3839318752288818, + "grad_norm": 1.3970698118209839, "learning_rate": 4.99831668151879e-05, - "loss": 0.1999, - "num_input_tokens_seen": 971056, + "loss": 0.1904, + "num_input_tokens_seen": 979280, "step": 130 }, { "epoch": 0.0366723259762309, - "grad_norm": 1.385371208190918, + "grad_norm": 1.2771857976913452, "learning_rate": 4.998183678951091e-05, - "loss": 0.1769, - "num_input_tokens_seen": 1006080, + "loss": 0.1724, + "num_input_tokens_seen": 1015072, "step": 135 }, { "epoch": 0.038030560271646856, - "grad_norm": 1.1983788013458252, + "grad_norm": 1.3085401058197021, "learning_rate": 4.998045621747192e-05, - "loss": 0.2025, - "num_input_tokens_seen": 1045248, + "loss": 0.1925, + "num_input_tokens_seen": 1054832, "step": 140 }, { "epoch": 0.03938879456706282, - "grad_norm": 1.291953206062317, + "grad_norm": 1.2101911306381226, "learning_rate": 4.9979025101864296e-05, - "loss": 0.2576, - "num_input_tokens_seen": 1081104, + "loss": 0.2487, + "num_input_tokens_seen": 1091232, "step": 145 }, { "epoch": 0.04074702886247878, - "grad_norm": 0.9747681021690369, + "grad_norm": 0.9078947305679321, "learning_rate": 4.9977543445583645e-05, - "loss": 0.1885, - "num_input_tokens_seen": 1117712, + "loss": 0.1795, + "num_input_tokens_seen": 1128288, "step": 150 }, { "epoch": 0.042105263157894736, - "grad_norm": 1.2783719301223755, + "grad_norm": 1.3068333864212036, "learning_rate": 4.997601125162784e-05, - "loss": 0.2265, - "num_input_tokens_seen": 1154480, + "loss": 0.2159, + "num_input_tokens_seen": 1165296, "step": 155 }, { "epoch": 0.0434634974533107, - "grad_norm": 1.0684447288513184, + "grad_norm": 1.281665563583374, "learning_rate": 4.9974428523097e-05, - "loss": 0.1809, - "num_input_tokens_seen": 1194928, + "loss": 0.177, + "num_input_tokens_seen": 1205840, "step": 160 }, { "epoch": 0.044821731748726654, - "grad_norm": 1.4132447242736816, + "grad_norm": 1.2974933385849, "learning_rate": 4.99727952631935e-05, - "loss": 0.1772, - "num_input_tokens_seen": 1230448, + "loss": 0.1771, + "num_input_tokens_seen": 1241680, "step": 165 }, { "epoch": 0.046179966044142616, - "grad_norm": 1.1107892990112305, + "grad_norm": 1.071578860282898, "learning_rate": 4.9971111475221964e-05, - "loss": 0.2307, - "num_input_tokens_seen": 1262688, + "loss": 0.2215, + "num_input_tokens_seen": 1274592, "step": 170 }, { "epoch": 0.04753820033955857, - "grad_norm": 1.2495514154434204, + "grad_norm": 0.9618109464645386, "learning_rate": 4.996937716258923e-05, - "loss": 0.1199, - "num_input_tokens_seen": 1302720, + "loss": 0.1116, + "num_input_tokens_seen": 1314528, "step": 175 }, { "epoch": 0.048896434634974534, - "grad_norm": 1.3853334188461304, + "grad_norm": 1.127852439880371, "learning_rate": 4.996759232880438e-05, - "loss": 0.1901, - "num_input_tokens_seen": 1335632, + "loss": 0.1778, + "num_input_tokens_seen": 1347296, "step": 180 }, { "epoch": 0.05025466893039049, - "grad_norm": 1.2079516649246216, + "grad_norm": 1.2801867723464966, "learning_rate": 4.9965756977478706e-05, - "loss": 0.1584, - "num_input_tokens_seen": 1372992, + "loss": 0.1453, + "num_input_tokens_seen": 1385712, "step": 185 }, { "epoch": 0.05161290322580645, - "grad_norm": 1.8734357357025146, + "grad_norm": 1.7468706369400024, "learning_rate": 4.9963871112325736e-05, - "loss": 0.1891, - "num_input_tokens_seen": 1406720, + "loss": 0.1776, + "num_input_tokens_seen": 1419696, "step": 190 }, { "epoch": 0.052971137521222414, - "grad_norm": 0.8659824132919312, + "grad_norm": 0.9542202949523926, "learning_rate": 4.9961934737161166e-05, - "loss": 0.1057, - "num_input_tokens_seen": 1446016, + "loss": 0.1111, + "num_input_tokens_seen": 1458832, "step": 195 }, { "epoch": 0.05432937181663837, - "grad_norm": 1.4822900295257568, + "grad_norm": 1.0922791957855225, "learning_rate": 4.9959947855902925e-05, - "loss": 0.1636, - "num_input_tokens_seen": 1479008, + "loss": 0.1545, + "num_input_tokens_seen": 1492048, "step": 200 }, { "epoch": 0.05568760611205433, - "grad_norm": 1.0720332860946655, + "grad_norm": 0.7568637132644653, "learning_rate": 4.995791047257111e-05, - "loss": 0.1323, - "num_input_tokens_seen": 1520240, + "loss": 0.1278, + "num_input_tokens_seen": 1533392, "step": 205 }, { "epoch": 0.05704584040747029, - "grad_norm": 0.7066819071769714, + "grad_norm": 0.9932854771614075, "learning_rate": 4.995582259128801e-05, - "loss": 0.1317, - "num_input_tokens_seen": 1556384, + "loss": 0.1165, + "num_input_tokens_seen": 1570048, "step": 210 }, { "epoch": 0.05840407470288625, - "grad_norm": 0.752334713935852, + "grad_norm": 0.8142542839050293, "learning_rate": 4.9953684216278105e-05, - "loss": 0.1565, - "num_input_tokens_seen": 1595008, + "loss": 0.1444, + "num_input_tokens_seen": 1609088, "step": 215 }, { "epoch": 0.059762308998302205, - "grad_norm": 1.5184067487716675, + "grad_norm": 1.3339431285858154, "learning_rate": 4.9951495351867994e-05, - "loss": 0.1257, - "num_input_tokens_seen": 1632624, + "loss": 0.122, + "num_input_tokens_seen": 1646784, "step": 220 }, { "epoch": 0.06112054329371817, - "grad_norm": 1.2833807468414307, + "grad_norm": 1.2272486686706543, "learning_rate": 4.9949256002486475e-05, - "loss": 0.1366, - "num_input_tokens_seen": 1667216, + "loss": 0.1306, + "num_input_tokens_seen": 1681520, "step": 225 }, { "epoch": 0.06247877758913412, - "grad_norm": 0.8973982930183411, + "grad_norm": 0.8533628582954407, "learning_rate": 4.9946966172664476e-05, - "loss": 0.1383, - "num_input_tokens_seen": 1707232, + "loss": 0.133, + "num_input_tokens_seen": 1722272, "step": 230 }, { "epoch": 0.06383701188455009, - "grad_norm": 0.7995915412902832, + "grad_norm": 0.9362676739692688, "learning_rate": 4.9944625867035066e-05, - "loss": 0.0788, - "num_input_tokens_seen": 1742432, + "loss": 0.0751, + "num_input_tokens_seen": 1758224, "step": 235 }, { "epoch": 0.06519524617996604, - "grad_norm": 1.3337289094924927, + "grad_norm": 1.1582828760147095, "learning_rate": 4.994223509033344e-05, - "loss": 0.1311, - "num_input_tokens_seen": 1779312, + "loss": 0.117, + "num_input_tokens_seen": 1795328, "step": 240 }, { "epoch": 0.06655348047538201, - "grad_norm": 1.2932195663452148, + "grad_norm": 1.326873540878296, "learning_rate": 4.993979384739692e-05, - "loss": 0.1214, - "num_input_tokens_seen": 1819024, + "loss": 0.114, + "num_input_tokens_seen": 1835360, "step": 245 }, { "epoch": 0.06791171477079797, - "grad_norm": 1.1224360466003418, + "grad_norm": 0.705991268157959, "learning_rate": 4.9937302143164925e-05, - "loss": 0.1177, - "num_input_tokens_seen": 1862608, + "loss": 0.1058, + "num_input_tokens_seen": 1878944, "step": 250 }, { "epoch": 0.06926994906621392, - "grad_norm": 0.9420576095581055, + "grad_norm": 1.3264819383621216, "learning_rate": 4.993475998267899e-05, - "loss": 0.088, - "num_input_tokens_seen": 1898032, + "loss": 0.0829, + "num_input_tokens_seen": 1914416, "step": 255 }, { "epoch": 0.07062818336162988, - "grad_norm": 0.6910549402236938, + "grad_norm": 0.7507390379905701, "learning_rate": 4.993216737108273e-05, - "loss": 0.0944, - "num_input_tokens_seen": 1934272, + "loss": 0.0908, + "num_input_tokens_seen": 1951520, "step": 260 }, { "epoch": 0.07198641765704585, - "grad_norm": 0.965251624584198, + "grad_norm": 0.5984723567962646, "learning_rate": 4.9929524313621825e-05, - "loss": 0.092, - "num_input_tokens_seen": 1971536, + "loss": 0.0867, + "num_input_tokens_seen": 1989872, "step": 265 }, { "epoch": 0.0733446519524618, - "grad_norm": 1.154929518699646, + "grad_norm": 1.0802069902420044, "learning_rate": 4.9926830815644054e-05, - "loss": 0.1229, - "num_input_tokens_seen": 2010336, + "loss": 0.1196, + "num_input_tokens_seen": 2028992, "step": 270 }, { "epoch": 0.07470288624787776, - "grad_norm": 1.3174892663955688, + "grad_norm": 1.2659368515014648, "learning_rate": 4.992408688259923e-05, - "loss": 0.086, - "num_input_tokens_seen": 2047072, + "loss": 0.0824, + "num_input_tokens_seen": 2066160, "step": 275 }, { "epoch": 0.07606112054329371, - "grad_norm": 1.2460577487945557, + "grad_norm": 1.4435776472091675, "learning_rate": 4.992129252003922e-05, - "loss": 0.0963, - "num_input_tokens_seen": 2083424, + "loss": 0.1003, + "num_input_tokens_seen": 2102944, "step": 280 }, { "epoch": 0.07741935483870968, - "grad_norm": 0.9696467518806458, + "grad_norm": 0.8008107542991638, "learning_rate": 4.991844773361793e-05, - "loss": 0.1062, - "num_input_tokens_seen": 2122640, + "loss": 0.105, + "num_input_tokens_seen": 2142560, "step": 285 }, { "epoch": 0.07877758913412564, - "grad_norm": 0.7463433742523193, + "grad_norm": 0.9100258350372314, "learning_rate": 4.991555252909128e-05, - "loss": 0.0735, - "num_input_tokens_seen": 2158288, + "loss": 0.069, + "num_input_tokens_seen": 2178512, "step": 290 }, { "epoch": 0.08013582342954159, - "grad_norm": 0.628787636756897, + "grad_norm": 0.5681329965591431, "learning_rate": 4.9912606912317216e-05, - "loss": 0.0978, - "num_input_tokens_seen": 2191184, + "loss": 0.0871, + "num_input_tokens_seen": 2212272, "step": 295 }, { "epoch": 0.08149405772495756, - "grad_norm": 0.6916053891181946, + "grad_norm": 0.6568803191184998, "learning_rate": 4.990961088925568e-05, - "loss": 0.0683, - "num_input_tokens_seen": 2232800, + "loss": 0.062, + "num_input_tokens_seen": 2253968, "step": 300 }, { "epoch": 0.08285229202037352, - "grad_norm": 0.7193326950073242, + "grad_norm": 0.6338688731193542, "learning_rate": 4.990656446596859e-05, - "loss": 0.0927, - "num_input_tokens_seen": 2270400, + "loss": 0.088, + "num_input_tokens_seen": 2291824, "step": 305 }, { "epoch": 0.08421052631578947, - "grad_norm": 1.0654414892196655, + "grad_norm": 0.9547824263572693, "learning_rate": 4.990346764861984e-05, - "loss": 0.1191, - "num_input_tokens_seen": 2306992, + "loss": 0.1096, + "num_input_tokens_seen": 2328736, "step": 310 }, { "epoch": 0.08556876061120543, - "grad_norm": 0.9569358229637146, + "grad_norm": 0.7957741022109985, "learning_rate": 4.990032044347531e-05, - "loss": 0.0976, - "num_input_tokens_seen": 2346336, + "loss": 0.0977, + "num_input_tokens_seen": 2368464, "step": 315 }, { "epoch": 0.0869269949066214, - "grad_norm": 1.090050220489502, + "grad_norm": 1.0003756284713745, "learning_rate": 4.989712285690281e-05, - "loss": 0.0951, - "num_input_tokens_seen": 2384624, + "loss": 0.0901, + "num_input_tokens_seen": 2407376, "step": 320 }, { "epoch": 0.08828522920203735, - "grad_norm": 0.8073093891143799, + "grad_norm": 0.8435981869697571, "learning_rate": 4.9893874895372104e-05, - "loss": 0.0829, - "num_input_tokens_seen": 2419840, + "loss": 0.0784, + "num_input_tokens_seen": 2442672, "step": 325 }, { "epoch": 0.08964346349745331, - "grad_norm": 0.918918251991272, + "grad_norm": 0.763461172580719, "learning_rate": 4.989057656545487e-05, - "loss": 0.0724, - "num_input_tokens_seen": 2450720, + "loss": 0.068, + "num_input_tokens_seen": 2474208, "step": 330 }, { "epoch": 0.09100169779286928, - "grad_norm": 1.1814583539962769, + "grad_norm": 1.1192127466201782, "learning_rate": 4.98872278738247e-05, - "loss": 0.0944, - "num_input_tokens_seen": 2483824, + "loss": 0.0961, + "num_input_tokens_seen": 2508544, "step": 335 }, { "epoch": 0.09235993208828523, - "grad_norm": 0.7077891826629639, + "grad_norm": 0.662503719329834, "learning_rate": 4.9883828827257074e-05, - "loss": 0.0749, - "num_input_tokens_seen": 2516800, + "loss": 0.0679, + "num_input_tokens_seen": 2541616, "step": 340 }, { "epoch": 0.09371816638370119, - "grad_norm": 0.9808719754219055, + "grad_norm": 0.994500458240509, "learning_rate": 4.9880379432629376e-05, - "loss": 0.1, - "num_input_tokens_seen": 2557104, + "loss": 0.1026, + "num_input_tokens_seen": 2582320, "step": 345 }, { "epoch": 0.09507640067911714, - "grad_norm": 1.0570650100708008, + "grad_norm": 0.7198169827461243, "learning_rate": 4.987687969692084e-05, - "loss": 0.0813, - "num_input_tokens_seen": 2593776, + "loss": 0.0834, + "num_input_tokens_seen": 2618912, "step": 350 }, { "epoch": 0.09643463497453311, - "grad_norm": 1.074021339416504, + "grad_norm": 0.8764423131942749, "learning_rate": 4.9873329627212585e-05, - "loss": 0.0765, - "num_input_tokens_seen": 2638640, + "loss": 0.073, + "num_input_tokens_seen": 2663904, "step": 355 }, { "epoch": 0.09779286926994907, - "grad_norm": 0.908839762210846, + "grad_norm": 0.8454824090003967, "learning_rate": 4.986972923068753e-05, - "loss": 0.0623, - "num_input_tokens_seen": 2672800, + "loss": 0.06, + "num_input_tokens_seen": 2698464, "step": 360 }, { "epoch": 0.09915110356536502, - "grad_norm": 0.497942715883255, + "grad_norm": 0.4746737778186798, "learning_rate": 4.986607851463048e-05, - "loss": 0.0641, - "num_input_tokens_seen": 2703600, + "loss": 0.0616, + "num_input_tokens_seen": 2730272, "step": 365 }, { "epoch": 0.10050933786078098, - "grad_norm": 0.6844484806060791, + "grad_norm": 0.6699349284172058, "learning_rate": 4.986237748642798e-05, - "loss": 0.0643, - "num_input_tokens_seen": 2741888, + "loss": 0.0618, + "num_input_tokens_seen": 2769456, "step": 370 }, { "epoch": 0.10186757215619695, - "grad_norm": 0.7192046642303467, + "grad_norm": 0.7791961431503296, "learning_rate": 4.985862615356843e-05, - "loss": 0.0654, - "num_input_tokens_seen": 2784672, + "loss": 0.0679, + "num_input_tokens_seen": 2812720, "step": 375 }, { "epoch": 0.1032258064516129, - "grad_norm": 0.6705126166343689, + "grad_norm": 0.7471269369125366, "learning_rate": 4.9854824523642e-05, - "loss": 0.0677, - "num_input_tokens_seen": 2819840, + "loss": 0.0655, + "num_input_tokens_seen": 2848704, "step": 380 }, { "epoch": 0.10458404074702886, - "grad_norm": 0.7350865602493286, + "grad_norm": 0.7499180436134338, "learning_rate": 4.9850972604340604e-05, - "loss": 0.0727, - "num_input_tokens_seen": 2857360, + "loss": 0.0685, + "num_input_tokens_seen": 2886448, "step": 385 }, { "epoch": 0.10594227504244483, - "grad_norm": 0.5516068935394287, + "grad_norm": 0.44066399335861206, "learning_rate": 4.984707040345793e-05, - "loss": 0.0553, - "num_input_tokens_seen": 2893312, + "loss": 0.0556, + "num_input_tokens_seen": 2923120, "step": 390 }, { "epoch": 0.10730050933786078, - "grad_norm": 0.5543715357780457, + "grad_norm": 0.6232820749282837, "learning_rate": 4.98431179288894e-05, - "loss": 0.0645, - "num_input_tokens_seen": 2934608, + "loss": 0.0675, + "num_input_tokens_seen": 2964480, "step": 395 }, { "epoch": 0.10865874363327674, - "grad_norm": 0.8436211943626404, + "grad_norm": 0.8744762539863586, "learning_rate": 4.983911518863216e-05, - "loss": 0.0708, - "num_input_tokens_seen": 2970816, + "loss": 0.0707, + "num_input_tokens_seen": 3001024, "step": 400 }, { "epoch": 0.1100169779286927, - "grad_norm": 0.7570847868919373, + "grad_norm": 0.7887967228889465, "learning_rate": 4.983506219078504e-05, - "loss": 0.0485, - "num_input_tokens_seen": 3010192, + "loss": 0.0512, + "num_input_tokens_seen": 3040896, "step": 405 }, { "epoch": 0.11137521222410866, - "grad_norm": 1.0788004398345947, + "grad_norm": 1.0012716054916382, "learning_rate": 4.983095894354858e-05, - "loss": 0.0711, - "num_input_tokens_seen": 3053728, + "loss": 0.0647, + "num_input_tokens_seen": 3085200, "step": 410 }, { "epoch": 0.11273344651952462, - "grad_norm": 0.8196499347686768, + "grad_norm": 0.5702102780342102, "learning_rate": 4.9826805455224965e-05, - "loss": 0.064, - "num_input_tokens_seen": 3087936, + "loss": 0.0587, + "num_input_tokens_seen": 3119504, "step": 415 }, { "epoch": 0.11409168081494057, - "grad_norm": 0.7214381098747253, + "grad_norm": 0.605652391910553, "learning_rate": 4.982260173421807e-05, - "loss": 0.0539, - "num_input_tokens_seen": 3132192, + "loss": 0.0505, + "num_input_tokens_seen": 3164128, "step": 420 }, { "epoch": 0.11544991511035653, - "grad_norm": 0.8546941876411438, + "grad_norm": 0.7245158553123474, "learning_rate": 4.981834778903337e-05, - "loss": 0.0542, - "num_input_tokens_seen": 3168800, + "loss": 0.0528, + "num_input_tokens_seen": 3201152, "step": 425 }, { "epoch": 0.1168081494057725, - "grad_norm": 0.39968863129615784, + "grad_norm": 0.510598361492157, "learning_rate": 4.981404362827799e-05, - "loss": 0.0536, - "num_input_tokens_seen": 3212912, + "loss": 0.0562, + "num_input_tokens_seen": 3245648, "step": 430 }, { "epoch": 0.11816638370118845, - "grad_norm": 0.7735604643821716, + "grad_norm": 0.685762345790863, "learning_rate": 4.980968926066062e-05, - "loss": 0.0617, - "num_input_tokens_seen": 3246480, + "loss": 0.0592, + "num_input_tokens_seen": 3279936, "step": 435 }, { "epoch": 0.11952461799660441, - "grad_norm": 0.8055765628814697, + "grad_norm": 0.7092388868331909, "learning_rate": 4.980528469499158e-05, - "loss": 0.0634, - "num_input_tokens_seen": 3278848, + "loss": 0.0597, + "num_input_tokens_seen": 3312576, "step": 440 }, { "epoch": 0.12088285229202038, - "grad_norm": 0.9214922785758972, + "grad_norm": 1.0192017555236816, "learning_rate": 4.980082994018274e-05, - "loss": 0.0535, - "num_input_tokens_seen": 3316672, + "loss": 0.0517, + "num_input_tokens_seen": 3350800, "step": 445 }, { "epoch": 0.12224108658743633, - "grad_norm": 1.1991372108459473, + "grad_norm": 0.8104858994483948, "learning_rate": 4.97963250052475e-05, - "loss": 0.0668, - "num_input_tokens_seen": 3346496, + "loss": 0.0621, + "num_input_tokens_seen": 3380976, "step": 450 }, { "epoch": 0.12359932088285229, - "grad_norm": 0.4250160753726959, + "grad_norm": 0.44817814230918884, "learning_rate": 4.979176989930081e-05, - "loss": 0.0487, - "num_input_tokens_seen": 3383264, + "loss": 0.0498, + "num_input_tokens_seen": 3418032, "step": 455 }, { "epoch": 0.12495755517826825, - "grad_norm": 0.5896111130714417, + "grad_norm": 0.4669483006000519, "learning_rate": 4.978716463155913e-05, - "loss": 0.0534, - "num_input_tokens_seen": 3421936, + "loss": 0.0532, + "num_input_tokens_seen": 3457760, "step": 460 }, { "epoch": 0.12631578947368421, - "grad_norm": 0.7534325122833252, + "grad_norm": 0.5602646470069885, "learning_rate": 4.97825092113404e-05, - "loss": 0.0461, - "num_input_tokens_seen": 3460112, + "loss": 0.0457, + "num_input_tokens_seen": 3496048, "step": 465 }, { "epoch": 0.12767402376910017, - "grad_norm": 0.39356452226638794, + "grad_norm": 0.47929155826568604, "learning_rate": 4.9777803648064066e-05, - "loss": 0.0513, - "num_input_tokens_seen": 3504960, + "loss": 0.0487, + "num_input_tokens_seen": 3541280, "step": 470 }, { "epoch": 0.12903225806451613, - "grad_norm": 0.7092240452766418, + "grad_norm": 1.1785304546356201, "learning_rate": 4.977304795125099e-05, - "loss": 0.0633, - "num_input_tokens_seen": 3541856, + "loss": 0.0566, + "num_input_tokens_seen": 3577872, "step": 475 }, { "epoch": 0.13039049235993208, - "grad_norm": 0.700497567653656, + "grad_norm": 0.4399586021900177, "learning_rate": 4.97682421305235e-05, - "loss": 0.0478, - "num_input_tokens_seen": 3583264, + "loss": 0.0474, + "num_input_tokens_seen": 3620016, "step": 480 }, { "epoch": 0.13174872665534804, - "grad_norm": 0.5111300945281982, + "grad_norm": 0.5703775882720947, "learning_rate": 4.976338619560532e-05, - "loss": 0.0435, - "num_input_tokens_seen": 3621568, + "loss": 0.0519, + "num_input_tokens_seen": 3658704, "step": 485 }, { "epoch": 0.13310696095076402, - "grad_norm": 0.46932822465896606, + "grad_norm": 0.47112900018692017, "learning_rate": 4.9758480156321604e-05, - "loss": 0.0553, - "num_input_tokens_seen": 3660880, + "loss": 0.0471, + "num_input_tokens_seen": 3698496, "step": 490 }, { "epoch": 0.13446519524617997, - "grad_norm": 0.3591943383216858, + "grad_norm": 0.2991386353969574, "learning_rate": 4.975352402259884e-05, - "loss": 0.0472, - "num_input_tokens_seen": 3697872, + "loss": 0.0438, + "num_input_tokens_seen": 3736000, "step": 495 }, { "epoch": 0.13582342954159593, - "grad_norm": 0.44767165184020996, + "grad_norm": 0.40186935663223267, "learning_rate": 4.97485178044649e-05, - "loss": 0.0509, - "num_input_tokens_seen": 3742176, + "loss": 0.0508, + "num_input_tokens_seen": 3780512, "step": 500 }, { "epoch": 0.13718166383701189, - "grad_norm": 0.6314947009086609, + "grad_norm": 0.5536324977874756, "learning_rate": 4.9743461512049e-05, - "loss": 0.0598, - "num_input_tokens_seen": 3781696, + "loss": 0.058, + "num_input_tokens_seen": 3820160, "step": 505 }, { "epoch": 0.13853989813242784, - "grad_norm": 1.4273566007614136, + "grad_norm": 1.115540623664856, "learning_rate": 4.973835515558164e-05, - "loss": 0.0736, - "num_input_tokens_seen": 3821840, + "loss": 0.0697, + "num_input_tokens_seen": 3860448, "step": 510 }, { "epoch": 0.1398981324278438, - "grad_norm": 0.3278545141220093, + "grad_norm": 0.34854522347450256, "learning_rate": 4.9733198745394646e-05, - "loss": 0.0563, - "num_input_tokens_seen": 3857088, + "loss": 0.0554, + "num_input_tokens_seen": 3896224, "step": 515 }, { "epoch": 0.14125636672325975, - "grad_norm": 0.853356122970581, + "grad_norm": 0.7654231190681458, "learning_rate": 4.972799229192111e-05, - "loss": 0.0592, - "num_input_tokens_seen": 3895776, + "loss": 0.0549, + "num_input_tokens_seen": 3935552, "step": 520 }, { "epoch": 0.14261460101867574, - "grad_norm": 0.6819631457328796, + "grad_norm": 0.9470447897911072, "learning_rate": 4.972273580569539e-05, - "loss": 0.0407, - "num_input_tokens_seen": 3931696, + "loss": 0.0454, + "num_input_tokens_seen": 3971872, "step": 525 }, { "epoch": 0.1439728353140917, - "grad_norm": 0.5907939076423645, + "grad_norm": 0.6139296293258667, "learning_rate": 4.971742929735303e-05, - "loss": 0.0477, - "num_input_tokens_seen": 3968048, + "loss": 0.0466, + "num_input_tokens_seen": 4008704, "step": 530 }, { "epoch": 0.14533106960950765, - "grad_norm": 0.33857032656669617, + "grad_norm": 0.4273647367954254, "learning_rate": 4.971207277763085e-05, - "loss": 0.0529, - "num_input_tokens_seen": 4005776, + "loss": 0.0525, + "num_input_tokens_seen": 4046624, "step": 535 }, { "epoch": 0.1466893039049236, - "grad_norm": 0.7828961610794067, + "grad_norm": 0.6156778335571289, "learning_rate": 4.970666625736681e-05, - "loss": 0.0479, - "num_input_tokens_seen": 4042272, + "loss": 0.0462, + "num_input_tokens_seen": 4083488, "step": 540 }, { "epoch": 0.14804753820033956, - "grad_norm": 0.6301365494728088, + "grad_norm": 0.7248361110687256, "learning_rate": 4.970120974750005e-05, - "loss": 0.0411, - "num_input_tokens_seen": 4081344, + "loss": 0.0386, + "num_input_tokens_seen": 4123056, "step": 545 }, { "epoch": 0.1494057724957555, - "grad_norm": 0.7121984958648682, + "grad_norm": 0.5888564586639404, "learning_rate": 4.969570325907088e-05, - "loss": 0.0444, - "num_input_tokens_seen": 4119984, + "loss": 0.0411, + "num_input_tokens_seen": 4161904, "step": 550 }, { "epoch": 0.15076400679117147, - "grad_norm": 0.8037962317466736, + "grad_norm": 0.6727175712585449, "learning_rate": 4.96901468032207e-05, - "loss": 0.0496, - "num_input_tokens_seen": 4159968, + "loss": 0.0493, + "num_input_tokens_seen": 4201824, "step": 555 }, { "epoch": 0.15212224108658742, - "grad_norm": 0.6040731072425842, + "grad_norm": 0.48524925112724304, "learning_rate": 4.9684540391192034e-05, - "loss": 0.0599, - "num_input_tokens_seen": 4201072, + "loss": 0.0579, + "num_input_tokens_seen": 4243072, "step": 560 }, { "epoch": 0.1534804753820034, - "grad_norm": 0.32911279797554016, + "grad_norm": 0.4880739450454712, "learning_rate": 4.967888403432846e-05, - "loss": 0.0424, - "num_input_tokens_seen": 4245168, + "loss": 0.0404, + "num_input_tokens_seen": 4287648, "step": 565 }, { "epoch": 0.15483870967741936, - "grad_norm": 0.4384954869747162, + "grad_norm": 0.44308730959892273, "learning_rate": 4.967317774407463e-05, - "loss": 0.0468, - "num_input_tokens_seen": 4279152, + "loss": 0.0484, + "num_input_tokens_seen": 4321760, "step": 570 }, { "epoch": 0.15619694397283532, - "grad_norm": 0.54929518699646, + "grad_norm": 0.6363321542739868, "learning_rate": 4.966742153197621e-05, - "loss": 0.0511, - "num_input_tokens_seen": 4316064, + "loss": 0.0531, + "num_input_tokens_seen": 4359408, "step": 575 }, { "epoch": 0.15755517826825127, - "grad_norm": 0.40565288066864014, + "grad_norm": 0.3722250163555145, "learning_rate": 4.96616154096799e-05, - "loss": 0.0484, - "num_input_tokens_seen": 4350240, + "loss": 0.0438, + "num_input_tokens_seen": 4394176, "step": 580 }, { "epoch": 0.15891341256366723, - "grad_norm": 0.706996738910675, + "grad_norm": 0.8290708065032959, "learning_rate": 4.965575938893336e-05, - "loss": 0.0429, - "num_input_tokens_seen": 4392160, + "loss": 0.0434, + "num_input_tokens_seen": 4436896, "step": 585 }, { "epoch": 0.16027164685908318, - "grad_norm": 0.6308326721191406, + "grad_norm": 1.2016615867614746, "learning_rate": 4.964985348158522e-05, - "loss": 0.0628, - "num_input_tokens_seen": 4425424, + "loss": 0.0596, + "num_input_tokens_seen": 4470416, "step": 590 }, { "epoch": 0.16162988115449914, - "grad_norm": 0.3883691728115082, + "grad_norm": 0.6099566221237183, "learning_rate": 4.9643897699585056e-05, - "loss": 0.0525, - "num_input_tokens_seen": 4465376, + "loss": 0.0508, + "num_input_tokens_seen": 4510320, "step": 595 }, { "epoch": 0.16298811544991512, - "grad_norm": 0.6951537728309631, + "grad_norm": 0.7939335107803345, "learning_rate": 4.9637892054983334e-05, - "loss": 0.0443, - "num_input_tokens_seen": 4510464, + "loss": 0.0436, + "num_input_tokens_seen": 4556096, "step": 600 }, { "epoch": 0.16434634974533108, - "grad_norm": 0.3474721312522888, + "grad_norm": 0.48819658160209656, "learning_rate": 4.963183655993144e-05, - "loss": 0.0371, - "num_input_tokens_seen": 4548784, + "loss": 0.0387, + "num_input_tokens_seen": 4594720, "step": 605 }, { "epoch": 0.16570458404074703, - "grad_norm": 0.909916877746582, + "grad_norm": 0.7162647247314453, "learning_rate": 4.962573122668159e-05, - "loss": 0.0364, - "num_input_tokens_seen": 4591696, + "loss": 0.0352, + "num_input_tokens_seen": 4638048, "step": 610 }, { "epoch": 0.167062818336163, - "grad_norm": 0.4346422255039215, + "grad_norm": 0.52556973695755, "learning_rate": 4.961957606758686e-05, - "loss": 0.0506, - "num_input_tokens_seen": 4630080, + "loss": 0.0485, + "num_input_tokens_seen": 4676864, "step": 615 }, { "epoch": 0.16842105263157894, - "grad_norm": 0.36975690722465515, + "grad_norm": 0.4779506027698517, "learning_rate": 4.961337109510112e-05, - "loss": 0.0462, - "num_input_tokens_seen": 4665568, + "loss": 0.0419, + "num_input_tokens_seen": 4712864, "step": 620 }, { "epoch": 0.1697792869269949, - "grad_norm": 0.4856986701488495, + "grad_norm": 0.49920424818992615, "learning_rate": 4.9607116321779074e-05, - "loss": 0.0421, - "num_input_tokens_seen": 4702224, + "loss": 0.0386, + "num_input_tokens_seen": 4749728, "step": 625 }, { "epoch": 0.17113752122241085, - "grad_norm": 0.7107334733009338, + "grad_norm": 0.778538703918457, "learning_rate": 4.960081176027613e-05, - "loss": 0.0546, - "num_input_tokens_seen": 4738560, + "loss": 0.0471, + "num_input_tokens_seen": 4786768, "step": 630 }, { "epoch": 0.17249575551782684, - "grad_norm": 0.3939542770385742, + "grad_norm": 0.45747819542884827, "learning_rate": 4.959445742334847e-05, - "loss": 0.0422, - "num_input_tokens_seen": 4777296, + "loss": 0.0421, + "num_input_tokens_seen": 4825152, "step": 635 }, { "epoch": 0.1738539898132428, - "grad_norm": 0.32806187868118286, + "grad_norm": 0.6829373240470886, "learning_rate": 4.958805332385299e-05, - "loss": 0.0383, - "num_input_tokens_seen": 4815632, + "loss": 0.0376, + "num_input_tokens_seen": 4863008, "step": 640 }, { "epoch": 0.17521222410865875, - "grad_norm": 0.44892340898513794, + "grad_norm": 0.38208964467048645, "learning_rate": 4.958159947474724e-05, - "loss": 0.0434, - "num_input_tokens_seen": 4854336, + "loss": 0.0445, + "num_input_tokens_seen": 4902224, "step": 645 }, { "epoch": 0.1765704584040747, - "grad_norm": 0.47843137383461, + "grad_norm": 0.5627296566963196, "learning_rate": 4.9575095889089473e-05, - "loss": 0.0447, - "num_input_tokens_seen": 4893328, + "loss": 0.0442, + "num_input_tokens_seen": 4941200, "step": 650 }, { "epoch": 0.17792869269949066, - "grad_norm": 0.6039530038833618, + "grad_norm": 0.5723150968551636, "learning_rate": 4.956854258003854e-05, - "loss": 0.0361, - "num_input_tokens_seen": 4932736, + "loss": 0.0375, + "num_input_tokens_seen": 4980672, "step": 655 }, { "epoch": 0.17928692699490661, - "grad_norm": 0.49428287148475647, + "grad_norm": 0.5618226528167725, "learning_rate": 4.956193956085391e-05, - "loss": 0.0382, - "num_input_tokens_seen": 4964656, + "loss": 0.038, + "num_input_tokens_seen": 5012736, "step": 660 }, { "epoch": 0.18064516129032257, - "grad_norm": 0.589676558971405, + "grad_norm": 0.9366943836212158, "learning_rate": 4.9555286844895644e-05, - "loss": 0.0351, - "num_input_tokens_seen": 5003856, + "loss": 0.034, + "num_input_tokens_seen": 5052464, "step": 665 }, { "epoch": 0.18200339558573855, - "grad_norm": 0.717024028301239, + "grad_norm": 0.6985517144203186, "learning_rate": 4.9548584445624337e-05, - "loss": 0.0504, - "num_input_tokens_seen": 5037952, + "loss": 0.0502, + "num_input_tokens_seen": 5086960, "step": 670 }, { "epoch": 0.1833616298811545, - "grad_norm": 0.40191102027893066, + "grad_norm": 0.33674535155296326, "learning_rate": 4.9541832376601116e-05, "loss": 0.0468, - "num_input_tokens_seen": 5074128, + "num_input_tokens_seen": 5123344, "step": 675 }, { "epoch": 0.18471986417657046, - "grad_norm": 0.6011912822723389, + "grad_norm": 0.5784131288528442, "learning_rate": 4.953503065148762e-05, - "loss": 0.0376, - "num_input_tokens_seen": 5106816, + "loss": 0.0378, + "num_input_tokens_seen": 5156272, "step": 680 }, { "epoch": 0.18607809847198642, - "grad_norm": 0.34043794870376587, + "grad_norm": 0.4212299585342407, "learning_rate": 4.9528179284045926e-05, - "loss": 0.0377, - "num_input_tokens_seen": 5144544, + "loss": 0.0376, + "num_input_tokens_seen": 5194800, "step": 685 }, { "epoch": 0.18743633276740237, - "grad_norm": 0.22378966212272644, + "grad_norm": 0.7790331840515137, "learning_rate": 4.9521278288138595e-05, - "loss": 0.0413, - "num_input_tokens_seen": 5179200, + "loss": 0.0456, + "num_input_tokens_seen": 5229856, "step": 690 }, { "epoch": 0.18879456706281833, - "grad_norm": 0.5165534615516663, + "grad_norm": 0.4427700936794281, "learning_rate": 4.9514327677728566e-05, - "loss": 0.0372, - "num_input_tokens_seen": 5218976, + "loss": 0.0351, + "num_input_tokens_seen": 5270320, "step": 695 }, { "epoch": 0.19015280135823429, - "grad_norm": 0.3726426362991333, + "grad_norm": 0.46341609954833984, "learning_rate": 4.950732746687918e-05, - "loss": 0.0343, - "num_input_tokens_seen": 5256432, + "loss": 0.0336, + "num_input_tokens_seen": 5307456, "step": 700 }, { "epoch": 0.19151103565365024, - "grad_norm": 0.37666696310043335, + "grad_norm": 0.4067285358905792, "learning_rate": 4.950027766975415e-05, - "loss": 0.0452, - "num_input_tokens_seen": 5300832, + "loss": 0.045, + "num_input_tokens_seen": 5351968, "step": 705 }, { "epoch": 0.19286926994906622, - "grad_norm": 0.7127814888954163, + "grad_norm": 0.3549594283103943, "learning_rate": 4.9493178300617484e-05, - "loss": 0.0401, - "num_input_tokens_seen": 5338128, + "loss": 0.0403, + "num_input_tokens_seen": 5389504, "step": 710 }, { "epoch": 0.19422750424448218, - "grad_norm": 0.7724159359931946, + "grad_norm": 0.7888506054878235, "learning_rate": 4.948602937383351e-05, - "loss": 0.0434, - "num_input_tokens_seen": 5375232, + "loss": 0.0455, + "num_input_tokens_seen": 5427424, "step": 715 }, { "epoch": 0.19558573853989814, - "grad_norm": 0.2685568332672119, + "grad_norm": 0.27072879672050476, "learning_rate": 4.9478830903866846e-05, - "loss": 0.0361, - "num_input_tokens_seen": 5415360, + "loss": 0.0356, + "num_input_tokens_seen": 5468048, "step": 720 }, { "epoch": 0.1969439728353141, - "grad_norm": 0.3035404682159424, + "grad_norm": 0.41395801305770874, "learning_rate": 4.947158290528232e-05, - "loss": 0.0354, - "num_input_tokens_seen": 5455312, + "loss": 0.0382, + "num_input_tokens_seen": 5508160, "step": 725 }, { "epoch": 0.19830220713073005, - "grad_norm": 0.3800182044506073, + "grad_norm": 0.47651299834251404, "learning_rate": 4.946428539274497e-05, - "loss": 0.0426, - "num_input_tokens_seen": 5487360, + "loss": 0.0436, + "num_input_tokens_seen": 5540496, "step": 730 }, { "epoch": 0.199660441426146, - "grad_norm": 0.25346893072128296, + "grad_norm": 0.37204816937446594, "learning_rate": 4.9456938381020066e-05, - "loss": 0.0339, - "num_input_tokens_seen": 5523376, + "loss": 0.0353, + "num_input_tokens_seen": 5576704, "step": 735 }, { "epoch": 0.20101867572156196, - "grad_norm": 0.5410544276237488, + "grad_norm": 0.5645677447319031, "learning_rate": 4.944954188497297e-05, - "loss": 0.0401, - "num_input_tokens_seen": 5559376, + "loss": 0.0412, + "num_input_tokens_seen": 5613024, "step": 740 }, { "epoch": 0.20237691001697794, - "grad_norm": 0.31637877225875854, + "grad_norm": 0.3306039273738861, "learning_rate": 4.9442095919569195e-05, - "loss": 0.0401, - "num_input_tokens_seen": 5592064, + "loss": 0.0382, + "num_input_tokens_seen": 5646128, "step": 745 }, { "epoch": 0.2037351443123939, - "grad_norm": 0.3769804537296295, + "grad_norm": 0.37887242436408997, "learning_rate": 4.943460049987436e-05, - "loss": 0.0373, - "num_input_tokens_seen": 5637744, + "loss": 0.0378, + "num_input_tokens_seen": 5692576, "step": 750 }, { "epoch": 0.20509337860780985, - "grad_norm": 0.4295789301395416, + "grad_norm": 0.3586256206035614, "learning_rate": 4.942705564105412e-05, - "loss": 0.0376, - "num_input_tokens_seen": 5689792, + "loss": 0.0353, + "num_input_tokens_seen": 5744864, "step": 755 }, { "epoch": 0.2064516129032258, - "grad_norm": 0.6166548132896423, + "grad_norm": 0.7145051956176758, "learning_rate": 4.941946135837418e-05, - "loss": 0.0452, - "num_input_tokens_seen": 5725728, + "loss": 0.0442, + "num_input_tokens_seen": 5781616, "step": 760 }, { "epoch": 0.20780984719864176, - "grad_norm": 0.3654342293739319, + "grad_norm": 0.24144013226032257, "learning_rate": 4.9411817667200236e-05, - "loss": 0.0412, - "num_input_tokens_seen": 5769360, + "loss": 0.0384, + "num_input_tokens_seen": 5826208, "step": 765 }, { "epoch": 0.20916808149405772, - "grad_norm": 0.6942029595375061, + "grad_norm": 0.6157625913619995, "learning_rate": 4.940412458299793e-05, - "loss": 0.0404, - "num_input_tokens_seen": 5805712, + "loss": 0.037, + "num_input_tokens_seen": 5862848, "step": 770 }, { "epoch": 0.21052631578947367, - "grad_norm": 0.4988439679145813, + "grad_norm": 0.5007181167602539, "learning_rate": 4.939638212133292e-05, - "loss": 0.0405, - "num_input_tokens_seen": 5842096, + "loss": 0.0395, + "num_input_tokens_seen": 5899392, "step": 775 }, { "epoch": 0.21188455008488966, - "grad_norm": 0.42431896924972534, + "grad_norm": 0.43118712306022644, "learning_rate": 4.938859029787067e-05, - "loss": 0.0389, - "num_input_tokens_seen": 5877232, + "loss": 0.0376, + "num_input_tokens_seen": 5935184, "step": 780 }, { "epoch": 0.2132427843803056, - "grad_norm": 0.4785408675670624, + "grad_norm": 0.33764657378196716, "learning_rate": 4.938074912837659e-05, - "loss": 0.0367, - "num_input_tokens_seen": 5909744, + "loss": 0.0335, + "num_input_tokens_seen": 5968208, "step": 785 }, { "epoch": 0.21460101867572157, - "grad_norm": 0.40158113837242126, + "grad_norm": 0.26866912841796875, "learning_rate": 4.9372858628715905e-05, - "loss": 0.0298, - "num_input_tokens_seen": 5950448, + "loss": 0.0295, + "num_input_tokens_seen": 6009248, "step": 790 }, { "epoch": 0.21595925297113752, - "grad_norm": 0.33797016739845276, + "grad_norm": 0.3498166799545288, "learning_rate": 4.936491881485366e-05, - "loss": 0.0401, - "num_input_tokens_seen": 5984672, + "loss": 0.0358, + "num_input_tokens_seen": 6043344, "step": 795 }, { "epoch": 0.21731748726655348, - "grad_norm": 0.31715843081474304, + "grad_norm": 0.30877041816711426, "learning_rate": 4.935692970285467e-05, - "loss": 0.0309, - "num_input_tokens_seen": 6019312, + "loss": 0.0311, + "num_input_tokens_seen": 6078592, "step": 800 }, { "epoch": 0.21867572156196943, - "grad_norm": 0.2913011610507965, + "grad_norm": 0.31062501668930054, "learning_rate": 4.934889130888351e-05, - "loss": 0.0354, - "num_input_tokens_seen": 6058128, + "loss": 0.0327, + "num_input_tokens_seen": 6118192, "step": 805 }, { "epoch": 0.2200339558573854, - "grad_norm": 0.24183261394500732, + "grad_norm": 0.1907210797071457, "learning_rate": 4.934080364920444e-05, - "loss": 0.0342, - "num_input_tokens_seen": 6099232, + "loss": 0.0324, + "num_input_tokens_seen": 6159792, "step": 810 }, { "epoch": 0.22139219015280137, - "grad_norm": 0.32603639364242554, + "grad_norm": 0.35154321789741516, "learning_rate": 4.933266674018144e-05, - "loss": 0.037, - "num_input_tokens_seen": 6140672, + "loss": 0.035, + "num_input_tokens_seen": 6201536, "step": 815 }, { "epoch": 0.22275042444821733, - "grad_norm": 0.7212581634521484, + "grad_norm": 0.3269914388656616, "learning_rate": 4.9324480598278104e-05, - "loss": 0.0478, - "num_input_tokens_seen": 6174224, + "loss": 0.042, + "num_input_tokens_seen": 6235216, "step": 820 }, { "epoch": 0.22410865874363328, - "grad_norm": 0.5936550498008728, + "grad_norm": 0.3077080249786377, "learning_rate": 4.9316245240057666e-05, - "loss": 0.037, - "num_input_tokens_seen": 6213408, + "loss": 0.0354, + "num_input_tokens_seen": 6275248, "step": 825 }, { "epoch": 0.22546689303904924, - "grad_norm": 0.27912524342536926, + "grad_norm": 0.4142979383468628, "learning_rate": 4.9307960682182914e-05, - "loss": 0.0345, - "num_input_tokens_seen": 6251280, + "loss": 0.035, + "num_input_tokens_seen": 6313840, "step": 830 }, { "epoch": 0.2268251273344652, - "grad_norm": 0.3085426092147827, + "grad_norm": 0.31950220465660095, "learning_rate": 4.9299626941416205e-05, - "loss": 0.0336, - "num_input_tokens_seen": 6292848, + "loss": 0.0344, + "num_input_tokens_seen": 6355568, "step": 835 }, { "epoch": 0.22818336162988115, - "grad_norm": 0.42258647084236145, + "grad_norm": 0.2981835603713989, "learning_rate": 4.92912440346194e-05, - "loss": 0.0336, - "num_input_tokens_seen": 6329424, + "loss": 0.033, + "num_input_tokens_seen": 6392048, "step": 840 }, { "epoch": 0.2295415959252971, - "grad_norm": 0.4719127416610718, + "grad_norm": 0.40168067812919617, "learning_rate": 4.928281197875383e-05, - "loss": 0.0284, - "num_input_tokens_seen": 6367456, + "loss": 0.0263, + "num_input_tokens_seen": 6430032, "step": 845 }, { "epoch": 0.23089983022071306, - "grad_norm": 0.3193594813346863, + "grad_norm": 0.20020562410354614, "learning_rate": 4.92743307908803e-05, - "loss": 0.0362, - "num_input_tokens_seen": 6400656, + "loss": 0.0358, + "num_input_tokens_seen": 6464112, "step": 850 }, { "epoch": 0.23225806451612904, - "grad_norm": 0.5423386096954346, + "grad_norm": 0.30108246207237244, "learning_rate": 4.9265800488158984e-05, - "loss": 0.0321, - "num_input_tokens_seen": 6437440, + "loss": 0.0306, + "num_input_tokens_seen": 6501264, "step": 855 }, { "epoch": 0.233616298811545, - "grad_norm": 0.42566195130348206, + "grad_norm": 0.3107754588127136, "learning_rate": 4.925722108784947e-05, - "loss": 0.0397, - "num_input_tokens_seen": 6474752, + "loss": 0.0372, + "num_input_tokens_seen": 6538736, "step": 860 }, { "epoch": 0.23497453310696095, - "grad_norm": 0.35512852668762207, + "grad_norm": 0.26121941208839417, "learning_rate": 4.924859260731066e-05, - "loss": 0.0357, - "num_input_tokens_seen": 6506976, + "loss": 0.0349, + "num_input_tokens_seen": 6571248, "step": 865 }, { "epoch": 0.2363327674023769, - "grad_norm": 0.4176691770553589, + "grad_norm": 0.3352434039115906, "learning_rate": 4.923991506400077e-05, - "loss": 0.0331, - "num_input_tokens_seen": 6544272, + "loss": 0.0341, + "num_input_tokens_seen": 6608832, "step": 870 }, { "epoch": 0.23769100169779286, - "grad_norm": 0.32184863090515137, + "grad_norm": 0.3679253160953522, "learning_rate": 4.923118847547729e-05, - "loss": 0.033, - "num_input_tokens_seen": 6581904, + "loss": 0.0316, + "num_input_tokens_seen": 6646640, "step": 875 }, { "epoch": 0.23904923599320882, - "grad_norm": 0.36981451511383057, + "grad_norm": 0.4997706115245819, "learning_rate": 4.922241285939693e-05, - "loss": 0.0326, - "num_input_tokens_seen": 6619568, + "loss": 0.0324, + "num_input_tokens_seen": 6684784, "step": 880 }, { "epoch": 0.24040747028862477, - "grad_norm": 0.2147742360830307, + "grad_norm": 0.2579744756221771, "learning_rate": 4.921358823351561e-05, - "loss": 0.0347, - "num_input_tokens_seen": 6660576, + "loss": 0.0343, + "num_input_tokens_seen": 6725920, "step": 885 }, { "epoch": 0.24176570458404076, - "grad_norm": 0.6517984867095947, + "grad_norm": 0.4473377764225006, "learning_rate": 4.920471461568843e-05, - "loss": 0.034, - "num_input_tokens_seen": 6701552, + "loss": 0.0321, + "num_input_tokens_seen": 6767232, "step": 890 }, { "epoch": 0.2431239388794567, - "grad_norm": 0.5047566294670105, + "grad_norm": 0.5155393481254578, "learning_rate": 4.919579202386958e-05, - "loss": 0.0315, - "num_input_tokens_seen": 6738528, + "loss": 0.0334, + "num_input_tokens_seen": 6804304, "step": 895 }, { "epoch": 0.24448217317487267, - "grad_norm": 0.313641220331192, + "grad_norm": 0.5548986792564392, "learning_rate": 4.9186820476112364e-05, - "loss": 0.0329, - "num_input_tokens_seen": 6770400, + "loss": 0.0304, + "num_input_tokens_seen": 6836480, "step": 900 }, { "epoch": 0.24584040747028862, - "grad_norm": 0.29352760314941406, + "grad_norm": 0.2043391466140747, "learning_rate": 4.917779999056913e-05, - "loss": 0.0311, - "num_input_tokens_seen": 6810448, + "loss": 0.0291, + "num_input_tokens_seen": 6876512, "step": 905 }, { "epoch": 0.24719864176570458, - "grad_norm": 0.367274671792984, + "grad_norm": 0.5712271332740784, "learning_rate": 4.9168730585491255e-05, - "loss": 0.0327, - "num_input_tokens_seen": 6847088, + "loss": 0.0325, + "num_input_tokens_seen": 6913584, "step": 910 }, { "epoch": 0.24855687606112054, - "grad_norm": 0.25984224677085876, + "grad_norm": 0.2577384114265442, "learning_rate": 4.9159612279229075e-05, - "loss": 0.0323, - "num_input_tokens_seen": 6882656, + "loss": 0.036, + "num_input_tokens_seen": 6949376, "step": 915 }, { "epoch": 0.2499151103565365, - "grad_norm": 0.2983449399471283, + "grad_norm": 0.5997853875160217, "learning_rate": 4.91504450902319e-05, - "loss": 0.0318, - "num_input_tokens_seen": 6916400, + "loss": 0.0353, + "num_input_tokens_seen": 6983392, "step": 920 }, { "epoch": 0.25127334465195245, - "grad_norm": 0.3906146287918091, + "grad_norm": 0.4281270205974579, "learning_rate": 4.914122903704792e-05, - "loss": 0.0378, - "num_input_tokens_seen": 6951952, + "loss": 0.0396, + "num_input_tokens_seen": 7019152, "step": 925 }, { "epoch": 0.25263157894736843, - "grad_norm": 0.311927855014801, + "grad_norm": 0.39936327934265137, "learning_rate": 4.9131964138324205e-05, - "loss": 0.0335, - "num_input_tokens_seen": 6987136, + "loss": 0.0337, + "num_input_tokens_seen": 7054576, "step": 930 }, { "epoch": 0.25398981324278436, - "grad_norm": 0.2048288881778717, + "grad_norm": 0.25076526403427124, "learning_rate": 4.9122650412806636e-05, - "loss": 0.0292, - "num_input_tokens_seen": 7029328, + "loss": 0.0307, + "num_input_tokens_seen": 7096736, "step": 935 }, { "epoch": 0.25534804753820034, - "grad_norm": 0.4681178629398346, + "grad_norm": 0.5432738661766052, "learning_rate": 4.911328787933992e-05, - "loss": 0.0298, - "num_input_tokens_seen": 7066416, + "loss": 0.031, + "num_input_tokens_seen": 7134592, "step": 940 }, { "epoch": 0.2567062818336163, - "grad_norm": 0.4400295317173004, + "grad_norm": 0.2741766571998596, "learning_rate": 4.91038765568675e-05, - "loss": 0.0312, - "num_input_tokens_seen": 7097760, + "loss": 0.0292, + "num_input_tokens_seen": 7166256, "step": 945 }, { "epoch": 0.25806451612903225, - "grad_norm": 0.2512310743331909, + "grad_norm": 0.2537309229373932, "learning_rate": 4.909441646443152e-05, - "loss": 0.0315, - "num_input_tokens_seen": 7134688, + "loss": 0.0323, + "num_input_tokens_seen": 7203568, "step": 950 }, { "epoch": 0.25942275042444823, - "grad_norm": 0.23557136952877045, + "grad_norm": 0.25340384244918823, "learning_rate": 4.9084907621172826e-05, - "loss": 0.0326, - "num_input_tokens_seen": 7174832, + "loss": 0.0336, + "num_input_tokens_seen": 7244048, "step": 955 }, { "epoch": 0.26078098471986416, - "grad_norm": 0.30942755937576294, + "grad_norm": 0.38520729541778564, "learning_rate": 4.907535004633089e-05, - "loss": 0.0299, - "num_input_tokens_seen": 7211344, + "loss": 0.031, + "num_input_tokens_seen": 7281072, "step": 960 }, { "epoch": 0.26213921901528014, - "grad_norm": 0.22276735305786133, + "grad_norm": 0.3101852834224701, "learning_rate": 4.9065743759243794e-05, - "loss": 0.0295, - "num_input_tokens_seen": 7243024, + "loss": 0.0274, + "num_input_tokens_seen": 7312816, "step": 965 }, { "epoch": 0.2634974533106961, - "grad_norm": 0.3891482949256897, + "grad_norm": 0.5748435258865356, "learning_rate": 4.9056088779348164e-05, "loss": 0.0308, - "num_input_tokens_seen": 7290256, + "num_input_tokens_seen": 7360304, "step": 970 }, { "epoch": 0.26485568760611206, - "grad_norm": 0.25494882464408875, + "grad_norm": 0.19920141994953156, "learning_rate": 4.904638512617917e-05, - "loss": 0.03, - "num_input_tokens_seen": 7330976, + "loss": 0.0287, + "num_input_tokens_seen": 7401136, "step": 975 }, { "epoch": 0.26621392190152804, - "grad_norm": 0.25893640518188477, + "grad_norm": 0.23063939809799194, "learning_rate": 4.9036632819370435e-05, - "loss": 0.0316, - "num_input_tokens_seen": 7368416, + "loss": 0.0311, + "num_input_tokens_seen": 7439008, "step": 980 }, { "epoch": 0.26757215619694397, - "grad_norm": 0.2933538854122162, + "grad_norm": 0.2006792426109314, "learning_rate": 4.902683187865406e-05, - "loss": 0.0331, - "num_input_tokens_seen": 7405136, + "loss": 0.0287, + "num_input_tokens_seen": 7476704, "step": 985 }, { "epoch": 0.26893039049235995, - "grad_norm": 0.4313328266143799, + "grad_norm": 0.47015413641929626, "learning_rate": 4.9016982323860505e-05, - "loss": 0.0322, - "num_input_tokens_seen": 7441024, + "loss": 0.0298, + "num_input_tokens_seen": 7512800, "step": 990 }, { "epoch": 0.2702886247877759, - "grad_norm": 0.2822008430957794, + "grad_norm": 0.35960569977760315, "learning_rate": 4.9007084174918636e-05, - "loss": 0.0344, - "num_input_tokens_seen": 7475536, + "loss": 0.0326, + "num_input_tokens_seen": 7547808, "step": 995 }, { "epoch": 0.27164685908319186, - "grad_norm": 0.4102694094181061, + "grad_norm": 0.7696571946144104, "learning_rate": 4.89971374518556e-05, - "loss": 0.0331, - "num_input_tokens_seen": 7507952, + "loss": 0.0323, + "num_input_tokens_seen": 7580704, "step": 1000 }, { "epoch": 0.2730050933786078, - "grad_norm": 0.36310362815856934, + "grad_norm": 0.38724878430366516, "learning_rate": 4.898714217479687e-05, - "loss": 0.0327, - "num_input_tokens_seen": 7545584, + "loss": 0.0328, + "num_input_tokens_seen": 7618704, "step": 1005 }, { "epoch": 0.27436332767402377, - "grad_norm": 0.19375079870224, + "grad_norm": 0.41109806299209595, "learning_rate": 4.897709836396611e-05, - "loss": 0.0306, - "num_input_tokens_seen": 7583056, + "loss": 0.0321, + "num_input_tokens_seen": 7656512, "step": 1010 }, { "epoch": 0.27572156196943975, - "grad_norm": 0.34920889139175415, + "grad_norm": 0.3128097355365753, "learning_rate": 4.896700603968523e-05, - "loss": 0.0354, - "num_input_tokens_seen": 7622800, + "loss": 0.0308, + "num_input_tokens_seen": 7696800, "step": 1015 }, { "epoch": 0.2770797962648557, - "grad_norm": 0.5086162090301514, + "grad_norm": 0.20954200625419617, "learning_rate": 4.895686522237426e-05, - "loss": 0.0314, - "num_input_tokens_seen": 7655936, + "loss": 0.0306, + "num_input_tokens_seen": 7729888, "step": 1020 }, { "epoch": 0.27843803056027167, - "grad_norm": 0.31152108311653137, + "grad_norm": 0.3851282298564911, "learning_rate": 4.894667593255138e-05, - "loss": 0.0294, - "num_input_tokens_seen": 7700112, + "loss": 0.0305, + "num_input_tokens_seen": 7774256, "step": 1025 }, { "epoch": 0.2797962648556876, - "grad_norm": 0.4292425215244293, + "grad_norm": 0.2662223279476166, "learning_rate": 4.8936438190832815e-05, - "loss": 0.031, - "num_input_tokens_seen": 7738160, + "loss": 0.0306, + "num_input_tokens_seen": 7812640, "step": 1030 }, { "epoch": 0.2811544991511036, - "grad_norm": 0.35866788029670715, + "grad_norm": 0.22895866632461548, "learning_rate": 4.8926152017932834e-05, - "loss": 0.0293, - "num_input_tokens_seen": 7774784, + "loss": 0.03, + "num_input_tokens_seen": 7849952, "step": 1035 }, { "epoch": 0.2825127334465195, - "grad_norm": 0.42837586998939514, + "grad_norm": 0.20651288330554962, "learning_rate": 4.891581743466372e-05, - "loss": 0.0311, - "num_input_tokens_seen": 7811840, + "loss": 0.0302, + "num_input_tokens_seen": 7887552, "step": 1040 }, { "epoch": 0.2838709677419355, - "grad_norm": 0.800658106803894, + "grad_norm": 0.2753933072090149, "learning_rate": 4.8905434461935687e-05, - "loss": 0.0347, - "num_input_tokens_seen": 7845600, + "loss": 0.0312, + "num_input_tokens_seen": 7921632, "step": 1045 }, { "epoch": 0.28522920203735147, - "grad_norm": 0.2256612926721573, + "grad_norm": 0.19898736476898193, "learning_rate": 4.8895003120756846e-05, - "loss": 0.0318, - "num_input_tokens_seen": 7881856, + "loss": 0.037, + "num_input_tokens_seen": 7958496, "step": 1050 }, { "epoch": 0.2865874363327674, - "grad_norm": 0.4808388650417328, + "grad_norm": 0.22226101160049438, "learning_rate": 4.888452343223319e-05, - "loss": 0.0336, - "num_input_tokens_seen": 7916416, + "loss": 0.0318, + "num_input_tokens_seen": 7993552, "step": 1055 }, { "epoch": 0.2879456706281834, - "grad_norm": 0.22128315269947052, + "grad_norm": 0.19411003589630127, "learning_rate": 4.887399541756852e-05, - "loss": 0.0309, - "num_input_tokens_seen": 7953792, + "loss": 0.0284, + "num_input_tokens_seen": 8031456, "step": 1060 }, { "epoch": 0.2893039049235993, - "grad_norm": 0.2901773154735565, + "grad_norm": 0.2812041938304901, "learning_rate": 4.886341909806444e-05, - "loss": 0.0325, - "num_input_tokens_seen": 7990704, + "loss": 0.0296, + "num_input_tokens_seen": 8068848, "step": 1065 }, { "epoch": 0.2906621392190153, - "grad_norm": 0.3010469675064087, + "grad_norm": 0.550690770149231, "learning_rate": 4.885279449512028e-05, - "loss": 0.0348, - "num_input_tokens_seen": 8026576, + "loss": 0.0337, + "num_input_tokens_seen": 8104848, "step": 1070 }, { "epoch": 0.2920203735144312, - "grad_norm": 0.5264120697975159, + "grad_norm": 0.19867663085460663, "learning_rate": 4.884212163023305e-05, - "loss": 0.0323, - "num_input_tokens_seen": 8061216, + "loss": 0.033, + "num_input_tokens_seen": 8140032, "step": 1075 }, { "epoch": 0.2933786078098472, - "grad_norm": 0.35435718297958374, + "grad_norm": 0.27529871463775635, "learning_rate": 4.883140052499742e-05, - "loss": 0.0351, - "num_input_tokens_seen": 8097216, + "loss": 0.0355, + "num_input_tokens_seen": 8176960, "step": 1080 }, { "epoch": 0.29473684210526313, - "grad_norm": 0.38566070795059204, + "grad_norm": 0.22515107691287994, "learning_rate": 4.882063120110566e-05, - "loss": 0.0371, - "num_input_tokens_seen": 8130000, + "loss": 0.0301, + "num_input_tokens_seen": 8210208, "step": 1085 }, { "epoch": 0.2960950764006791, - "grad_norm": 0.5030001997947693, + "grad_norm": 0.23444144427776337, "learning_rate": 4.880981368034761e-05, - "loss": 0.0365, - "num_input_tokens_seen": 8170208, + "loss": 0.0314, + "num_input_tokens_seen": 8250992, "step": 1090 }, { "epoch": 0.2974533106960951, - "grad_norm": 0.4397047162055969, + "grad_norm": 0.33682987093925476, "learning_rate": 4.879894798461063e-05, - "loss": 0.036, - "num_input_tokens_seen": 8207840, + "loss": 0.0351, + "num_input_tokens_seen": 8289328, "step": 1095 }, { "epoch": 0.298811544991511, - "grad_norm": 0.45017296075820923, + "grad_norm": 0.16044403612613678, "learning_rate": 4.8788034135879535e-05, - "loss": 0.0296, - "num_input_tokens_seen": 8252224, + "loss": 0.0265, + "num_input_tokens_seen": 8333664, "step": 1100 }, { "epoch": 0.300169779286927, - "grad_norm": 0.4451981484889984, + "grad_norm": 0.271920770406723, "learning_rate": 4.8777072156236604e-05, - "loss": 0.0308, - "num_input_tokens_seen": 8288752, + "loss": 0.0286, + "num_input_tokens_seen": 8370256, "step": 1105 }, { "epoch": 0.30152801358234294, - "grad_norm": 0.5526825189590454, + "grad_norm": 0.5168178081512451, "learning_rate": 4.876606206786146e-05, - "loss": 0.0387, - "num_input_tokens_seen": 8328976, + "loss": 0.0319, + "num_input_tokens_seen": 8410320, "step": 1110 }, { "epoch": 0.3028862478777589, - "grad_norm": 0.24274751543998718, + "grad_norm": 0.24976550042629242, "learning_rate": 4.875500389303108e-05, - "loss": 0.0331, - "num_input_tokens_seen": 8366976, + "loss": 0.0288, + "num_input_tokens_seen": 8448976, "step": 1115 }, { "epoch": 0.30424448217317485, - "grad_norm": 0.47535085678100586, + "grad_norm": 0.7469359040260315, "learning_rate": 4.874389765411976e-05, - "loss": 0.0333, - "num_input_tokens_seen": 8401664, + "loss": 0.0305, + "num_input_tokens_seen": 8484112, "step": 1120 }, { "epoch": 0.30560271646859083, - "grad_norm": 0.2822076976299286, + "grad_norm": 0.2924445867538452, "learning_rate": 4.873274337359902e-05, - "loss": 0.0307, - "num_input_tokens_seen": 8436144, + "loss": 0.03, + "num_input_tokens_seen": 8519136, "step": 1125 }, { "epoch": 0.3069609507640068, - "grad_norm": 0.3742773234844208, + "grad_norm": 0.23330393433570862, "learning_rate": 4.872154107403758e-05, - "loss": 0.0308, - "num_input_tokens_seen": 8468480, + "loss": 0.0287, + "num_input_tokens_seen": 8552032, "step": 1130 }, { "epoch": 0.30831918505942274, - "grad_norm": 0.4586505591869354, + "grad_norm": 0.8244443535804749, "learning_rate": 4.871029077810133e-05, - "loss": 0.0315, - "num_input_tokens_seen": 8506464, + "loss": 0.0305, + "num_input_tokens_seen": 8590048, "step": 1135 }, { "epoch": 0.3096774193548387, - "grad_norm": 0.5089846849441528, + "grad_norm": 0.1860063523054123, "learning_rate": 4.8698992508553276e-05, - "loss": 0.0337, - "num_input_tokens_seen": 8541072, + "loss": 0.0298, + "num_input_tokens_seen": 8625248, "step": 1140 }, { "epoch": 0.31103565365025465, - "grad_norm": 0.6390443444252014, + "grad_norm": 0.29115355014801025, "learning_rate": 4.8687646288253474e-05, - "loss": 0.0297, - "num_input_tokens_seen": 8579568, + "loss": 0.0277, + "num_input_tokens_seen": 8664352, "step": 1145 }, { "epoch": 0.31239388794567063, - "grad_norm": 0.2877405285835266, + "grad_norm": 0.3288950026035309, "learning_rate": 4.8676252140159005e-05, - "loss": 0.0276, - "num_input_tokens_seen": 8618400, + "loss": 0.0265, + "num_input_tokens_seen": 8703168, "step": 1150 }, { "epoch": 0.31375212224108656, - "grad_norm": 0.4035789966583252, + "grad_norm": 0.5520743727684021, "learning_rate": 4.866481008732395e-05, - "loss": 0.0357, - "num_input_tokens_seen": 8656384, + "loss": 0.0322, + "num_input_tokens_seen": 8741568, "step": 1155 }, { "epoch": 0.31511035653650254, - "grad_norm": 0.3751215636730194, + "grad_norm": 0.45341235399246216, "learning_rate": 4.865332015289927e-05, - "loss": 0.0301, - "num_input_tokens_seen": 8695088, + "loss": 0.0295, + "num_input_tokens_seen": 8780464, "step": 1160 }, { "epoch": 0.31646859083191853, - "grad_norm": 0.5118940472602844, + "grad_norm": 0.2582039535045624, "learning_rate": 4.8641782360132846e-05, - "loss": 0.0319, - "num_input_tokens_seen": 8729024, + "loss": 0.0282, + "num_input_tokens_seen": 8814400, "step": 1165 }, { "epoch": 0.31782682512733446, - "grad_norm": 0.1678381860256195, + "grad_norm": 0.541491687297821, "learning_rate": 4.863019673236937e-05, - "loss": 0.0294, - "num_input_tokens_seen": 8766528, + "loss": 0.0272, + "num_input_tokens_seen": 8852352, "step": 1170 }, { "epoch": 0.31918505942275044, - "grad_norm": 0.5635147094726562, + "grad_norm": 1.4631787538528442, "learning_rate": 4.861856329305032e-05, - "loss": 0.0345, - "num_input_tokens_seen": 8806112, + "loss": 0.0326, + "num_input_tokens_seen": 8892480, "step": 1175 }, { "epoch": 0.32054329371816637, - "grad_norm": 0.32603392004966736, + "grad_norm": 0.35603269934654236, "learning_rate": 4.860688206571394e-05, - "loss": 0.0295, - "num_input_tokens_seen": 8850960, + "loss": 0.0299, + "num_input_tokens_seen": 8937600, "step": 1180 }, { "epoch": 0.32190152801358235, - "grad_norm": 0.21516482532024384, + "grad_norm": 0.18159954249858856, "learning_rate": 4.859515307399513e-05, - "loss": 0.0315, - "num_input_tokens_seen": 8887488, + "loss": 0.034, + "num_input_tokens_seen": 8974496, "step": 1185 }, { "epoch": 0.3232597623089983, - "grad_norm": 1.0685648918151855, + "grad_norm": 0.2052171528339386, "learning_rate": 4.858337634162544e-05, - "loss": 0.0313, - "num_input_tokens_seen": 8925904, + "loss": 0.0303, + "num_input_tokens_seen": 9012848, "step": 1190 }, { "epoch": 0.32461799660441426, - "grad_norm": 0.27368736267089844, + "grad_norm": 0.36822938919067383, "learning_rate": 4.857155189243302e-05, "loss": 0.0333, - "num_input_tokens_seen": 8961104, + "num_input_tokens_seen": 9048848, "step": 1195 }, { "epoch": 0.32597623089983024, - "grad_norm": 0.21258795261383057, + "grad_norm": 0.20404799282550812, "learning_rate": 4.855967975034258e-05, - "loss": 0.0278, - "num_input_tokens_seen": 8997680, + "loss": 0.0255, + "num_input_tokens_seen": 9086144, "step": 1200 }, { "epoch": 0.32733446519524617, - "grad_norm": 0.19891807436943054, + "grad_norm": 0.16977132856845856, "learning_rate": 4.854775993937532e-05, - "loss": 0.0346, - "num_input_tokens_seen": 9035728, + "loss": 0.0289, + "num_input_tokens_seen": 9124352, "step": 1205 }, { "epoch": 0.32869269949066215, - "grad_norm": 0.9986312985420227, + "grad_norm": 0.23009471595287323, "learning_rate": 4.853579248364886e-05, - "loss": 0.03, - "num_input_tokens_seen": 9075344, + "loss": 0.0288, + "num_input_tokens_seen": 9163776, "step": 1210 }, { "epoch": 0.3300509337860781, - "grad_norm": 0.3903096318244934, + "grad_norm": 0.29363152384757996, "learning_rate": 4.852377740737727e-05, - "loss": 0.0292, - "num_input_tokens_seen": 9117824, + "loss": 0.026, + "num_input_tokens_seen": 9206288, "step": 1215 }, { "epoch": 0.33140916808149407, - "grad_norm": 0.2189033180475235, + "grad_norm": 0.3070659041404724, "learning_rate": 4.851171473487094e-05, - "loss": 0.0287, - "num_input_tokens_seen": 9156080, + "loss": 0.029, + "num_input_tokens_seen": 9244800, "step": 1220 }, { "epoch": 0.33276740237691, - "grad_norm": 0.334289014339447, + "grad_norm": 0.22421318292617798, "learning_rate": 4.849960449053656e-05, "loss": 0.0331, - "num_input_tokens_seen": 9194448, + "num_input_tokens_seen": 9282928, "step": 1225 }, { "epoch": 0.334125636672326, - "grad_norm": 0.26775461435317993, + "grad_norm": 0.23346398770809174, "learning_rate": 4.848744669887711e-05, - "loss": 0.0286, - "num_input_tokens_seen": 9228512, + "loss": 0.0275, + "num_input_tokens_seen": 9317568, "step": 1230 }, { "epoch": 0.33548387096774196, - "grad_norm": 0.5137389302253723, + "grad_norm": 0.218308225274086, "learning_rate": 4.847524138449172e-05, - "loss": 0.0322, - "num_input_tokens_seen": 9268112, + "loss": 0.0272, + "num_input_tokens_seen": 9357664, "step": 1235 }, { "epoch": 0.3368421052631579, - "grad_norm": 0.6069295406341553, + "grad_norm": 0.22206993401050568, "learning_rate": 4.846298857207572e-05, - "loss": 0.0317, - "num_input_tokens_seen": 9304944, + "loss": 0.028, + "num_input_tokens_seen": 9394912, "step": 1240 }, { "epoch": 0.33820033955857387, - "grad_norm": 0.3216699957847595, + "grad_norm": 0.3443458378314972, "learning_rate": 4.845068828642051e-05, - "loss": 0.0375, - "num_input_tokens_seen": 9338240, + "loss": 0.0302, + "num_input_tokens_seen": 9428544, "step": 1245 }, { "epoch": 0.3395585738539898, - "grad_norm": 0.5324333906173706, + "grad_norm": 0.17635242640972137, "learning_rate": 4.843834055241357e-05, - "loss": 0.0271, - "num_input_tokens_seen": 9382624, + "loss": 0.0241, + "num_input_tokens_seen": 9473152, "step": 1250 }, { "epoch": 0.3409168081494058, - "grad_norm": 0.2982085049152374, + "grad_norm": 0.18065407872200012, "learning_rate": 4.842594539503838e-05, - "loss": 0.0313, - "num_input_tokens_seen": 9420928, + "loss": 0.0261, + "num_input_tokens_seen": 9511904, "step": 1255 }, { "epoch": 0.3422750424448217, - "grad_norm": 0.36439913511276245, + "grad_norm": 0.39964428544044495, "learning_rate": 4.8413502839374345e-05, - "loss": 0.0338, - "num_input_tokens_seen": 9457312, + "loss": 0.029, + "num_input_tokens_seen": 9548544, "step": 1260 }, { "epoch": 0.3436332767402377, - "grad_norm": 0.3627699613571167, + "grad_norm": 0.17346911132335663, "learning_rate": 4.840101291059681e-05, - "loss": 0.0281, - "num_input_tokens_seen": 9492944, + "loss": 0.0266, + "num_input_tokens_seen": 9584448, "step": 1265 }, { "epoch": 0.3449915110356537, - "grad_norm": 0.41822347044944763, + "grad_norm": 0.27910569310188293, "learning_rate": 4.838847563397694e-05, - "loss": 0.0364, - "num_input_tokens_seen": 9534352, + "loss": 0.0323, + "num_input_tokens_seen": 9626224, "step": 1270 }, { "epoch": 0.3463497453310696, - "grad_norm": 0.39943745732307434, + "grad_norm": 0.5445939302444458, "learning_rate": 4.8375891034881726e-05, - "loss": 0.0313, - "num_input_tokens_seen": 9575296, + "loss": 0.0337, + "num_input_tokens_seen": 9666784, "step": 1275 }, { "epoch": 0.3477079796264856, - "grad_norm": 0.26738977432250977, + "grad_norm": 0.20662853121757507, "learning_rate": 4.836325913877388e-05, - "loss": 0.0312, - "num_input_tokens_seen": 9611232, + "loss": 0.0325, + "num_input_tokens_seen": 9703280, "step": 1280 }, { "epoch": 0.3490662139219015, - "grad_norm": 0.21385633945465088, + "grad_norm": 0.23035022616386414, "learning_rate": 4.835057997121185e-05, - "loss": 0.0307, - "num_input_tokens_seen": 9652448, + "loss": 0.0282, + "num_input_tokens_seen": 9744480, "step": 1285 }, { "epoch": 0.3504244482173175, - "grad_norm": 0.30385833978652954, + "grad_norm": 0.1986294388771057, "learning_rate": 4.833785355784968e-05, - "loss": 0.0244, - "num_input_tokens_seen": 9692512, + "loss": 0.0227, + "num_input_tokens_seen": 9784944, "step": 1290 }, { "epoch": 0.3517826825127334, - "grad_norm": 0.20211444795131683, + "grad_norm": 0.1951354593038559, "learning_rate": 4.832507992443705e-05, - "loss": 0.0295, - "num_input_tokens_seen": 9727216, + "loss": 0.0272, + "num_input_tokens_seen": 9820032, "step": 1295 }, { "epoch": 0.3531409168081494, - "grad_norm": 0.35494884848594666, + "grad_norm": 0.36890122294425964, "learning_rate": 4.831225909681916e-05, "loss": 0.0272, - "num_input_tokens_seen": 9762688, + "num_input_tokens_seen": 9856192, "step": 1300 }, { "epoch": 0.3544991511035654, - "grad_norm": 0.20400692522525787, + "grad_norm": 0.3691164553165436, "learning_rate": 4.829939110093671e-05, - "loss": 0.0276, - "num_input_tokens_seen": 9803120, + "loss": 0.0281, + "num_input_tokens_seen": 9896784, "step": 1305 }, { "epoch": 0.3558573853989813, - "grad_norm": 0.28405076265335083, + "grad_norm": 0.320035457611084, "learning_rate": 4.8286475962825825e-05, - "loss": 0.0304, - "num_input_tokens_seen": 9839584, + "loss": 0.03, + "num_input_tokens_seen": 9934032, "step": 1310 }, { "epoch": 0.3572156196943973, - "grad_norm": 0.5876842141151428, + "grad_norm": 0.18483275175094604, "learning_rate": 4.827351370861802e-05, "loss": 0.0281, - "num_input_tokens_seen": 9874928, + "num_input_tokens_seen": 9969840, "step": 1315 }, { "epoch": 0.35857385398981323, - "grad_norm": 0.5225470066070557, + "grad_norm": 0.3832316994667053, "learning_rate": 4.8260504364540145e-05, - "loss": 0.0312, - "num_input_tokens_seen": 9914448, + "loss": 0.0283, + "num_input_tokens_seen": 10010032, "step": 1320 }, { "epoch": 0.3599320882852292, - "grad_norm": 0.16272923350334167, + "grad_norm": 0.38125666975975037, "learning_rate": 4.824744795691432e-05, - "loss": 0.0273, - "num_input_tokens_seen": 9952736, + "loss": 0.0267, + "num_input_tokens_seen": 10048832, "step": 1325 }, { "epoch": 0.36129032258064514, - "grad_norm": 0.14525048434734344, + "grad_norm": 0.1810290515422821, "learning_rate": 4.82343445121579e-05, - "loss": 0.0265, - "num_input_tokens_seen": 9992832, + "loss": 0.0266, + "num_input_tokens_seen": 10089088, "step": 1330 }, { "epoch": 0.3626485568760611, - "grad_norm": 0.20120272040367126, + "grad_norm": 0.2018584907054901, "learning_rate": 4.8221194056783403e-05, - "loss": 0.0312, - "num_input_tokens_seen": 10032064, + "loss": 0.0305, + "num_input_tokens_seen": 10129120, "step": 1335 }, { "epoch": 0.3640067911714771, - "grad_norm": 0.20580318570137024, + "grad_norm": 0.2283191978931427, "learning_rate": 4.8207996617398465e-05, - "loss": 0.0273, - "num_input_tokens_seen": 10071888, + "loss": 0.0268, + "num_input_tokens_seen": 10169200, "step": 1340 }, { "epoch": 0.36536502546689303, - "grad_norm": 0.2656584084033966, + "grad_norm": 0.1422199308872223, "learning_rate": 4.819475222070579e-05, - "loss": 0.0293, - "num_input_tokens_seen": 10114208, + "loss": 0.0267, + "num_input_tokens_seen": 10211552, "step": 1345 }, { "epoch": 0.366723259762309, - "grad_norm": 0.2492864429950714, + "grad_norm": 0.14700937271118164, "learning_rate": 4.818146089350309e-05, - "loss": 0.0258, - "num_input_tokens_seen": 10145376, + "loss": 0.0257, + "num_input_tokens_seen": 10242912, "step": 1350 }, { "epoch": 0.36808149405772495, - "grad_norm": 0.1609681099653244, + "grad_norm": 0.2314162701368332, "learning_rate": 4.8168122662683034e-05, - "loss": 0.0277, - "num_input_tokens_seen": 10180464, + "loss": 0.0251, + "num_input_tokens_seen": 10278352, "step": 1355 }, { "epoch": 0.36943972835314093, - "grad_norm": 0.1978723108768463, + "grad_norm": 0.2256452590227127, "learning_rate": 4.815473755523319e-05, - "loss": 0.0259, - "num_input_tokens_seen": 10217040, + "loss": 0.0245, + "num_input_tokens_seen": 10315616, "step": 1360 }, { "epoch": 0.37079796264855686, - "grad_norm": 0.53437739610672, + "grad_norm": 0.14989019930362701, "learning_rate": 4.8141305598235965e-05, - "loss": 0.0315, - "num_input_tokens_seen": 10255680, + "loss": 0.0266, + "num_input_tokens_seen": 10354384, "step": 1365 }, { "epoch": 0.37215619694397284, - "grad_norm": 0.3903343379497528, + "grad_norm": 0.3487226068973541, "learning_rate": 4.812782681886858e-05, - "loss": 0.0299, - "num_input_tokens_seen": 10293920, + "loss": 0.0302, + "num_input_tokens_seen": 10392464, "step": 1370 }, { "epoch": 0.3735144312393888, - "grad_norm": 0.2682500183582306, + "grad_norm": 0.5510490536689758, "learning_rate": 4.811430124440298e-05, - "loss": 0.025, - "num_input_tokens_seen": 10327136, + "loss": 0.0261, + "num_input_tokens_seen": 10425776, "step": 1375 }, { "epoch": 0.37487266553480475, - "grad_norm": 0.2662067413330078, + "grad_norm": 0.3062779903411865, "learning_rate": 4.810072890220578e-05, - "loss": 0.0257, - "num_input_tokens_seen": 10368288, + "loss": 0.0248, + "num_input_tokens_seen": 10466928, "step": 1380 }, { "epoch": 0.37623089983022073, - "grad_norm": 0.2002120167016983, + "grad_norm": 0.19611434638500214, "learning_rate": 4.808710981973824e-05, "loss": 0.0334, - "num_input_tokens_seen": 10407312, + "num_input_tokens_seen": 10506096, "step": 1385 }, { "epoch": 0.37758913412563666, - "grad_norm": 0.140855610370636, + "grad_norm": 0.1756778359413147, "learning_rate": 4.807344402455618e-05, - "loss": 0.0251, - "num_input_tokens_seen": 10444528, + "loss": 0.0242, + "num_input_tokens_seen": 10543344, "step": 1390 }, { "epoch": 0.37894736842105264, - "grad_norm": 0.4089698791503906, + "grad_norm": 0.20806434750556946, "learning_rate": 4.805973154430993e-05, - "loss": 0.0337, - "num_input_tokens_seen": 10481792, + "loss": 0.0315, + "num_input_tokens_seen": 10580800, "step": 1395 }, { "epoch": 0.38030560271646857, - "grad_norm": 0.32517045736312866, + "grad_norm": 0.17669175565242767, "learning_rate": 4.8045972406744304e-05, - "loss": 0.0259, - "num_input_tokens_seen": 10515840, + "loss": 0.0283, + "num_input_tokens_seen": 10615680, "step": 1400 }, { "epoch": 0.38166383701188455, - "grad_norm": 0.461227685213089, + "grad_norm": 0.2712690234184265, "learning_rate": 4.803216663969849e-05, - "loss": 0.0347, - "num_input_tokens_seen": 10548160, + "loss": 0.0316, + "num_input_tokens_seen": 10648240, "step": 1405 }, { "epoch": 0.3830220713073005, - "grad_norm": 0.22234497964382172, + "grad_norm": 0.3563878834247589, "learning_rate": 4.801831427110603e-05, - "loss": 0.0279, - "num_input_tokens_seen": 10585360, + "loss": 0.0276, + "num_input_tokens_seen": 10685728, "step": 1410 }, { "epoch": 0.38438030560271647, - "grad_norm": 0.28940293192863464, + "grad_norm": 0.1760042905807495, "learning_rate": 4.8004415328994785e-05, - "loss": 0.0281, - "num_input_tokens_seen": 10625824, + "loss": 0.0269, + "num_input_tokens_seen": 10726688, "step": 1415 }, { "epoch": 0.38573853989813245, - "grad_norm": 0.30109554529190063, + "grad_norm": 0.5444195866584778, "learning_rate": 4.7990469841486795e-05, - "loss": 0.033, - "num_input_tokens_seen": 10666768, + "loss": 0.0282, + "num_input_tokens_seen": 10767664, "step": 1420 }, { "epoch": 0.3870967741935484, - "grad_norm": 0.14461523294448853, + "grad_norm": 0.1705370396375656, "learning_rate": 4.797647783679833e-05, - "loss": 0.0299, - "num_input_tokens_seen": 10701488, + "loss": 0.0308, + "num_input_tokens_seen": 10802656, "step": 1425 }, { "epoch": 0.38845500848896436, - "grad_norm": 0.16967199742794037, + "grad_norm": 0.41802701354026794, "learning_rate": 4.796243934323973e-05, - "loss": 0.0313, - "num_input_tokens_seen": 10734592, + "loss": 0.0317, + "num_input_tokens_seen": 10836048, "step": 1430 }, { "epoch": 0.3898132427843803, - "grad_norm": 0.3094119727611542, + "grad_norm": 0.34107738733291626, "learning_rate": 4.7948354389215445e-05, - "loss": 0.0252, - "num_input_tokens_seen": 10770288, + "loss": 0.0237, + "num_input_tokens_seen": 10872640, "step": 1435 }, { "epoch": 0.39117147707979627, - "grad_norm": 0.18236255645751953, + "grad_norm": 0.3206077218055725, "learning_rate": 4.79342230032239e-05, - "loss": 0.0282, - "num_input_tokens_seen": 10809504, + "loss": 0.0296, + "num_input_tokens_seen": 10911840, "step": 1440 }, { "epoch": 0.3925297113752122, - "grad_norm": 0.21678385138511658, + "grad_norm": 0.3506789207458496, "learning_rate": 4.792004521385748e-05, - "loss": 0.0318, - "num_input_tokens_seen": 10839360, + "loss": 0.0272, + "num_input_tokens_seen": 10942256, "step": 1445 }, { "epoch": 0.3938879456706282, - "grad_norm": 0.18937398493289948, + "grad_norm": 0.15703804790973663, "learning_rate": 4.7905821049802436e-05, - "loss": 0.0307, - "num_input_tokens_seen": 10877936, + "loss": 0.0281, + "num_input_tokens_seen": 10980688, "step": 1450 }, { "epoch": 0.39524617996604416, - "grad_norm": 0.2053726315498352, + "grad_norm": 0.2464696168899536, "learning_rate": 4.789155053983889e-05, - "loss": 0.0255, - "num_input_tokens_seen": 10914368, + "loss": 0.025, + "num_input_tokens_seen": 11017184, "step": 1455 }, { "epoch": 0.3966044142614601, - "grad_norm": 0.5340818762779236, + "grad_norm": 0.32383862137794495, "learning_rate": 4.7877233712840695e-05, - "loss": 0.0335, - "num_input_tokens_seen": 10953632, + "loss": 0.0285, + "num_input_tokens_seen": 11056560, "step": 1460 }, { "epoch": 0.3979626485568761, - "grad_norm": 0.3034652769565582, + "grad_norm": 0.33938106894493103, "learning_rate": 4.786287059777545e-05, - "loss": 0.0283, - "num_input_tokens_seen": 10993200, + "loss": 0.0267, + "num_input_tokens_seen": 11096800, "step": 1465 }, { "epoch": 0.399320882852292, - "grad_norm": 0.221482515335083, + "grad_norm": 0.13552449643611908, "learning_rate": 4.784846122370439e-05, - "loss": 0.0292, - "num_input_tokens_seen": 11031936, + "loss": 0.0249, + "num_input_tokens_seen": 11135232, "step": 1470 }, { "epoch": 0.400679117147708, - "grad_norm": 0.3200656771659851, + "grad_norm": 0.3170987069606781, "learning_rate": 4.783400561978235e-05, - "loss": 0.0252, - "num_input_tokens_seen": 11070928, + "loss": 0.023, + "num_input_tokens_seen": 11174656, "step": 1475 }, { "epoch": 0.4020373514431239, - "grad_norm": 0.2188141644001007, + "grad_norm": 0.38975194096565247, "learning_rate": 4.7819503815257726e-05, - "loss": 0.0261, - "num_input_tokens_seen": 11108112, + "loss": 0.0248, + "num_input_tokens_seen": 11212624, "step": 1480 }, { "epoch": 0.4033955857385399, - "grad_norm": 0.3420441746711731, + "grad_norm": 0.2025246024131775, "learning_rate": 4.780495583947236e-05, - "loss": 0.0315, - "num_input_tokens_seen": 11143680, + "loss": 0.0282, + "num_input_tokens_seen": 11248320, "step": 1485 }, { "epoch": 0.4047538200339559, - "grad_norm": 0.16381587088108063, + "grad_norm": 0.14362990856170654, "learning_rate": 4.7790361721861524e-05, - "loss": 0.0314, - "num_input_tokens_seen": 11179680, + "loss": 0.0293, + "num_input_tokens_seen": 11284448, "step": 1490 }, { "epoch": 0.4061120543293718, - "grad_norm": 0.2592351734638214, + "grad_norm": 0.3514206111431122, "learning_rate": 4.777572149195387e-05, - "loss": 0.0275, - "num_input_tokens_seen": 11219328, + "loss": 0.0258, + "num_input_tokens_seen": 11324464, "step": 1495 }, { "epoch": 0.4074702886247878, - "grad_norm": 0.20483994483947754, + "grad_norm": 0.20460715889930725, "learning_rate": 4.776103517937132e-05, - "loss": 0.0282, - "num_input_tokens_seen": 11253776, + "loss": 0.0262, + "num_input_tokens_seen": 11359408, "step": 1500 }, { "epoch": 0.4088285229202037, - "grad_norm": 0.52281254529953, + "grad_norm": 0.68986976146698, "learning_rate": 4.7746302813829045e-05, - "loss": 0.028, - "num_input_tokens_seen": 11293024, + "loss": 0.0276, + "num_input_tokens_seen": 11399040, "step": 1505 }, { "epoch": 0.4101867572156197, - "grad_norm": 0.3621463179588318, + "grad_norm": 0.3801930844783783, "learning_rate": 4.773152442513541e-05, - "loss": 0.0248, - "num_input_tokens_seen": 11333632, + "loss": 0.0238, + "num_input_tokens_seen": 11440544, "step": 1510 }, { "epoch": 0.41154499151103563, - "grad_norm": 0.36055976152420044, + "grad_norm": 0.253498911857605, "learning_rate": 4.77167000431919e-05, - "loss": 0.0291, - "num_input_tokens_seen": 11371040, + "loss": 0.0273, + "num_input_tokens_seen": 11478496, "step": 1515 }, { "epoch": 0.4129032258064516, - "grad_norm": 0.39761507511138916, + "grad_norm": 0.33102190494537354, "learning_rate": 4.770182969799303e-05, - "loss": 0.0285, - "num_input_tokens_seen": 11404752, + "loss": 0.0299, + "num_input_tokens_seen": 11512976, "step": 1520 }, { "epoch": 0.4142614601018676, - "grad_norm": 0.5110732913017273, + "grad_norm": 0.2173449844121933, "learning_rate": 4.768691341962635e-05, - "loss": 0.028, - "num_input_tokens_seen": 11436128, + "loss": 0.0273, + "num_input_tokens_seen": 11544864, "step": 1525 }, { "epoch": 0.4156196943972835, - "grad_norm": 0.2108408808708191, + "grad_norm": 0.18735596537590027, "learning_rate": 4.767195123827232e-05, - "loss": 0.0255, - "num_input_tokens_seen": 11471520, + "loss": 0.026, + "num_input_tokens_seen": 11580736, "step": 1530 }, { "epoch": 0.4169779286926995, - "grad_norm": 0.893544614315033, + "grad_norm": 0.1575510948896408, "learning_rate": 4.7656943184204294e-05, - "loss": 0.0311, - "num_input_tokens_seen": 11512128, + "loss": 0.0268, + "num_input_tokens_seen": 11621552, "step": 1535 }, { "epoch": 0.41833616298811543, - "grad_norm": 0.2384331375360489, + "grad_norm": 0.2084764987230301, "learning_rate": 4.7641889287788435e-05, - "loss": 0.0282, - "num_input_tokens_seen": 11544192, + "loss": 0.0285, + "num_input_tokens_seen": 11653872, "step": 1540 }, { "epoch": 0.4196943972835314, - "grad_norm": 0.22635774314403534, + "grad_norm": 0.16623786091804504, "learning_rate": 4.762678957948366e-05, - "loss": 0.0292, - "num_input_tokens_seen": 11581104, + "loss": 0.027, + "num_input_tokens_seen": 11691136, "step": 1545 }, { "epoch": 0.42105263157894735, - "grad_norm": 0.20283222198486328, + "grad_norm": 0.20180337131023407, "learning_rate": 4.761164408984157e-05, "loss": 0.0267, - "num_input_tokens_seen": 11623152, + "num_input_tokens_seen": 11733456, "step": 1550 }, { "epoch": 0.42241086587436333, - "grad_norm": 0.21886980533599854, + "grad_norm": 0.21168194711208344, "learning_rate": 4.759645284950641e-05, - "loss": 0.0259, - "num_input_tokens_seen": 11660048, + "loss": 0.0247, + "num_input_tokens_seen": 11770864, "step": 1555 }, { "epoch": 0.4237691001697793, - "grad_norm": 0.26957905292510986, + "grad_norm": 0.2326972335577011, "learning_rate": 4.758121588921499e-05, - "loss": 0.0286, - "num_input_tokens_seen": 11696480, + "loss": 0.0273, + "num_input_tokens_seen": 11807680, "step": 1560 }, { "epoch": 0.42512733446519524, - "grad_norm": 0.1835126429796219, + "grad_norm": 0.2745097279548645, "learning_rate": 4.7565933239796635e-05, - "loss": 0.0207, - "num_input_tokens_seen": 11733040, + "loss": 0.0209, + "num_input_tokens_seen": 11844496, "step": 1565 }, { "epoch": 0.4264855687606112, - "grad_norm": 0.14217287302017212, + "grad_norm": 0.4001617431640625, "learning_rate": 4.755060493217309e-05, - "loss": 0.0277, - "num_input_tokens_seen": 11771520, + "loss": 0.0281, + "num_input_tokens_seen": 11883280, "step": 1570 }, { "epoch": 0.42784380305602715, - "grad_norm": 0.18406495451927185, + "grad_norm": 0.19287899136543274, "learning_rate": 4.7535230997358494e-05, - "loss": 0.0296, - "num_input_tokens_seen": 11808976, + "loss": 0.0282, + "num_input_tokens_seen": 11921776, "step": 1575 }, { "epoch": 0.42920203735144313, - "grad_norm": 0.1346544623374939, + "grad_norm": 0.4963703453540802, "learning_rate": 4.751981146645932e-05, - "loss": 0.0256, - "num_input_tokens_seen": 11847568, + "loss": 0.0259, + "num_input_tokens_seen": 11961024, "step": 1580 }, { "epoch": 0.43056027164685906, - "grad_norm": 0.5972577929496765, + "grad_norm": 0.5623283386230469, "learning_rate": 4.750434637067427e-05, - "loss": 0.0308, - "num_input_tokens_seen": 11881040, + "loss": 0.0309, + "num_input_tokens_seen": 11994912, "step": 1585 }, { "epoch": 0.43191850594227504, - "grad_norm": 0.18195278942584991, + "grad_norm": 0.1441938281059265, "learning_rate": 4.748883574129425e-05, - "loss": 0.0283, - "num_input_tokens_seen": 11921616, + "loss": 0.0288, + "num_input_tokens_seen": 12036448, "step": 1590 }, { "epoch": 0.433276740237691, - "grad_norm": 0.2183893322944641, + "grad_norm": 0.21960236132144928, "learning_rate": 4.74732796097023e-05, - "loss": 0.0271, - "num_input_tokens_seen": 11954592, + "loss": 0.0259, + "num_input_tokens_seen": 12070384, "step": 1595 }, { "epoch": 0.43463497453310695, - "grad_norm": 0.18252725899219513, + "grad_norm": 0.14779631793498993, "learning_rate": 4.745767800737352e-05, - "loss": 0.0252, - "num_input_tokens_seen": 11991936, + "loss": 0.0247, + "num_input_tokens_seen": 12108064, "step": 1600 }, { "epoch": 0.43599320882852294, - "grad_norm": 0.1436891257762909, + "grad_norm": 0.26157209277153015, "learning_rate": 4.7442030965875024e-05, - "loss": 0.0265, - "num_input_tokens_seen": 12028624, + "loss": 0.0275, + "num_input_tokens_seen": 12145280, "step": 1605 }, { "epoch": 0.43735144312393887, - "grad_norm": 0.2771035134792328, + "grad_norm": 0.2584926187992096, "learning_rate": 4.742633851686584e-05, - "loss": 0.027, - "num_input_tokens_seen": 12062736, + "loss": 0.0282, + "num_input_tokens_seen": 12180176, "step": 1610 }, { "epoch": 0.43870967741935485, - "grad_norm": 0.1807335615158081, + "grad_norm": 0.30715203285217285, "learning_rate": 4.7410600692096896e-05, - "loss": 0.0283, - "num_input_tokens_seen": 12098096, + "loss": 0.0286, + "num_input_tokens_seen": 12216000, "step": 1615 }, { "epoch": 0.4400679117147708, - "grad_norm": 0.19942434132099152, + "grad_norm": 0.2845460772514343, "learning_rate": 4.739481752341091e-05, - "loss": 0.0278, - "num_input_tokens_seen": 12134832, + "loss": 0.0296, + "num_input_tokens_seen": 12252624, "step": 1620 }, { "epoch": 0.44142614601018676, - "grad_norm": 0.15251871943473816, + "grad_norm": 0.1422175168991089, "learning_rate": 4.737898904274235e-05, - "loss": 0.0317, - "num_input_tokens_seen": 12174384, + "loss": 0.0303, + "num_input_tokens_seen": 12292464, "step": 1625 }, { "epoch": 0.44278438030560274, - "grad_norm": 0.1950218379497528, + "grad_norm": 0.25150784850120544, "learning_rate": 4.736311528211738e-05, - "loss": 0.0259, - "num_input_tokens_seen": 12210864, + "loss": 0.024, + "num_input_tokens_seen": 12328608, "step": 1630 }, { "epoch": 0.44414261460101867, - "grad_norm": 0.35181164741516113, + "grad_norm": 0.21775801479816437, "learning_rate": 4.734719627365377e-05, - "loss": 0.0232, - "num_input_tokens_seen": 12248144, + "loss": 0.0229, + "num_input_tokens_seen": 12366160, "step": 1635 }, { "epoch": 0.44550084889643465, - "grad_norm": 0.32509610056877136, + "grad_norm": 0.16745975613594055, "learning_rate": 4.733123204956082e-05, - "loss": 0.0286, - "num_input_tokens_seen": 12287664, + "loss": 0.0263, + "num_input_tokens_seen": 12406128, "step": 1640 }, { "epoch": 0.4468590831918506, - "grad_norm": 0.13142088055610657, + "grad_norm": 0.14613209664821625, "learning_rate": 4.7315222642139354e-05, - "loss": 0.0293, - "num_input_tokens_seen": 12326016, + "loss": 0.0299, + "num_input_tokens_seen": 12445120, "step": 1645 }, { "epoch": 0.44821731748726656, - "grad_norm": 0.28249120712280273, + "grad_norm": 0.25045669078826904, "learning_rate": 4.729916808378159e-05, - "loss": 0.0263, - "num_input_tokens_seen": 12362688, + "loss": 0.0253, + "num_input_tokens_seen": 12481664, "step": 1650 }, { "epoch": 0.4495755517826825, - "grad_norm": 0.1116885244846344, + "grad_norm": 0.3300274908542633, "learning_rate": 4.728306840697111e-05, - "loss": 0.0284, - "num_input_tokens_seen": 12405664, + "loss": 0.0289, + "num_input_tokens_seen": 12525408, "step": 1655 }, { "epoch": 0.4509337860780985, - "grad_norm": 0.2330375611782074, + "grad_norm": 0.2658381760120392, "learning_rate": 4.726692364428279e-05, - "loss": 0.0277, - "num_input_tokens_seen": 12442976, + "loss": 0.0289, + "num_input_tokens_seen": 12563056, "step": 1660 }, { "epoch": 0.45229202037351446, - "grad_norm": 0.26222336292266846, + "grad_norm": 0.23014353215694427, "learning_rate": 4.725073382838272e-05, - "loss": 0.0269, - "num_input_tokens_seen": 12477280, + "loss": 0.0289, + "num_input_tokens_seen": 12597280, "step": 1665 }, { "epoch": 0.4536502546689304, - "grad_norm": 0.16548945009708405, + "grad_norm": 0.14829672873020172, "learning_rate": 4.723449899202814e-05, - "loss": 0.0232, - "num_input_tokens_seen": 12514912, + "loss": 0.0246, + "num_input_tokens_seen": 12635664, "step": 1670 }, { "epoch": 0.45500848896434637, - "grad_norm": 0.16111703217029572, + "grad_norm": 0.1661553680896759, "learning_rate": 4.721821916806741e-05, - "loss": 0.0235, - "num_input_tokens_seen": 12550768, + "loss": 0.0231, + "num_input_tokens_seen": 12672000, "step": 1675 }, { "epoch": 0.4563667232597623, - "grad_norm": 0.15511640906333923, + "grad_norm": 0.14962486922740936, "learning_rate": 4.720189438943989e-05, - "loss": 0.0273, - "num_input_tokens_seen": 12584224, + "loss": 0.0283, + "num_input_tokens_seen": 12706944, "step": 1680 }, { "epoch": 0.4577249575551783, - "grad_norm": 0.17751246690750122, + "grad_norm": 0.3210920989513397, "learning_rate": 4.71855246891759e-05, - "loss": 0.0268, - "num_input_tokens_seen": 12619744, + "loss": 0.0285, + "num_input_tokens_seen": 12742864, "step": 1685 }, { "epoch": 0.4590831918505942, - "grad_norm": 0.18277476727962494, + "grad_norm": 0.27664288878440857, "learning_rate": 4.716911010039665e-05, - "loss": 0.0273, - "num_input_tokens_seen": 12656096, + "loss": 0.0257, + "num_input_tokens_seen": 12779888, "step": 1690 }, { "epoch": 0.4604414261460102, - "grad_norm": 0.1668909639120102, + "grad_norm": 0.2759992182254791, "learning_rate": 4.71526506563142e-05, - "loss": 0.0285, - "num_input_tokens_seen": 12689616, + "loss": 0.0273, + "num_input_tokens_seen": 12813744, "step": 1695 }, { "epoch": 0.4617996604414261, - "grad_norm": 0.1537911593914032, + "grad_norm": 0.17563682794570923, "learning_rate": 4.713614639023132e-05, - "loss": 0.0263, - "num_input_tokens_seen": 12725888, + "loss": 0.0268, + "num_input_tokens_seen": 12850880, "step": 1700 }, { "epoch": 0.4631578947368421, - "grad_norm": 0.2596486210823059, + "grad_norm": 0.15091799199581146, "learning_rate": 4.711959733554152e-05, - "loss": 0.0272, - "num_input_tokens_seen": 12762736, + "loss": 0.0268, + "num_input_tokens_seen": 12888560, "step": 1705 }, { "epoch": 0.4645161290322581, - "grad_norm": 0.14532437920570374, + "grad_norm": 0.320006787776947, "learning_rate": 4.710300352572888e-05, - "loss": 0.0281, - "num_input_tokens_seen": 12795744, + "loss": 0.0263, + "num_input_tokens_seen": 12921024, "step": 1710 }, { "epoch": 0.465874363327674, - "grad_norm": 0.12359660863876343, + "grad_norm": 0.1724865585565567, "learning_rate": 4.708636499436809e-05, - "loss": 0.0276, - "num_input_tokens_seen": 12832144, + "loss": 0.0283, + "num_input_tokens_seen": 12957360, "step": 1715 }, { "epoch": 0.46723259762309, - "grad_norm": 0.12314409762620926, + "grad_norm": 0.41510701179504395, "learning_rate": 4.706968177512429e-05, - "loss": 0.0277, - "num_input_tokens_seen": 12869984, + "loss": 0.0265, + "num_input_tokens_seen": 12995520, "step": 1720 }, { "epoch": 0.4685908319185059, - "grad_norm": 0.1500571072101593, + "grad_norm": 0.18736788630485535, "learning_rate": 4.705295390175304e-05, - "loss": 0.0282, - "num_input_tokens_seen": 12909792, + "loss": 0.0271, + "num_input_tokens_seen": 13035936, "step": 1725 }, { "epoch": 0.4699490662139219, - "grad_norm": 0.1509535014629364, + "grad_norm": 0.3337257504463196, "learning_rate": 4.703618140810025e-05, - "loss": 0.0238, - "num_input_tokens_seen": 12950608, + "loss": 0.0242, + "num_input_tokens_seen": 13077408, "step": 1730 }, { "epoch": 0.47130730050933783, - "grad_norm": 0.13484525680541992, + "grad_norm": 0.14263437688350677, "learning_rate": 4.701936432810213e-05, - "loss": 0.0273, - "num_input_tokens_seen": 12989392, + "loss": 0.0254, + "num_input_tokens_seen": 13116272, "step": 1735 }, { "epoch": 0.4726655348047538, - "grad_norm": 0.14007943868637085, + "grad_norm": 0.24350497126579285, "learning_rate": 4.700250269578508e-05, - "loss": 0.0254, - "num_input_tokens_seen": 13025376, + "loss": 0.0252, + "num_input_tokens_seen": 13152384, "step": 1740 }, { "epoch": 0.4740237691001698, - "grad_norm": 0.13771818578243256, + "grad_norm": 0.20950376987457275, "learning_rate": 4.698559654526566e-05, - "loss": 0.0238, - "num_input_tokens_seen": 13056672, + "loss": 0.0227, + "num_input_tokens_seen": 13184144, "step": 1745 }, { "epoch": 0.47538200339558573, - "grad_norm": 0.1391749233007431, + "grad_norm": 0.25091150403022766, "learning_rate": 4.69686459107505e-05, - "loss": 0.0256, - "num_input_tokens_seen": 13091216, + "loss": 0.0235, + "num_input_tokens_seen": 13219408, "step": 1750 }, { "epoch": 0.4767402376910017, - "grad_norm": 0.11934872716665268, + "grad_norm": 0.2391463965177536, "learning_rate": 4.695165082653622e-05, - "loss": 0.0258, - "num_input_tokens_seen": 13132928, + "loss": 0.0273, + "num_input_tokens_seen": 13261136, "step": 1755 }, { "epoch": 0.47809847198641764, - "grad_norm": 0.15750788152217865, + "grad_norm": 0.3595443665981293, "learning_rate": 4.69346113270094e-05, - "loss": 0.0255, - "num_input_tokens_seen": 13169040, + "loss": 0.0248, + "num_input_tokens_seen": 13297648, "step": 1760 }, { "epoch": 0.4794567062818336, - "grad_norm": 0.17250920832157135, + "grad_norm": 0.2288849651813507, "learning_rate": 4.6917527446646483e-05, - "loss": 0.0235, - "num_input_tokens_seen": 13205680, + "loss": 0.0238, + "num_input_tokens_seen": 13334832, "step": 1765 }, { "epoch": 0.48081494057724955, - "grad_norm": 0.19248589873313904, + "grad_norm": 0.22412221133708954, "learning_rate": 4.69003992200137e-05, - "loss": 0.0247, - "num_input_tokens_seen": 13241712, + "loss": 0.0284, + "num_input_tokens_seen": 13371456, "step": 1770 }, { "epoch": 0.48217317487266553, - "grad_norm": 0.21364861726760864, + "grad_norm": 0.22353029251098633, "learning_rate": 4.6883226681767004e-05, - "loss": 0.029, - "num_input_tokens_seen": 13274704, + "loss": 0.0322, + "num_input_tokens_seen": 13404896, "step": 1775 }, { "epoch": 0.4835314091680815, - "grad_norm": 0.1440795660018921, + "grad_norm": 0.19858214259147644, "learning_rate": 4.6866009866652005e-05, - "loss": 0.0249, - "num_input_tokens_seen": 13310640, + "loss": 0.0256, + "num_input_tokens_seen": 13441152, "step": 1780 }, { "epoch": 0.48488964346349744, - "grad_norm": 0.13817016780376434, + "grad_norm": 0.18588006496429443, "learning_rate": 4.684874880950392e-05, - "loss": 0.0247, - "num_input_tokens_seen": 13343552, + "loss": 0.03, + "num_input_tokens_seen": 13474320, "step": 1785 }, { "epoch": 0.4862478777589134, - "grad_norm": 0.16756582260131836, + "grad_norm": 0.3674563765525818, "learning_rate": 4.6831443545247466e-05, - "loss": 0.0274, - "num_input_tokens_seen": 13381296, + "loss": 0.0269, + "num_input_tokens_seen": 13512848, "step": 1790 }, { "epoch": 0.48760611205432935, - "grad_norm": 0.14246462285518646, + "grad_norm": 0.4992508888244629, "learning_rate": 4.681409410889681e-05, - "loss": 0.0261, - "num_input_tokens_seen": 13415104, + "loss": 0.026, + "num_input_tokens_seen": 13547040, "step": 1795 }, { "epoch": 0.48896434634974534, - "grad_norm": 0.3316088914871216, + "grad_norm": 0.21229961514472961, "learning_rate": 4.6796700535555485e-05, - "loss": 0.0278, - "num_input_tokens_seen": 13449552, + "loss": 0.0255, + "num_input_tokens_seen": 13581760, "step": 1800 }, { "epoch": 0.49032258064516127, - "grad_norm": 0.1533086746931076, + "grad_norm": 0.15784433484077454, "learning_rate": 4.677926286041634e-05, - "loss": 0.0269, - "num_input_tokens_seen": 13488736, + "loss": 0.0266, + "num_input_tokens_seen": 13620464, "step": 1805 }, { "epoch": 0.49168081494057725, - "grad_norm": 0.2998937666416168, + "grad_norm": 0.33834564685821533, "learning_rate": 4.676178111876145e-05, - "loss": 0.0281, - "num_input_tokens_seen": 13529104, + "loss": 0.0258, + "num_input_tokens_seen": 13661088, "step": 1810 }, { "epoch": 0.49303904923599323, - "grad_norm": 0.2908264100551605, + "grad_norm": 0.24264143407344818, "learning_rate": 4.674425534596204e-05, - "loss": 0.0283, - "num_input_tokens_seen": 13570864, + "loss": 0.0282, + "num_input_tokens_seen": 13703040, "step": 1815 }, { "epoch": 0.49439728353140916, - "grad_norm": 0.14119704067707062, + "grad_norm": 0.30105990171432495, "learning_rate": 4.672668557747845e-05, - "loss": 0.0259, - "num_input_tokens_seen": 13610128, + "loss": 0.0282, + "num_input_tokens_seen": 13742640, "step": 1820 }, { "epoch": 0.49575551782682514, - "grad_norm": 0.12032008171081543, + "grad_norm": 0.2030404657125473, "learning_rate": 4.670907184886001e-05, - "loss": 0.0278, - "num_input_tokens_seen": 13652000, + "loss": 0.0285, + "num_input_tokens_seen": 13784768, "step": 1825 }, { "epoch": 0.49711375212224107, - "grad_norm": 0.11397311836481094, + "grad_norm": 0.1125498041510582, "learning_rate": 4.669141419574501e-05, - "loss": 0.0271, - "num_input_tokens_seen": 13692768, + "loss": 0.027, + "num_input_tokens_seen": 13825856, "step": 1830 }, { "epoch": 0.49847198641765705, - "grad_norm": 0.39403828978538513, + "grad_norm": 0.24217714369297028, "learning_rate": 4.6673712653860625e-05, - "loss": 0.0225, - "num_input_tokens_seen": 13727984, + "loss": 0.025, + "num_input_tokens_seen": 13861392, "step": 1835 }, { "epoch": 0.499830220713073, - "grad_norm": 0.15944211184978485, + "grad_norm": 0.17508868873119354, "learning_rate": 4.66559672590228e-05, - "loss": 0.0264, - "num_input_tokens_seen": 13763472, + "loss": 0.0268, + "num_input_tokens_seen": 13897168, "step": 1840 }, { "epoch": 0.501188455008489, - "grad_norm": 0.2627827525138855, + "grad_norm": 0.1860286444425583, "learning_rate": 4.663817804713624e-05, - "loss": 0.0243, - "num_input_tokens_seen": 13801040, + "loss": 0.0253, + "num_input_tokens_seen": 13935248, "step": 1845 }, { "epoch": 0.5025466893039049, - "grad_norm": 0.2110561579465866, + "grad_norm": 0.33110344409942627, "learning_rate": 4.6620345054194294e-05, - "loss": 0.0263, - "num_input_tokens_seen": 13844384, + "loss": 0.0258, + "num_input_tokens_seen": 13978736, "step": 1850 }, { "epoch": 0.5039049235993209, - "grad_norm": 0.10817272961139679, + "grad_norm": 0.1553443819284439, "learning_rate": 4.660246831627888e-05, - "loss": 0.0283, - "num_input_tokens_seen": 13884464, + "loss": 0.0276, + "num_input_tokens_seen": 14019120, "step": 1855 }, { "epoch": 0.5052631578947369, - "grad_norm": 0.16684973239898682, + "grad_norm": 0.186492457985878, "learning_rate": 4.658454786956045e-05, - "loss": 0.0284, - "num_input_tokens_seen": 13919792, + "loss": 0.0298, + "num_input_tokens_seen": 14055296, "step": 1860 }, { "epoch": 0.5066213921901528, - "grad_norm": 0.1647118330001831, + "grad_norm": 0.1794458031654358, "learning_rate": 4.6566583750297884e-05, - "loss": 0.0282, - "num_input_tokens_seen": 13953808, + "loss": 0.0276, + "num_input_tokens_seen": 14089984, "step": 1865 }, { "epoch": 0.5079796264855687, - "grad_norm": 0.11503421515226364, + "grad_norm": 0.1574382483959198, "learning_rate": 4.6548575994838416e-05, - "loss": 0.0255, - "num_input_tokens_seen": 13995472, + "loss": 0.0257, + "num_input_tokens_seen": 14132128, "step": 1870 }, { "epoch": 0.5093378607809848, - "grad_norm": 0.27880796790122986, + "grad_norm": 0.25899288058280945, "learning_rate": 4.653052463961758e-05, - "loss": 0.029, - "num_input_tokens_seen": 14028560, + "loss": 0.0285, + "num_input_tokens_seen": 14165472, "step": 1875 }, { "epoch": 0.5106960950764007, - "grad_norm": 0.1407376378774643, + "grad_norm": 0.4177064597606659, "learning_rate": 4.651242972115913e-05, - "loss": 0.0285, - "num_input_tokens_seen": 14069184, + "loss": 0.0298, + "num_input_tokens_seen": 14207264, "step": 1880 }, { "epoch": 0.5120543293718166, - "grad_norm": 0.11815478652715683, + "grad_norm": 0.5803402066230774, "learning_rate": 4.649429127607496e-05, - "loss": 0.0256, - "num_input_tokens_seen": 14108000, + "loss": 0.031, + "num_input_tokens_seen": 14246432, "step": 1885 }, { "epoch": 0.5134125636672326, - "grad_norm": 0.12185004353523254, + "grad_norm": 0.10990933328866959, "learning_rate": 4.6476109341065025e-05, - "loss": 0.0224, - "num_input_tokens_seen": 14145392, + "loss": 0.0226, + "num_input_tokens_seen": 14284272, "step": 1890 }, { "epoch": 0.5147707979626486, - "grad_norm": 0.19128887355327606, + "grad_norm": 0.3699043095111847, "learning_rate": 4.645788395291727e-05, - "loss": 0.0248, - "num_input_tokens_seen": 14189520, + "loss": 0.0273, + "num_input_tokens_seen": 14328560, "step": 1895 }, { "epoch": 0.5161290322580645, - "grad_norm": 0.20426525175571442, + "grad_norm": 0.19346311688423157, "learning_rate": 4.6439615148507586e-05, - "loss": 0.0231, - "num_input_tokens_seen": 14222928, + "loss": 0.0236, + "num_input_tokens_seen": 14362448, "step": 1900 }, { "epoch": 0.5174872665534804, - "grad_norm": 0.2289218306541443, + "grad_norm": 0.4302923381328583, "learning_rate": 4.6421302964799686e-05, - "loss": 0.0235, - "num_input_tokens_seen": 14262368, + "loss": 0.0252, + "num_input_tokens_seen": 14402384, "step": 1905 }, { "epoch": 0.5188455008488965, - "grad_norm": 0.24412193894386292, + "grad_norm": 0.2833784818649292, "learning_rate": 4.640294743884505e-05, - "loss": 0.0306, - "num_input_tokens_seen": 14303248, + "loss": 0.028, + "num_input_tokens_seen": 14443648, "step": 1910 }, { "epoch": 0.5202037351443124, - "grad_norm": 0.12855592370033264, + "grad_norm": 0.15876516699790955, "learning_rate": 4.6384548607782875e-05, - "loss": 0.0258, - "num_input_tokens_seen": 14348608, + "loss": 0.0268, + "num_input_tokens_seen": 14489648, "step": 1915 }, { "epoch": 0.5215619694397283, - "grad_norm": 0.12102772295475006, + "grad_norm": 0.17712220549583435, "learning_rate": 4.6366106508839965e-05, - "loss": 0.025, - "num_input_tokens_seen": 14381552, + "loss": 0.0254, + "num_input_tokens_seen": 14522608, "step": 1920 }, { "epoch": 0.5229202037351444, - "grad_norm": 0.430797278881073, + "grad_norm": 0.17544355988502502, "learning_rate": 4.634762117933067e-05, - "loss": 0.0285, - "num_input_tokens_seen": 14423312, + "loss": 0.0261, + "num_input_tokens_seen": 14564320, "step": 1925 }, { "epoch": 0.5242784380305603, - "grad_norm": 0.1516580879688263, + "grad_norm": 0.30121102929115295, "learning_rate": 4.6329092656656805e-05, - "loss": 0.0265, - "num_input_tokens_seen": 14465024, + "loss": 0.026, + "num_input_tokens_seen": 14605696, "step": 1930 }, { "epoch": 0.5256366723259762, - "grad_norm": 0.13434027135372162, + "grad_norm": 0.30117642879486084, "learning_rate": 4.631052097830759e-05, - "loss": 0.0243, - "num_input_tokens_seen": 14509616, + "loss": 0.0266, + "num_input_tokens_seen": 14650272, "step": 1935 }, { "epoch": 0.5269949066213921, - "grad_norm": 0.13272634148597717, + "grad_norm": 0.18201656639575958, "learning_rate": 4.6291906181859545e-05, - "loss": 0.0268, - "num_input_tokens_seen": 14548736, + "loss": 0.0263, + "num_input_tokens_seen": 14689808, "step": 1940 }, { "epoch": 0.5283531409168082, - "grad_norm": 0.2559536099433899, + "grad_norm": 0.1785014122724533, "learning_rate": 4.627324830497645e-05, - "loss": 0.0261, - "num_input_tokens_seen": 14589728, + "loss": 0.0276, + "num_input_tokens_seen": 14730384, "step": 1945 }, { "epoch": 0.5297113752122241, - "grad_norm": 0.1998368501663208, + "grad_norm": 0.26255160570144653, "learning_rate": 4.625454738540925e-05, - "loss": 0.0253, - "num_input_tokens_seen": 14622912, + "loss": 0.026, + "num_input_tokens_seen": 14763824, "step": 1950 }, { "epoch": 0.53106960950764, - "grad_norm": 0.3474322557449341, + "grad_norm": 0.5375460386276245, "learning_rate": 4.623580346099598e-05, - "loss": 0.0277, - "num_input_tokens_seen": 14661056, + "loss": 0.0297, + "num_input_tokens_seen": 14802272, "step": 1955 }, { "epoch": 0.5324278438030561, - "grad_norm": 0.1897965967655182, + "grad_norm": 0.20488527417182922, "learning_rate": 4.621701656966165e-05, - "loss": 0.0253, - "num_input_tokens_seen": 14697168, + "loss": 0.0254, + "num_input_tokens_seen": 14838512, "step": 1960 }, { "epoch": 0.533786078098472, - "grad_norm": 0.10813125222921371, + "grad_norm": 0.10828451812267303, "learning_rate": 4.6198186749418264e-05, - "loss": 0.0281, - "num_input_tokens_seen": 14738704, + "loss": 0.029, + "num_input_tokens_seen": 14880336, "step": 1965 }, { "epoch": 0.5351443123938879, - "grad_norm": 0.1803312450647354, + "grad_norm": 0.1435866355895996, "learning_rate": 4.6179314038364655e-05, - "loss": 0.0241, - "num_input_tokens_seen": 14783952, + "loss": 0.0242, + "num_input_tokens_seen": 14926192, "step": 1970 }, { "epoch": 0.5365025466893039, - "grad_norm": 0.13063985109329224, + "grad_norm": 0.3474206030368805, "learning_rate": 4.616039847468643e-05, - "loss": 0.0258, - "num_input_tokens_seen": 14821488, + "loss": 0.0259, + "num_input_tokens_seen": 14963728, "step": 1975 }, { "epoch": 0.5378607809847199, - "grad_norm": 0.5962217450141907, + "grad_norm": 0.21107153594493866, "learning_rate": 4.614144009665593e-05, - "loss": 0.0267, - "num_input_tokens_seen": 14863040, + "loss": 0.0273, + "num_input_tokens_seen": 15005696, "step": 1980 }, { "epoch": 0.5392190152801358, - "grad_norm": 0.2340570092201233, + "grad_norm": 0.11030969768762589, "learning_rate": 4.612243894263208e-05, - "loss": 0.0254, - "num_input_tokens_seen": 14900080, + "loss": 0.0233, + "num_input_tokens_seen": 15042624, "step": 1985 }, { "epoch": 0.5405772495755518, - "grad_norm": 0.5166763663291931, + "grad_norm": 0.2565583884716034, "learning_rate": 4.6103395051060416e-05, - "loss": 0.0257, - "num_input_tokens_seen": 14943776, + "loss": 0.0247, + "num_input_tokens_seen": 15087152, "step": 1990 }, { "epoch": 0.5419354838709678, - "grad_norm": 0.7536749243736267, + "grad_norm": 0.4781357944011688, "learning_rate": 4.608430846047288e-05, "loss": 0.0261, - "num_input_tokens_seen": 14982416, + "num_input_tokens_seen": 15126336, "step": 1995 }, { "epoch": 0.5432937181663837, - "grad_norm": 0.1266593039035797, + "grad_norm": 0.42503800988197327, "learning_rate": 4.606517920948784e-05, - "loss": 0.0262, - "num_input_tokens_seen": 15019744, + "loss": 0.0264, + "num_input_tokens_seen": 15163696, "step": 2000 }, { "epoch": 0.5446519524617996, - "grad_norm": 0.14823010563850403, + "grad_norm": 0.1270599663257599, "learning_rate": 4.604600733680999e-05, - "loss": 0.0243, - "num_input_tokens_seen": 15057040, + "loss": 0.0245, + "num_input_tokens_seen": 15201376, "step": 2005 }, { "epoch": 0.5460101867572156, - "grad_norm": 0.12136420607566833, + "grad_norm": 0.16608229279518127, "learning_rate": 4.6026792881230245e-05, - "loss": 0.0224, - "num_input_tokens_seen": 15100848, + "loss": 0.0238, + "num_input_tokens_seen": 15245552, "step": 2010 }, { "epoch": 0.5473684210526316, - "grad_norm": 0.1685768961906433, + "grad_norm": 0.17838163673877716, "learning_rate": 4.600753588162568e-05, - "loss": 0.0238, - "num_input_tokens_seen": 15138000, + "loss": 0.024, + "num_input_tokens_seen": 15283280, "step": 2015 }, { "epoch": 0.5487266553480475, - "grad_norm": 0.1304417997598648, + "grad_norm": 0.2895098328590393, "learning_rate": 4.5988236376959456e-05, - "loss": 0.025, - "num_input_tokens_seen": 15178672, + "loss": 0.026, + "num_input_tokens_seen": 15324048, "step": 2020 }, { "epoch": 0.5500848896434635, - "grad_norm": 0.19704753160476685, + "grad_norm": 0.6138197779655457, "learning_rate": 4.5968894406280746e-05, - "loss": 0.029, - "num_input_tokens_seen": 15216368, + "loss": 0.0291, + "num_input_tokens_seen": 15362080, "step": 2025 }, { "epoch": 0.5514431239388795, - "grad_norm": 0.21792292594909668, + "grad_norm": 0.13826696574687958, "learning_rate": 4.594951000872461e-05, - "loss": 0.0256, - "num_input_tokens_seen": 15251536, + "loss": 0.0293, + "num_input_tokens_seen": 15397728, "step": 2030 }, { "epoch": 0.5528013582342954, - "grad_norm": 0.22549626231193542, + "grad_norm": 0.18032673001289368, "learning_rate": 4.593008322351199e-05, - "loss": 0.0273, - "num_input_tokens_seen": 15290752, + "loss": 0.0263, + "num_input_tokens_seen": 15437072, "step": 2035 }, { "epoch": 0.5541595925297114, - "grad_norm": 0.36358192563056946, + "grad_norm": 0.20598404109477997, "learning_rate": 4.5910614089949565e-05, - "loss": 0.0286, - "num_input_tokens_seen": 15328576, + "loss": 0.0279, + "num_input_tokens_seen": 15475424, "step": 2040 }, { "epoch": 0.5555178268251273, - "grad_norm": 0.15428614616394043, + "grad_norm": 0.2108670026063919, "learning_rate": 4.589110264742972e-05, - "loss": 0.0243, - "num_input_tokens_seen": 15369712, + "loss": 0.0256, + "num_input_tokens_seen": 15516784, "step": 2045 }, { "epoch": 0.5568760611205433, - "grad_norm": 0.13032685220241547, + "grad_norm": 0.22188526391983032, "learning_rate": 4.5871548935430425e-05, - "loss": 0.028, - "num_input_tokens_seen": 15410688, + "loss": 0.0272, + "num_input_tokens_seen": 15557824, "step": 2050 }, { "epoch": 0.5582342954159593, - "grad_norm": 0.5173495411872864, + "grad_norm": 0.12047035992145538, "learning_rate": 4.585195299351519e-05, - "loss": 0.0265, - "num_input_tokens_seen": 15450144, + "loss": 0.026, + "num_input_tokens_seen": 15597520, "step": 2055 }, { "epoch": 0.5595925297113752, - "grad_norm": 0.4513947367668152, + "grad_norm": 0.11901520937681198, "learning_rate": 4.583231486133297e-05, - "loss": 0.0276, - "num_input_tokens_seen": 15482736, + "loss": 0.0261, + "num_input_tokens_seen": 15629936, "step": 2060 }, { "epoch": 0.5609507640067912, - "grad_norm": 0.19882826507091522, + "grad_norm": 0.11543812602758408, "learning_rate": 4.581263457861806e-05, - "loss": 0.0248, - "num_input_tokens_seen": 15518944, + "loss": 0.0241, + "num_input_tokens_seen": 15666720, "step": 2065 }, { "epoch": 0.5623089983022072, - "grad_norm": 0.2639986276626587, + "grad_norm": 0.4095693528652191, "learning_rate": 4.5792912185190086e-05, - "loss": 0.0283, - "num_input_tokens_seen": 15556848, + "loss": 0.0284, + "num_input_tokens_seen": 15704304, "step": 2070 }, { "epoch": 0.5636672325976231, - "grad_norm": 0.18964111804962158, + "grad_norm": 0.35938504338264465, "learning_rate": 4.577314772095382e-05, - "loss": 0.0272, - "num_input_tokens_seen": 15598608, + "loss": 0.0279, + "num_input_tokens_seen": 15746800, "step": 2075 }, { "epoch": 0.565025466893039, - "grad_norm": 0.24403981864452362, + "grad_norm": 0.2134908139705658, "learning_rate": 4.57533412258992e-05, - "loss": 0.026, - "num_input_tokens_seen": 15642512, + "loss": 0.0249, + "num_input_tokens_seen": 15791024, "step": 2080 }, { "epoch": 0.566383701188455, - "grad_norm": 0.1339646279811859, + "grad_norm": 0.31043681502342224, "learning_rate": 4.5733492740101194e-05, - "loss": 0.0269, - "num_input_tokens_seen": 15680160, + "loss": 0.0273, + "num_input_tokens_seen": 15828640, "step": 2085 }, { "epoch": 0.567741935483871, - "grad_norm": 0.3777838945388794, + "grad_norm": 0.34750521183013916, "learning_rate": 4.5713602303719724e-05, - "loss": 0.028, - "num_input_tokens_seen": 15716384, + "loss": 0.0314, + "num_input_tokens_seen": 15865248, "step": 2090 }, { "epoch": 0.5691001697792869, - "grad_norm": 0.22131739556789398, + "grad_norm": 0.2740504741668701, "learning_rate": 4.569366995699961e-05, - "loss": 0.0281, - "num_input_tokens_seen": 15750208, + "loss": 0.0284, + "num_input_tokens_seen": 15899120, "step": 2095 }, { "epoch": 0.5704584040747029, - "grad_norm": 0.1790771633386612, + "grad_norm": 0.2650560140609741, "learning_rate": 4.567369574027044e-05, "loss": 0.0247, - "num_input_tokens_seen": 15785904, + "num_input_tokens_seen": 15935312, "step": 2100 }, { "epoch": 0.5718166383701189, - "grad_norm": 0.3013524115085602, + "grad_norm": 0.12665826082229614, "learning_rate": 4.565367969394654e-05, "loss": 0.0242, - "num_input_tokens_seen": 15829680, + "num_input_tokens_seen": 15979776, "step": 2105 }, { "epoch": 0.5731748726655348, - "grad_norm": 0.2135661393404007, + "grad_norm": 0.22043241560459137, "learning_rate": 4.563362185852687e-05, - "loss": 0.0281, - "num_input_tokens_seen": 15868944, + "loss": 0.0301, + "num_input_tokens_seen": 16019264, "step": 2110 }, { "epoch": 0.5745331069609507, - "grad_norm": 0.18564175069332123, + "grad_norm": 0.1927461177110672, "learning_rate": 4.561352227459494e-05, "loss": 0.027, - "num_input_tokens_seen": 15902960, + "num_input_tokens_seen": 16053056, "step": 2115 }, { "epoch": 0.5758913412563668, - "grad_norm": 0.19253119826316833, + "grad_norm": 0.20486770570278168, "learning_rate": 4.5593380982818734e-05, - "loss": 0.0265, - "num_input_tokens_seen": 15939120, + "loss": 0.0267, + "num_input_tokens_seen": 16089648, "step": 2120 }, { "epoch": 0.5772495755517827, - "grad_norm": 0.15768589079380035, + "grad_norm": 0.20726722478866577, "learning_rate": 4.5573198023950616e-05, - "loss": 0.0235, - "num_input_tokens_seen": 15977280, + "loss": 0.0249, + "num_input_tokens_seen": 16128304, "step": 2125 }, { "epoch": 0.5786078098471986, - "grad_norm": 0.18736566603183746, + "grad_norm": 0.18296000361442566, "learning_rate": 4.555297343882725e-05, - "loss": 0.0268, - "num_input_tokens_seen": 16011824, + "loss": 0.0277, + "num_input_tokens_seen": 16162928, "step": 2130 }, { "epoch": 0.5799660441426147, - "grad_norm": 0.22006145119667053, + "grad_norm": 0.24425186216831207, "learning_rate": 4.553270726836955e-05, - "loss": 0.0244, - "num_input_tokens_seen": 16044304, + "loss": 0.0271, + "num_input_tokens_seen": 16195552, "step": 2135 }, { "epoch": 0.5813242784380306, - "grad_norm": 0.12608657777309418, + "grad_norm": 0.26592743396759033, "learning_rate": 4.551239955358255e-05, - "loss": 0.0262, - "num_input_tokens_seen": 16081520, + "loss": 0.027, + "num_input_tokens_seen": 16233472, "step": 2140 }, { "epoch": 0.5826825127334465, - "grad_norm": 0.41784927248954773, + "grad_norm": 0.26992639899253845, "learning_rate": 4.549205033555535e-05, - "loss": 0.0262, - "num_input_tokens_seen": 16119696, + "loss": 0.0282, + "num_input_tokens_seen": 16272080, "step": 2145 }, { "epoch": 0.5840407470288624, - "grad_norm": 0.13431523740291595, + "grad_norm": 0.8555641770362854, "learning_rate": 4.5471659655461005e-05, - "loss": 0.0231, - "num_input_tokens_seen": 16156480, + "loss": 0.0259, + "num_input_tokens_seen": 16309056, "step": 2150 }, { "epoch": 0.5853989813242785, - "grad_norm": 0.20990709960460663, + "grad_norm": 0.29441556334495544, "learning_rate": 4.5451227554556506e-05, - "loss": 0.0262, - "num_input_tokens_seen": 16194240, + "loss": 0.026, + "num_input_tokens_seen": 16347344, "step": 2155 }, { "epoch": 0.5867572156196944, - "grad_norm": 0.18062323331832886, + "grad_norm": 0.15875646471977234, "learning_rate": 4.5430754074182596e-05, - "loss": 0.0262, - "num_input_tokens_seen": 16231072, + "loss": 0.0242, + "num_input_tokens_seen": 16384592, "step": 2160 }, { "epoch": 0.5881154499151103, - "grad_norm": 0.20339329540729523, + "grad_norm": 0.21036814153194427, "learning_rate": 4.541023925576378e-05, - "loss": 0.0228, - "num_input_tokens_seen": 16269600, + "loss": 0.0212, + "num_input_tokens_seen": 16423472, "step": 2165 }, { "epoch": 0.5894736842105263, - "grad_norm": 0.11199624836444855, + "grad_norm": 0.1776880919933319, "learning_rate": 4.538968314080819e-05, - "loss": 0.0229, - "num_input_tokens_seen": 16314304, + "loss": 0.0258, + "num_input_tokens_seen": 16468512, "step": 2170 }, { "epoch": 0.5908319185059423, - "grad_norm": 0.46193984150886536, + "grad_norm": 0.29343488812446594, "learning_rate": 4.5369085770907526e-05, - "loss": 0.0254, - "num_input_tokens_seen": 16349488, + "loss": 0.0273, + "num_input_tokens_seen": 16504256, "step": 2175 }, { "epoch": 0.5921901528013582, - "grad_norm": 0.10603177547454834, + "grad_norm": 0.11752564460039139, "learning_rate": 4.534844718773694e-05, - "loss": 0.0237, - "num_input_tokens_seen": 16394416, + "loss": 0.0262, + "num_input_tokens_seen": 16549328, "step": 2180 }, { "epoch": 0.5935483870967742, - "grad_norm": 0.1664574295282364, + "grad_norm": 0.21026937663555145, "learning_rate": 4.5327767433054986e-05, - "loss": 0.0245, - "num_input_tokens_seen": 16429664, + "loss": 0.0242, + "num_input_tokens_seen": 16584832, "step": 2185 }, { "epoch": 0.5949066213921902, - "grad_norm": 0.14920122921466827, + "grad_norm": 0.2376561462879181, "learning_rate": 4.5307046548703516e-05, - "loss": 0.0209, - "num_input_tokens_seen": 16466720, + "loss": 0.0204, + "num_input_tokens_seen": 16621968, "step": 2190 }, { "epoch": 0.5962648556876061, - "grad_norm": 0.11372318118810654, + "grad_norm": 0.10410748422145844, "learning_rate": 4.5286284576607606e-05, - "loss": 0.0273, - "num_input_tokens_seen": 16501984, + "loss": 0.0267, + "num_input_tokens_seen": 16656896, "step": 2195 }, { "epoch": 0.597623089983022, - "grad_norm": 0.12579818069934845, + "grad_norm": 0.11979422718286514, "learning_rate": 4.5265481558775455e-05, - "loss": 0.0242, - "num_input_tokens_seen": 16542864, + "loss": 0.024, + "num_input_tokens_seen": 16698192, "step": 2200 }, { "epoch": 0.598981324278438, - "grad_norm": 0.15003421902656555, + "grad_norm": 0.11816656589508057, "learning_rate": 4.5244637537298326e-05, - "loss": 0.0255, - "num_input_tokens_seen": 16576064, + "loss": 0.0247, + "num_input_tokens_seen": 16731152, "step": 2205 }, { "epoch": 0.600339558573854, - "grad_norm": 0.12731970846652985, + "grad_norm": 0.1390502154827118, "learning_rate": 4.5223752554350444e-05, - "loss": 0.0259, - "num_input_tokens_seen": 16608272, + "loss": 0.0255, + "num_input_tokens_seen": 16764464, "step": 2210 }, { "epoch": 0.6016977928692699, - "grad_norm": 0.13704077899456024, + "grad_norm": 0.15591445565223694, "learning_rate": 4.520282665218889e-05, - "loss": 0.0291, - "num_input_tokens_seen": 16644352, + "loss": 0.0294, + "num_input_tokens_seen": 16801200, "step": 2215 }, { "epoch": 0.6030560271646859, - "grad_norm": 0.6394554972648621, + "grad_norm": 0.2112419158220291, "learning_rate": 4.518185987315357e-05, - "loss": 0.0279, - "num_input_tokens_seen": 16680880, + "loss": 0.0286, + "num_input_tokens_seen": 16838384, "step": 2220 }, { "epoch": 0.6044142614601019, - "grad_norm": 0.13698209822177887, + "grad_norm": 0.28153911232948303, "learning_rate": 4.516085225966707e-05, - "loss": 0.0273, - "num_input_tokens_seen": 16717872, + "loss": 0.0265, + "num_input_tokens_seen": 16875744, "step": 2225 }, { "epoch": 0.6057724957555178, - "grad_norm": 0.09434419125318527, + "grad_norm": 0.16385655105113983, "learning_rate": 4.513980385423461e-05, - "loss": 0.024, - "num_input_tokens_seen": 16754208, + "loss": 0.0256, + "num_input_tokens_seen": 16912368, "step": 2230 }, { "epoch": 0.6071307300509338, - "grad_norm": 0.20352062582969666, + "grad_norm": 0.2537645995616913, "learning_rate": 4.5118714699443945e-05, - "loss": 0.0259, - "num_input_tokens_seen": 16788736, + "loss": 0.0269, + "num_input_tokens_seen": 16947056, "step": 2235 }, { "epoch": 0.6084889643463497, - "grad_norm": 0.3498345911502838, + "grad_norm": 0.18609529733657837, "learning_rate": 4.509758483796529e-05, - "loss": 0.0275, - "num_input_tokens_seen": 16829536, + "loss": 0.0271, + "num_input_tokens_seen": 16987728, "step": 2240 }, { "epoch": 0.6098471986417657, - "grad_norm": 0.17752856016159058, + "grad_norm": 0.1468685269355774, "learning_rate": 4.507641431255119e-05, - "loss": 0.0252, - "num_input_tokens_seen": 16865712, + "loss": 0.025, + "num_input_tokens_seen": 17024624, "step": 2245 }, { "epoch": 0.6112054329371817, - "grad_norm": 0.29275667667388916, + "grad_norm": 0.12068816274404526, "learning_rate": 4.50552031660365e-05, - "loss": 0.0327, - "num_input_tokens_seen": 16900352, + "loss": 0.0285, + "num_input_tokens_seen": 17059712, "step": 2250 }, { "epoch": 0.6125636672325976, - "grad_norm": 0.6861482262611389, + "grad_norm": 0.2306588739156723, "learning_rate": 4.503395144133826e-05, - "loss": 0.0257, - "num_input_tokens_seen": 16938592, + "loss": 0.0232, + "num_input_tokens_seen": 17098528, "step": 2255 }, { "epoch": 0.6139219015280136, - "grad_norm": 0.2180936634540558, + "grad_norm": 0.19265441596508026, "learning_rate": 4.5012659181455584e-05, - "loss": 0.0255, - "num_input_tokens_seen": 16979968, + "loss": 0.0233, + "num_input_tokens_seen": 17139600, "step": 2260 }, { "epoch": 0.6152801358234296, - "grad_norm": 0.21258105337619781, + "grad_norm": 0.09756200015544891, "learning_rate": 4.499132642946964e-05, - "loss": 0.0239, - "num_input_tokens_seen": 17022208, + "loss": 0.0218, + "num_input_tokens_seen": 17181984, "step": 2265 }, { "epoch": 0.6166383701188455, - "grad_norm": 0.1807708889245987, + "grad_norm": 0.5074624419212341, "learning_rate": 4.496995322854349e-05, - "loss": 0.0271, - "num_input_tokens_seen": 17060048, + "loss": 0.025, + "num_input_tokens_seen": 17220080, "step": 2270 }, { "epoch": 0.6179966044142614, - "grad_norm": 0.13885177671909332, + "grad_norm": 0.12793275713920593, "learning_rate": 4.494853962192208e-05, - "loss": 0.0261, - "num_input_tokens_seen": 17096960, + "loss": 0.0241, + "num_input_tokens_seen": 17256928, "step": 2275 }, { "epoch": 0.6193548387096774, - "grad_norm": 0.09891972690820694, + "grad_norm": 0.3631274700164795, "learning_rate": 4.4927085652932065e-05, - "loss": 0.0241, - "num_input_tokens_seen": 17142224, + "loss": 0.0225, + "num_input_tokens_seen": 17302912, "step": 2280 }, { "epoch": 0.6207130730050934, - "grad_norm": 0.1127699613571167, + "grad_norm": 0.33916783332824707, "learning_rate": 4.490559136498179e-05, - "loss": 0.0257, - "num_input_tokens_seen": 17180048, + "loss": 0.0259, + "num_input_tokens_seen": 17341248, "step": 2285 }, { "epoch": 0.6220713073005093, - "grad_norm": 0.35028254985809326, + "grad_norm": 0.25044992566108704, "learning_rate": 4.488405680156117e-05, - "loss": 0.0289, - "num_input_tokens_seen": 17215616, + "loss": 0.0285, + "num_input_tokens_seen": 17376704, "step": 2290 }, { "epoch": 0.6234295415959253, - "grad_norm": 0.387719988822937, + "grad_norm": 0.17477650940418243, "learning_rate": 4.486248200624163e-05, - "loss": 0.0249, - "num_input_tokens_seen": 17256720, + "loss": 0.0222, + "num_input_tokens_seen": 17417984, "step": 2295 }, { "epoch": 0.6247877758913413, - "grad_norm": 0.6141459941864014, + "grad_norm": 0.5116367936134338, "learning_rate": 4.4840867022675956e-05, - "loss": 0.0235, - "num_input_tokens_seen": 17298208, + "loss": 0.0269, + "num_input_tokens_seen": 17459712, "step": 2300 }, { "epoch": 0.6261460101867572, - "grad_norm": 0.3162858784198761, + "grad_norm": 0.21305516362190247, "learning_rate": 4.481921189459831e-05, - "loss": 0.0268, - "num_input_tokens_seen": 17338896, + "loss": 0.0235, + "num_input_tokens_seen": 17500576, "step": 2305 }, { "epoch": 0.6275042444821731, - "grad_norm": 0.37655404210090637, + "grad_norm": 0.1127389445900917, "learning_rate": 4.4797516665824e-05, - "loss": 0.0269, - "num_input_tokens_seen": 17378640, + "loss": 0.0247, + "num_input_tokens_seen": 17539936, "step": 2310 }, { "epoch": 0.6288624787775892, - "grad_norm": 0.1231534481048584, + "grad_norm": 0.10590340197086334, "learning_rate": 4.477578138024955e-05, - "loss": 0.0273, - "num_input_tokens_seen": 17418256, + "loss": 0.0259, + "num_input_tokens_seen": 17580160, "step": 2315 }, { "epoch": 0.6302207130730051, - "grad_norm": 0.6492210626602173, + "grad_norm": 0.2787824869155884, "learning_rate": 4.4754006081852475e-05, - "loss": 0.0296, - "num_input_tokens_seen": 17456256, + "loss": 0.0266, + "num_input_tokens_seen": 17618608, "step": 2320 }, { "epoch": 0.631578947368421, - "grad_norm": 0.11649473756551743, + "grad_norm": 0.1470077782869339, "learning_rate": 4.473219081469127e-05, - "loss": 0.0249, - "num_input_tokens_seen": 17488880, + "loss": 0.0276, + "num_input_tokens_seen": 17651632, "step": 2325 }, { "epoch": 0.6329371816638371, - "grad_norm": 0.11178428679704666, + "grad_norm": 0.22611761093139648, "learning_rate": 4.4710335622905306e-05, - "loss": 0.0251, - "num_input_tokens_seen": 17531264, + "loss": 0.0249, + "num_input_tokens_seen": 17694128, "step": 2330 }, { "epoch": 0.634295415959253, - "grad_norm": 0.32606610655784607, + "grad_norm": 0.5525403618812561, "learning_rate": 4.468844055071473e-05, - "loss": 0.0266, - "num_input_tokens_seen": 17565808, + "loss": 0.0262, + "num_input_tokens_seen": 17728768, "step": 2335 }, { "epoch": 0.6356536502546689, - "grad_norm": 0.1995607167482376, + "grad_norm": 0.33610549569129944, "learning_rate": 4.4666505642420365e-05, - "loss": 0.027, - "num_input_tokens_seen": 17597904, + "loss": 0.026, + "num_input_tokens_seen": 17761184, "step": 2340 }, { "epoch": 0.6370118845500848, - "grad_norm": 0.2187601625919342, + "grad_norm": 0.3351183533668518, "learning_rate": 4.4644530942403664e-05, - "loss": 0.031, - "num_input_tokens_seen": 17635904, + "loss": 0.0283, + "num_input_tokens_seen": 17799056, "step": 2345 }, { "epoch": 0.6383701188455009, - "grad_norm": 0.2723333537578583, + "grad_norm": 0.35296234488487244, "learning_rate": 4.462251649512656e-05, - "loss": 0.0315, - "num_input_tokens_seen": 17673744, + "loss": 0.0285, + "num_input_tokens_seen": 17836640, "step": 2350 }, { "epoch": 0.6397283531409168, - "grad_norm": 0.3148815929889679, + "grad_norm": 0.20329713821411133, "learning_rate": 4.460046234513144e-05, - "loss": 0.0266, - "num_input_tokens_seen": 17706192, + "loss": 0.0253, + "num_input_tokens_seen": 17869696, "step": 2355 }, { "epoch": 0.6410865874363327, - "grad_norm": 0.3615208566188812, + "grad_norm": 0.1659703552722931, "learning_rate": 4.4578368537040985e-05, - "loss": 0.0274, - "num_input_tokens_seen": 17739920, + "loss": 0.0249, + "num_input_tokens_seen": 17903760, "step": 2360 }, { "epoch": 0.6424448217317488, - "grad_norm": 0.5001614093780518, + "grad_norm": 0.115732342004776, "learning_rate": 4.455623511555815e-05, - "loss": 0.0292, - "num_input_tokens_seen": 17777920, + "loss": 0.0242, + "num_input_tokens_seen": 17942224, "step": 2365 }, { "epoch": 0.6438030560271647, - "grad_norm": 0.23898570239543915, + "grad_norm": 0.12361738830804825, "learning_rate": 4.453406212546604e-05, - "loss": 0.0282, - "num_input_tokens_seen": 17816048, + "loss": 0.0242, + "num_input_tokens_seen": 17980592, "step": 2370 }, { "epoch": 0.6451612903225806, - "grad_norm": 0.1742892563343048, + "grad_norm": 0.17419292032718658, "learning_rate": 4.451184961162779e-05, - "loss": 0.0277, - "num_input_tokens_seen": 17852848, + "loss": 0.0282, + "num_input_tokens_seen": 18017536, "step": 2375 }, { "epoch": 0.6465195246179966, - "grad_norm": 0.34150776267051697, + "grad_norm": 0.2260887175798416, "learning_rate": 4.4489597618986525e-05, - "loss": 0.0282, - "num_input_tokens_seen": 17892176, + "loss": 0.0251, + "num_input_tokens_seen": 18057536, "step": 2380 }, { "epoch": 0.6478777589134126, - "grad_norm": 0.22130222618579865, + "grad_norm": 0.23515047132968903, "learning_rate": 4.446730619256525e-05, - "loss": 0.0271, - "num_input_tokens_seen": 17928256, + "loss": 0.0266, + "num_input_tokens_seen": 18093728, "step": 2385 }, { "epoch": 0.6492359932088285, - "grad_norm": 0.19967462122440338, + "grad_norm": 0.13124418258666992, "learning_rate": 4.444497537746676e-05, - "loss": 0.0322, - "num_input_tokens_seen": 17963936, + "loss": 0.0271, + "num_input_tokens_seen": 18129680, "step": 2390 }, { "epoch": 0.6505942275042444, - "grad_norm": 0.45904630422592163, + "grad_norm": 0.17432615160942078, "learning_rate": 4.442260521887353e-05, - "loss": 0.0286, - "num_input_tokens_seen": 18002736, + "loss": 0.0257, + "num_input_tokens_seen": 18168752, "step": 2395 }, { "epoch": 0.6519524617996605, - "grad_norm": 0.14646275341510773, + "grad_norm": 0.22730430960655212, "learning_rate": 4.440019576204766e-05, - "loss": 0.0296, - "num_input_tokens_seen": 18041696, + "loss": 0.0264, + "num_input_tokens_seen": 18208256, "step": 2400 }, { "epoch": 0.6533106960950764, - "grad_norm": 0.2030028998851776, + "grad_norm": 0.11903349310159683, "learning_rate": 4.4377747052330735e-05, - "loss": 0.0248, - "num_input_tokens_seen": 18075152, + "loss": 0.0226, + "num_input_tokens_seen": 18241520, "step": 2405 }, { "epoch": 0.6546689303904923, - "grad_norm": 0.2630223035812378, + "grad_norm": 0.249945729970932, "learning_rate": 4.43552591351438e-05, - "loss": 0.0292, - "num_input_tokens_seen": 18113280, + "loss": 0.0296, + "num_input_tokens_seen": 18279808, "step": 2410 }, { "epoch": 0.6560271646859083, - "grad_norm": 0.11152347922325134, + "grad_norm": 0.10452856123447418, "learning_rate": 4.4332732055987195e-05, - "loss": 0.0268, - "num_input_tokens_seen": 18151696, + "loss": 0.0253, + "num_input_tokens_seen": 18318080, "step": 2415 }, { "epoch": 0.6573853989813243, - "grad_norm": 0.22519704699516296, + "grad_norm": 0.23551829159259796, "learning_rate": 4.4310165860440516e-05, "loss": 0.0261, - "num_input_tokens_seen": 18192272, + "num_input_tokens_seen": 18358688, "step": 2420 }, { "epoch": 0.6587436332767402, - "grad_norm": 0.15470291674137115, + "grad_norm": 0.11425314843654633, "learning_rate": 4.4287560594162506e-05, - "loss": 0.0235, - "num_input_tokens_seen": 18232384, + "loss": 0.0225, + "num_input_tokens_seen": 18399072, "step": 2425 }, { "epoch": 0.6601018675721562, - "grad_norm": 0.4941823482513428, + "grad_norm": 0.5756813883781433, "learning_rate": 4.426491630289093e-05, - "loss": 0.0264, - "num_input_tokens_seen": 18268352, + "loss": 0.0266, + "num_input_tokens_seen": 18435456, "step": 2430 }, { "epoch": 0.6614601018675722, - "grad_norm": 0.6315612196922302, + "grad_norm": 0.118367999792099, "learning_rate": 4.424223303244256e-05, - "loss": 0.0303, - "num_input_tokens_seen": 18300656, + "loss": 0.0297, + "num_input_tokens_seen": 18468080, "step": 2435 }, { "epoch": 0.6628183361629881, - "grad_norm": 0.12763909995555878, + "grad_norm": 0.1625058650970459, "learning_rate": 4.421951082871301e-05, - "loss": 0.0248, - "num_input_tokens_seen": 18340144, + "loss": 0.0219, + "num_input_tokens_seen": 18508096, "step": 2440 }, { "epoch": 0.6641765704584041, - "grad_norm": 0.14142149686813354, + "grad_norm": 0.11004887521266937, "learning_rate": 4.4196749737676654e-05, - "loss": 0.0277, - "num_input_tokens_seen": 18371024, + "loss": 0.0263, + "num_input_tokens_seen": 18539664, "step": 2445 }, { "epoch": 0.66553480475382, - "grad_norm": 0.3923947513103485, + "grad_norm": 0.2465517818927765, "learning_rate": 4.417394980538658e-05, - "loss": 0.0268, - "num_input_tokens_seen": 18412736, + "loss": 0.027, + "num_input_tokens_seen": 18581520, "step": 2450 }, { "epoch": 0.666893039049236, - "grad_norm": 0.13830935955047607, + "grad_norm": 0.1776878982782364, "learning_rate": 4.415111107797445e-05, - "loss": 0.0281, - "num_input_tokens_seen": 18452944, + "loss": 0.0266, + "num_input_tokens_seen": 18621872, "step": 2455 }, { "epoch": 0.668251273344652, - "grad_norm": 0.3059670329093933, + "grad_norm": 0.2720218896865845, "learning_rate": 4.4128233601650415e-05, - "loss": 0.0255, - "num_input_tokens_seen": 18491024, + "loss": 0.0257, + "num_input_tokens_seen": 18660208, "step": 2460 }, { "epoch": 0.6696095076400679, - "grad_norm": 0.30845439434051514, + "grad_norm": 0.11636809259653091, "learning_rate": 4.4105317422703037e-05, - "loss": 0.0265, - "num_input_tokens_seen": 18530992, + "loss": 0.0253, + "num_input_tokens_seen": 18700624, "step": 2465 }, { "epoch": 0.6709677419354839, - "grad_norm": 0.35695680975914, + "grad_norm": 0.1320783644914627, "learning_rate": 4.408236258749917e-05, - "loss": 0.0272, - "num_input_tokens_seen": 18565984, + "loss": 0.0219, + "num_input_tokens_seen": 18736336, "step": 2470 }, { "epoch": 0.6723259762308998, - "grad_norm": 0.23867429792881012, + "grad_norm": 0.16952580213546753, "learning_rate": 4.4059369142483914e-05, - "loss": 0.0239, - "num_input_tokens_seen": 18602704, + "loss": 0.0227, + "num_input_tokens_seen": 18773600, "step": 2475 }, { "epoch": 0.6736842105263158, - "grad_norm": 0.4084792733192444, + "grad_norm": 0.14464515447616577, "learning_rate": 4.403633713418045e-05, - "loss": 0.0272, - "num_input_tokens_seen": 18638928, + "loss": 0.0248, + "num_input_tokens_seen": 18810304, "step": 2480 }, { "epoch": 0.6750424448217317, - "grad_norm": 0.38006100058555603, + "grad_norm": 0.19801065325737, "learning_rate": 4.401326660919002e-05, - "loss": 0.0285, - "num_input_tokens_seen": 18672144, + "loss": 0.0263, + "num_input_tokens_seen": 18843888, "step": 2485 }, { "epoch": 0.6764006791171477, - "grad_norm": 0.3477916717529297, + "grad_norm": 0.22485984861850739, "learning_rate": 4.399015761419176e-05, - "loss": 0.0272, - "num_input_tokens_seen": 18716688, + "loss": 0.0264, + "num_input_tokens_seen": 18888720, "step": 2490 }, { "epoch": 0.6777589134125637, - "grad_norm": 0.22653205692768097, + "grad_norm": 0.09697315841913223, "learning_rate": 4.396701019594269e-05, - "loss": 0.0291, - "num_input_tokens_seen": 18758928, + "loss": 0.0259, + "num_input_tokens_seen": 18931200, "step": 2495 }, { "epoch": 0.6791171477079796, - "grad_norm": 0.1076347827911377, + "grad_norm": 0.13689640164375305, "learning_rate": 4.3943824401277546e-05, - "loss": 0.0259, - "num_input_tokens_seen": 18798080, + "loss": 0.0235, + "num_input_tokens_seen": 18970560, "step": 2500 }, { "epoch": 0.6804753820033956, - "grad_norm": 0.1599104404449463, + "grad_norm": 0.1327354907989502, "learning_rate": 4.392060027710869e-05, - "loss": 0.0247, - "num_input_tokens_seen": 18837104, + "loss": 0.0231, + "num_input_tokens_seen": 19009744, "step": 2505 }, { "epoch": 0.6818336162988116, - "grad_norm": 0.09773153066635132, + "grad_norm": 0.14102785289287567, "learning_rate": 4.3897337870426085e-05, - "loss": 0.0238, - "num_input_tokens_seen": 18878928, + "loss": 0.0248, + "num_input_tokens_seen": 19051728, "step": 2510 }, { "epoch": 0.6831918505942275, - "grad_norm": 0.23311588168144226, + "grad_norm": 0.4447525441646576, "learning_rate": 4.387403722829711e-05, - "loss": 0.0274, - "num_input_tokens_seen": 18917680, + "loss": 0.0259, + "num_input_tokens_seen": 19090800, "step": 2515 }, { "epoch": 0.6845500848896434, - "grad_norm": 0.1395694464445114, + "grad_norm": 0.11741897463798523, "learning_rate": 4.385069839786655e-05, - "loss": 0.0258, - "num_input_tokens_seen": 18961104, + "loss": 0.0262, + "num_input_tokens_seen": 19134704, "step": 2520 }, { "epoch": 0.6859083191850595, - "grad_norm": 0.158938929438591, + "grad_norm": 0.10196805745363235, "learning_rate": 4.382732142635641e-05, - "loss": 0.0274, - "num_input_tokens_seen": 19002176, + "loss": 0.0257, + "num_input_tokens_seen": 19176192, "step": 2525 }, { "epoch": 0.6872665534804754, - "grad_norm": 0.14974190294742584, + "grad_norm": 0.10233482718467712, "learning_rate": 4.380390636106589e-05, - "loss": 0.0253, - "num_input_tokens_seen": 19036656, + "loss": 0.0233, + "num_input_tokens_seen": 19211056, "step": 2530 }, { "epoch": 0.6886247877758913, - "grad_norm": 0.1805826723575592, + "grad_norm": 0.1705702543258667, "learning_rate": 4.378045324937128e-05, - "loss": 0.0251, - "num_input_tokens_seen": 19069872, + "loss": 0.0268, + "num_input_tokens_seen": 19244512, "step": 2535 }, { "epoch": 0.6899830220713074, - "grad_norm": 0.23182888329029083, + "grad_norm": 0.3287692964076996, "learning_rate": 4.375696213872582e-05, - "loss": 0.0259, - "num_input_tokens_seen": 19108576, + "loss": 0.0257, + "num_input_tokens_seen": 19283680, "step": 2540 }, { "epoch": 0.6913412563667233, - "grad_norm": 0.18740224838256836, + "grad_norm": 0.1752558797597885, "learning_rate": 4.3733433076659654e-05, - "loss": 0.0253, - "num_input_tokens_seen": 19147840, + "loss": 0.0272, + "num_input_tokens_seen": 19323456, "step": 2545 }, { "epoch": 0.6926994906621392, - "grad_norm": 0.12707221508026123, + "grad_norm": 0.11579793691635132, "learning_rate": 4.3709866110779706e-05, - "loss": 0.0289, - "num_input_tokens_seen": 19183568, + "loss": 0.0273, + "num_input_tokens_seen": 19359600, "step": 2550 }, { "epoch": 0.6940577249575551, - "grad_norm": 0.0957900732755661, + "grad_norm": 0.09934961050748825, "learning_rate": 4.36862612887696e-05, - "loss": 0.0282, - "num_input_tokens_seen": 19221776, + "loss": 0.0269, + "num_input_tokens_seen": 19397920, "step": 2555 }, { "epoch": 0.6954159592529712, - "grad_norm": 0.22898326814174652, + "grad_norm": 0.2692791521549225, "learning_rate": 4.3662618658389555e-05, - "loss": 0.0254, - "num_input_tokens_seen": 19269136, + "loss": 0.026, + "num_input_tokens_seen": 19445776, "step": 2560 }, { "epoch": 0.6967741935483871, - "grad_norm": 0.19019053876399994, + "grad_norm": 0.2398129552602768, "learning_rate": 4.363893826747628e-05, - "loss": 0.0257, - "num_input_tokens_seen": 19306208, + "loss": 0.0261, + "num_input_tokens_seen": 19482544, "step": 2565 }, { "epoch": 0.698132427843803, - "grad_norm": 0.1544862985610962, + "grad_norm": 0.12823820114135742, "learning_rate": 4.361522016394288e-05, - "loss": 0.0253, - "num_input_tokens_seen": 19343616, + "loss": 0.0236, + "num_input_tokens_seen": 19520656, "step": 2570 }, { "epoch": 0.6994906621392191, - "grad_norm": 0.13102605938911438, + "grad_norm": 0.23941999673843384, "learning_rate": 4.359146439577878e-05, - "loss": 0.027, - "num_input_tokens_seen": 19384960, + "loss": 0.0271, + "num_input_tokens_seen": 19561632, "step": 2575 }, { "epoch": 0.700848896434635, - "grad_norm": 0.15033715963363647, + "grad_norm": 0.3269849717617035, "learning_rate": 4.356767101104961e-05, - "loss": 0.0269, - "num_input_tokens_seen": 19417616, + "loss": 0.0239, + "num_input_tokens_seen": 19594352, "step": 2580 }, { "epoch": 0.7022071307300509, - "grad_norm": 0.13110975921154022, + "grad_norm": 0.12291065603494644, "learning_rate": 4.354384005789711e-05, - "loss": 0.0277, - "num_input_tokens_seen": 19453792, + "loss": 0.0308, + "num_input_tokens_seen": 19630608, "step": 2585 }, { "epoch": 0.7035653650254668, - "grad_norm": 0.1275266855955124, + "grad_norm": 0.28482213616371155, "learning_rate": 4.3519971584539033e-05, "loss": 0.0227, - "num_input_tokens_seen": 19491632, + "num_input_tokens_seen": 19668592, "step": 2590 }, { "epoch": 0.7049235993208829, - "grad_norm": 0.08242320269346237, + "grad_norm": 0.1939803808927536, "learning_rate": 4.349606563926904e-05, - "loss": 0.0231, - "num_input_tokens_seen": 19533056, + "loss": 0.0249, + "num_input_tokens_seen": 19710400, "step": 2595 }, { "epoch": 0.7062818336162988, - "grad_norm": 0.10469743609428406, + "grad_norm": 0.10331094264984131, "learning_rate": 4.347212227045661e-05, - "loss": 0.0259, - "num_input_tokens_seen": 19574656, + "loss": 0.0243, + "num_input_tokens_seen": 19752672, "step": 2600 }, { "epoch": 0.7076400679117147, - "grad_norm": 0.1187630295753479, + "grad_norm": 0.12295158952474594, "learning_rate": 4.344814152654696e-05, - "loss": 0.0229, - "num_input_tokens_seen": 19614736, + "loss": 0.0219, + "num_input_tokens_seen": 19792896, "step": 2605 }, { "epoch": 0.7089983022071308, - "grad_norm": 0.10973688960075378, + "grad_norm": 0.22433802485466003, "learning_rate": 4.342412345606091e-05, - "loss": 0.0244, - "num_input_tokens_seen": 19657696, + "loss": 0.0251, + "num_input_tokens_seen": 19835872, "step": 2610 }, { "epoch": 0.7103565365025467, - "grad_norm": 0.21055111289024353, + "grad_norm": 0.10111822187900543, "learning_rate": 4.34000681075948e-05, - "loss": 0.0256, - "num_input_tokens_seen": 19694080, + "loss": 0.0238, + "num_input_tokens_seen": 19871744, "step": 2615 }, { "epoch": 0.7117147707979626, - "grad_norm": 0.15255151689052582, + "grad_norm": 0.2434312254190445, "learning_rate": 4.3375975529820414e-05, - "loss": 0.0256, - "num_input_tokens_seen": 19733472, + "loss": 0.0231, + "num_input_tokens_seen": 19911568, "step": 2620 }, { "epoch": 0.7130730050933786, - "grad_norm": 0.11297976970672607, + "grad_norm": 0.14898686110973358, "learning_rate": 4.335184577148487e-05, - "loss": 0.0249, - "num_input_tokens_seen": 19770976, + "loss": 0.0259, + "num_input_tokens_seen": 19949136, "step": 2625 }, { "epoch": 0.7144312393887946, - "grad_norm": 0.18722788989543915, + "grad_norm": 0.16223596036434174, "learning_rate": 4.332767888141047e-05, - "loss": 0.0275, - "num_input_tokens_seen": 19807664, + "loss": 0.027, + "num_input_tokens_seen": 19986208, "step": 2630 }, { "epoch": 0.7157894736842105, - "grad_norm": 0.5051147937774658, + "grad_norm": 0.2031611055135727, "learning_rate": 4.33034749084947e-05, - "loss": 0.0248, - "num_input_tokens_seen": 19841264, + "loss": 0.0238, + "num_input_tokens_seen": 20020224, "step": 2635 }, { "epoch": 0.7171477079796265, - "grad_norm": 0.09064794331789017, + "grad_norm": 0.09968316555023193, "learning_rate": 4.3279233901710036e-05, - "loss": 0.0242, - "num_input_tokens_seen": 19875200, + "loss": 0.0236, + "num_input_tokens_seen": 20054272, "step": 2640 }, { "epoch": 0.7185059422750425, - "grad_norm": 0.23146286606788635, + "grad_norm": 0.0974685475230217, "learning_rate": 4.3254955910103903e-05, - "loss": 0.0258, - "num_input_tokens_seen": 19914816, + "loss": 0.0231, + "num_input_tokens_seen": 20094544, "step": 2645 }, { "epoch": 0.7198641765704584, - "grad_norm": 0.2573460042476654, + "grad_norm": 0.2270750105381012, "learning_rate": 4.3230640982798554e-05, - "loss": 0.0278, - "num_input_tokens_seen": 19953728, + "loss": 0.0248, + "num_input_tokens_seen": 20133296, "step": 2650 }, { "epoch": 0.7212224108658744, - "grad_norm": 0.14494869112968445, + "grad_norm": 0.36050906777381897, "learning_rate": 4.3206289168990984e-05, - "loss": 0.0297, - "num_input_tokens_seen": 19999296, + "loss": 0.0283, + "num_input_tokens_seen": 20179216, "step": 2655 }, { "epoch": 0.7225806451612903, - "grad_norm": 0.3849429786205292, + "grad_norm": 0.14084535837173462, "learning_rate": 4.318190051795281e-05, - "loss": 0.0268, - "num_input_tokens_seen": 20034576, + "loss": 0.0254, + "num_input_tokens_seen": 20214656, "step": 2660 }, { "epoch": 0.7239388794567063, - "grad_norm": 0.25382348895072937, + "grad_norm": 0.15756800770759583, "learning_rate": 4.315747507903018e-05, - "loss": 0.0277, - "num_input_tokens_seen": 20064576, + "loss": 0.0257, + "num_input_tokens_seen": 20245248, "step": 2665 }, { "epoch": 0.7252971137521222, - "grad_norm": 0.35885384678840637, + "grad_norm": 0.1485278457403183, "learning_rate": 4.313301290164369e-05, - "loss": 0.0262, - "num_input_tokens_seen": 20100272, + "loss": 0.0232, + "num_input_tokens_seen": 20281040, "step": 2670 }, { "epoch": 0.7266553480475382, - "grad_norm": 0.3799697160720825, + "grad_norm": 0.10186439007520676, "learning_rate": 4.3108514035288274e-05, - "loss": 0.033, - "num_input_tokens_seen": 20141120, + "loss": 0.0268, + "num_input_tokens_seen": 20322064, "step": 2675 }, { "epoch": 0.7280135823429542, - "grad_norm": 0.40845340490341187, + "grad_norm": 0.15325681865215302, "learning_rate": 4.3083978529533065e-05, - "loss": 0.0289, - "num_input_tokens_seen": 20182192, + "loss": 0.0252, + "num_input_tokens_seen": 20363328, "step": 2680 }, { "epoch": 0.7293718166383701, - "grad_norm": 0.11542440205812454, + "grad_norm": 0.13095000386238098, "learning_rate": 4.305940643402136e-05, - "loss": 0.0232, - "num_input_tokens_seen": 20221056, + "loss": 0.0223, + "num_input_tokens_seen": 20402464, "step": 2685 }, { "epoch": 0.7307300509337861, - "grad_norm": 0.12018931657075882, + "grad_norm": 0.17261378467082977, "learning_rate": 4.30347977984705e-05, - "loss": 0.0236, - "num_input_tokens_seen": 20261712, + "loss": 0.0221, + "num_input_tokens_seen": 20443776, "step": 2690 }, { "epoch": 0.732088285229202, - "grad_norm": 0.33706116676330566, + "grad_norm": 0.1092795878648758, "learning_rate": 4.30101526726717e-05, - "loss": 0.0268, - "num_input_tokens_seen": 20305168, + "loss": 0.0253, + "num_input_tokens_seen": 20486560, "step": 2695 }, { "epoch": 0.733446519524618, - "grad_norm": 0.19121593236923218, + "grad_norm": 0.15452566742897034, "learning_rate": 4.298547110649009e-05, - "loss": 0.0254, - "num_input_tokens_seen": 20344240, + "loss": 0.0233, + "num_input_tokens_seen": 20525632, "step": 2700 }, { "epoch": 0.734804753820034, - "grad_norm": 0.10274410247802734, + "grad_norm": 0.09394962340593338, "learning_rate": 4.296075314986446e-05, - "loss": 0.026, - "num_input_tokens_seen": 20382400, + "loss": 0.0232, + "num_input_tokens_seen": 20564720, "step": 2705 }, { "epoch": 0.7361629881154499, - "grad_norm": 0.11251187324523926, + "grad_norm": 0.09640908241271973, "learning_rate": 4.293599885280727e-05, - "loss": 0.0297, - "num_input_tokens_seen": 20419584, + "loss": 0.0281, + "num_input_tokens_seen": 20603040, "step": 2710 }, { "epoch": 0.7375212224108659, - "grad_norm": 0.1792198121547699, + "grad_norm": 0.09165510535240173, "learning_rate": 4.291120826540448e-05, - "loss": 0.0261, - "num_input_tokens_seen": 20457856, + "loss": 0.0242, + "num_input_tokens_seen": 20641776, "step": 2715 }, { "epoch": 0.7388794567062819, - "grad_norm": 0.22624213993549347, + "grad_norm": 0.11923347413539886, "learning_rate": 4.288638143781551e-05, - "loss": 0.0287, - "num_input_tokens_seen": 20491840, + "loss": 0.0223, + "num_input_tokens_seen": 20675584, "step": 2720 }, { "epoch": 0.7402376910016978, - "grad_norm": 0.12644752860069275, + "grad_norm": 0.08171697705984116, "learning_rate": 4.2861518420273075e-05, - "loss": 0.0244, - "num_input_tokens_seen": 20522848, + "loss": 0.0227, + "num_input_tokens_seen": 20707536, "step": 2725 }, { "epoch": 0.7415959252971137, - "grad_norm": 0.09887095540761948, + "grad_norm": 0.08581460267305374, "learning_rate": 4.2836619263083136e-05, - "loss": 0.0291, - "num_input_tokens_seen": 20563280, + "loss": 0.0263, + "num_input_tokens_seen": 20748832, "step": 2730 }, { "epoch": 0.7429541595925298, - "grad_norm": 0.32503917813301086, + "grad_norm": 0.11031202226877213, "learning_rate": 4.281168401662476e-05, - "loss": 0.027, - "num_input_tokens_seen": 20600416, + "loss": 0.0258, + "num_input_tokens_seen": 20786656, "step": 2735 }, { "epoch": 0.7443123938879457, - "grad_norm": 0.187294602394104, + "grad_norm": 0.15797507762908936, "learning_rate": 4.2786712731350054e-05, - "loss": 0.0265, - "num_input_tokens_seen": 20631360, + "loss": 0.0248, + "num_input_tokens_seen": 20817968, "step": 2740 }, { "epoch": 0.7456706281833616, - "grad_norm": 0.10749641805887222, + "grad_norm": 0.10637655854225159, "learning_rate": 4.276170545778402e-05, - "loss": 0.0254, - "num_input_tokens_seen": 20667536, + "loss": 0.0231, + "num_input_tokens_seen": 20854032, "step": 2745 }, { "epoch": 0.7470288624787776, - "grad_norm": 0.19483469426631927, + "grad_norm": 0.16178366541862488, "learning_rate": 4.2736662246524484e-05, - "loss": 0.0301, - "num_input_tokens_seen": 20701968, + "loss": 0.0276, + "num_input_tokens_seen": 20888880, "step": 2750 }, { "epoch": 0.7483870967741936, - "grad_norm": 0.11209449917078018, + "grad_norm": 0.1685052216053009, "learning_rate": 4.2711583148241994e-05, - "loss": 0.0249, - "num_input_tokens_seen": 20736736, + "loss": 0.0256, + "num_input_tokens_seen": 20924096, "step": 2755 }, { "epoch": 0.7497453310696095, - "grad_norm": 0.12636344134807587, + "grad_norm": 0.0903959721326828, "learning_rate": 4.26864682136797e-05, - "loss": 0.0276, - "num_input_tokens_seen": 20773600, + "loss": 0.0258, + "num_input_tokens_seen": 20961184, "step": 2760 }, { "epoch": 0.7511035653650254, - "grad_norm": 0.23362913727760315, + "grad_norm": 0.10294344276189804, "learning_rate": 4.266131749365327e-05, - "loss": 0.025, - "num_input_tokens_seen": 20819552, + "loss": 0.0239, + "num_input_tokens_seen": 21007552, "step": 2765 }, { "epoch": 0.7524617996604415, - "grad_norm": 0.24717861413955688, + "grad_norm": 0.07302305102348328, "learning_rate": 4.263613103905077e-05, - "loss": 0.0234, - "num_input_tokens_seen": 20859216, + "loss": 0.0221, + "num_input_tokens_seen": 21047088, "step": 2770 }, { "epoch": 0.7538200339558574, - "grad_norm": 0.10336605459451675, + "grad_norm": 0.11375316232442856, "learning_rate": 4.261090890083257e-05, - "loss": 0.0222, - "num_input_tokens_seen": 20900080, + "loss": 0.0211, + "num_input_tokens_seen": 21088176, "step": 2775 }, { "epoch": 0.7551782682512733, - "grad_norm": 0.10610651969909668, + "grad_norm": 0.09445972740650177, "learning_rate": 4.2585651130031233e-05, - "loss": 0.0286, - "num_input_tokens_seen": 20940096, + "loss": 0.026, + "num_input_tokens_seen": 21128752, "step": 2780 }, { "epoch": 0.7565365025466892, - "grad_norm": 0.16359758377075195, + "grad_norm": 0.22416584193706512, "learning_rate": 4.2560357777751435e-05, - "loss": 0.0231, - "num_input_tokens_seen": 20973952, + "loss": 0.0224, + "num_input_tokens_seen": 21163408, "step": 2785 }, { "epoch": 0.7578947368421053, - "grad_norm": 0.10654280334711075, + "grad_norm": 0.11565962433815002, "learning_rate": 4.2535028895169825e-05, - "loss": 0.0248, - "num_input_tokens_seen": 21015392, + "loss": 0.0241, + "num_input_tokens_seen": 21205248, "step": 2790 }, { "epoch": 0.7592529711375212, - "grad_norm": 0.15208609402179718, + "grad_norm": 0.42465367913246155, "learning_rate": 4.2509664533534966e-05, - "loss": 0.0266, - "num_input_tokens_seen": 21054048, + "loss": 0.0267, + "num_input_tokens_seen": 21243984, "step": 2795 }, { "epoch": 0.7606112054329371, - "grad_norm": 0.17940235137939453, + "grad_norm": 0.2211800515651703, "learning_rate": 4.2484264744167176e-05, - "loss": 0.0238, - "num_input_tokens_seen": 21097344, + "loss": 0.0226, + "num_input_tokens_seen": 21287392, "step": 2800 }, { "epoch": 0.7619694397283532, - "grad_norm": 0.12031520903110504, + "grad_norm": 0.10530754923820496, "learning_rate": 4.245882957845848e-05, - "loss": 0.0255, - "num_input_tokens_seen": 21135392, + "loss": 0.026, + "num_input_tokens_seen": 21326064, "step": 2805 }, { "epoch": 0.7633276740237691, - "grad_norm": 0.1506468504667282, + "grad_norm": 0.398556649684906, "learning_rate": 4.2433359087872466e-05, - "loss": 0.025, - "num_input_tokens_seen": 21171104, + "loss": 0.028, + "num_input_tokens_seen": 21362304, "step": 2810 }, { "epoch": 0.764685908319185, - "grad_norm": 0.09945458918809891, + "grad_norm": 0.3231535851955414, "learning_rate": 4.24078533239442e-05, - "loss": 0.024, - "num_input_tokens_seen": 21207488, + "loss": 0.0224, + "num_input_tokens_seen": 21399360, "step": 2815 }, { "epoch": 0.766044142614601, - "grad_norm": 0.10572070628404617, + "grad_norm": 0.19478896260261536, "learning_rate": 4.238231233828013e-05, - "loss": 0.0217, - "num_input_tokens_seen": 21243168, + "loss": 0.0225, + "num_input_tokens_seen": 21435456, "step": 2820 }, { "epoch": 0.767402376910017, - "grad_norm": 0.10251931101083755, + "grad_norm": 0.09493184089660645, "learning_rate": 4.235673618255795e-05, - "loss": 0.0248, - "num_input_tokens_seen": 21279104, + "loss": 0.0254, + "num_input_tokens_seen": 21472096, "step": 2825 }, { "epoch": 0.7687606112054329, - "grad_norm": 0.11500995606184006, + "grad_norm": 0.11927445232868195, "learning_rate": 4.233112490852652e-05, - "loss": 0.0242, - "num_input_tokens_seen": 21319040, + "loss": 0.0247, + "num_input_tokens_seen": 21511584, "step": 2830 }, { "epoch": 0.7701188455008489, - "grad_norm": 0.15229789912700653, + "grad_norm": 0.1958806961774826, "learning_rate": 4.2305478568005756e-05, "loss": 0.0247, - "num_input_tokens_seen": 21358992, + "num_input_tokens_seen": 21551920, "step": 2835 }, { "epoch": 0.7714770797962649, - "grad_norm": 0.13738100230693817, + "grad_norm": 0.1231204941868782, "learning_rate": 4.227979721288653e-05, - "loss": 0.0261, - "num_input_tokens_seen": 21403296, + "loss": 0.0242, + "num_input_tokens_seen": 21596384, "step": 2840 }, { "epoch": 0.7728353140916808, - "grad_norm": 0.09878403693437576, + "grad_norm": 0.13310499489307404, "learning_rate": 4.2254080895130566e-05, - "loss": 0.0213, - "num_input_tokens_seen": 21441328, + "loss": 0.0228, + "num_input_tokens_seen": 21635168, "step": 2845 }, { "epoch": 0.7741935483870968, - "grad_norm": 0.07812755554914474, + "grad_norm": 0.11617301404476166, "learning_rate": 4.2228329666770305e-05, - "loss": 0.0222, - "num_input_tokens_seen": 21481104, + "loss": 0.0245, + "num_input_tokens_seen": 21675120, "step": 2850 }, { "epoch": 0.7755517826825127, - "grad_norm": 0.08588600158691406, + "grad_norm": 0.20366749167442322, "learning_rate": 4.2202543579908834e-05, - "loss": 0.0236, - "num_input_tokens_seen": 21520832, + "loss": 0.0229, + "num_input_tokens_seen": 21715392, "step": 2855 }, { "epoch": 0.7769100169779287, - "grad_norm": 0.09116164594888687, + "grad_norm": 0.10008393228054047, "learning_rate": 4.21767226867198e-05, - "loss": 0.0232, - "num_input_tokens_seen": 21561360, + "loss": 0.0222, + "num_input_tokens_seen": 21756176, "step": 2860 }, { "epoch": 0.7782682512733446, - "grad_norm": 0.10332862287759781, + "grad_norm": 0.3099018931388855, "learning_rate": 4.2150867039447225e-05, - "loss": 0.0205, - "num_input_tokens_seen": 21598112, + "loss": 0.0216, + "num_input_tokens_seen": 21793648, "step": 2865 }, { "epoch": 0.7796264855687606, - "grad_norm": 0.20034222304821014, + "grad_norm": 0.09043870866298676, "learning_rate": 4.212497669040547e-05, - "loss": 0.024, - "num_input_tokens_seen": 21642720, + "loss": 0.0219, + "num_input_tokens_seen": 21838384, "step": 2870 }, { "epoch": 0.7809847198641766, - "grad_norm": 0.10174475610256195, + "grad_norm": 0.42638877034187317, "learning_rate": 4.209905169197913e-05, - "loss": 0.0231, - "num_input_tokens_seen": 21682256, + "loss": 0.0252, + "num_input_tokens_seen": 21878272, "step": 2875 }, { "epoch": 0.7823429541595925, - "grad_norm": 0.5580098032951355, + "grad_norm": 0.5964987874031067, "learning_rate": 4.207309209662288e-05, - "loss": 0.0273, - "num_input_tokens_seen": 21720240, + "loss": 0.0266, + "num_input_tokens_seen": 21916976, "step": 2880 }, { "epoch": 0.7837011884550085, - "grad_norm": 0.11146796494722366, + "grad_norm": 0.08453542739152908, "learning_rate": 4.204709795686141e-05, - "loss": 0.0258, - "num_input_tokens_seen": 21757520, + "loss": 0.0236, + "num_input_tokens_seen": 21954800, "step": 2885 }, { "epoch": 0.7850594227504244, - "grad_norm": 0.1110483705997467, + "grad_norm": 0.5072471499443054, "learning_rate": 4.202106932528929e-05, - "loss": 0.0252, - "num_input_tokens_seen": 21797312, + "loss": 0.0266, + "num_input_tokens_seen": 21994688, "step": 2890 }, { "epoch": 0.7864176570458404, - "grad_norm": 0.2990809679031372, + "grad_norm": 0.10051149874925613, "learning_rate": 4.199500625457089e-05, - "loss": 0.0266, - "num_input_tokens_seen": 21834000, + "loss": 0.0231, + "num_input_tokens_seen": 22031776, "step": 2895 }, { "epoch": 0.7877758913412564, - "grad_norm": 0.13568764925003052, + "grad_norm": 0.25388047099113464, "learning_rate": 4.196890879744026e-05, - "loss": 0.0255, - "num_input_tokens_seen": 21872720, + "loss": 0.0252, + "num_input_tokens_seen": 22071408, "step": 2900 }, { "epoch": 0.7891341256366723, - "grad_norm": 0.15363161265850067, + "grad_norm": 0.35128772258758545, "learning_rate": 4.194277700670103e-05, - "loss": 0.0253, - "num_input_tokens_seen": 21907968, + "loss": 0.0233, + "num_input_tokens_seen": 22106528, "step": 2905 }, { "epoch": 0.7904923599320883, - "grad_norm": 0.2428089678287506, + "grad_norm": 0.20641636848449707, "learning_rate": 4.191661093522629e-05, - "loss": 0.0262, - "num_input_tokens_seen": 21945808, + "loss": 0.025, + "num_input_tokens_seen": 22144784, "step": 2910 }, { "epoch": 0.7918505942275043, - "grad_norm": 0.3486040532588959, + "grad_norm": 0.15850429236888885, "learning_rate": 4.189041063595848e-05, - "loss": 0.0238, - "num_input_tokens_seen": 21980112, + "loss": 0.0235, + "num_input_tokens_seen": 22179504, "step": 2915 }, { "epoch": 0.7932088285229202, - "grad_norm": 0.07942713052034378, + "grad_norm": 0.12939998507499695, "learning_rate": 4.1864176161909324e-05, - "loss": 0.026, - "num_input_tokens_seen": 22013008, + "loss": 0.0229, + "num_input_tokens_seen": 22212912, "step": 2920 }, { "epoch": 0.7945670628183361, - "grad_norm": 0.5337576270103455, + "grad_norm": 0.2017057240009308, "learning_rate": 4.183790756615966e-05, - "loss": 0.0289, - "num_input_tokens_seen": 22048704, + "loss": 0.0264, + "num_input_tokens_seen": 22249312, "step": 2925 }, { "epoch": 0.7959252971137522, - "grad_norm": 0.13330139219760895, + "grad_norm": 0.10830770432949066, "learning_rate": 4.18116049018594e-05, - "loss": 0.0287, - "num_input_tokens_seen": 22084464, + "loss": 0.0274, + "num_input_tokens_seen": 22285168, "step": 2930 }, { "epoch": 0.7972835314091681, - "grad_norm": 0.17016777396202087, + "grad_norm": 0.10986758023500443, "learning_rate": 4.178526822222736e-05, - "loss": 0.0276, - "num_input_tokens_seen": 22118688, + "loss": 0.0272, + "num_input_tokens_seen": 22319872, "step": 2935 }, { "epoch": 0.798641765704584, - "grad_norm": 0.12980084121227264, + "grad_norm": 0.13273011147975922, "learning_rate": 4.175889758055118e-05, - "loss": 0.0243, - "num_input_tokens_seen": 22155472, + "loss": 0.0228, + "num_input_tokens_seen": 22356816, "step": 2940 }, { "epoch": 0.8, - "grad_norm": 0.13821285963058472, + "grad_norm": 0.10676322132349014, "learning_rate": 4.173249303018723e-05, - "loss": 0.0257, - "num_input_tokens_seen": 22192192, + "loss": 0.0238, + "num_input_tokens_seen": 22394112, "step": 2945 }, { "epoch": 0.801358234295416, - "grad_norm": 0.30048829317092896, + "grad_norm": 0.3012424111366272, "learning_rate": 4.170605462456049e-05, - "loss": 0.0239, - "num_input_tokens_seen": 22226160, + "loss": 0.022, + "num_input_tokens_seen": 22428656, "step": 2950 }, { "epoch": 0.8027164685908319, - "grad_norm": 0.422430157661438, + "grad_norm": 0.08964573591947556, "learning_rate": 4.167958241716443e-05, - "loss": 0.0253, - "num_input_tokens_seen": 22267712, + "loss": 0.0225, + "num_input_tokens_seen": 22470768, "step": 2955 }, { "epoch": 0.8040747028862478, - "grad_norm": 0.15200664103031158, + "grad_norm": 0.18770115077495575, "learning_rate": 4.1653076461560915e-05, - "loss": 0.0259, - "num_input_tokens_seen": 22309136, + "loss": 0.0242, + "num_input_tokens_seen": 22512016, "step": 2960 }, { "epoch": 0.8054329371816639, - "grad_norm": 0.2439376562833786, + "grad_norm": 0.24836091697216034, "learning_rate": 4.162653681138009e-05, - "loss": 0.0248, - "num_input_tokens_seen": 22353200, + "loss": 0.0229, + "num_input_tokens_seen": 22556048, "step": 2965 }, { "epoch": 0.8067911714770798, - "grad_norm": 0.11424458026885986, + "grad_norm": 0.11890601366758347, "learning_rate": 4.159996352032027e-05, - "loss": 0.0263, - "num_input_tokens_seen": 22386992, + "loss": 0.0237, + "num_input_tokens_seen": 22590096, "step": 2970 }, { "epoch": 0.8081494057724957, - "grad_norm": 0.10261579602956772, + "grad_norm": 0.10518820583820343, "learning_rate": 4.157335664214787e-05, - "loss": 0.0278, - "num_input_tokens_seen": 22427920, + "loss": 0.0261, + "num_input_tokens_seen": 22631936, "step": 2975 }, { "epoch": 0.8095076400679118, - "grad_norm": 0.1063750684261322, + "grad_norm": 0.10727889835834503, "learning_rate": 4.1546716230697226e-05, - "loss": 0.0288, - "num_input_tokens_seen": 22462016, + "loss": 0.0284, + "num_input_tokens_seen": 22666128, "step": 2980 }, { "epoch": 0.8108658743633277, - "grad_norm": 0.09581667929887772, + "grad_norm": 0.08503438532352448, "learning_rate": 4.1520042339870524e-05, - "loss": 0.0256, - "num_input_tokens_seen": 22495344, + "loss": 0.0241, + "num_input_tokens_seen": 22699728, "step": 2985 }, { "epoch": 0.8122241086587436, - "grad_norm": 0.45115751028060913, + "grad_norm": 0.17671318352222443, "learning_rate": 4.149333502363772e-05, - "loss": 0.0266, - "num_input_tokens_seen": 22532144, + "loss": 0.0235, + "num_input_tokens_seen": 22736800, "step": 2990 }, { "epoch": 0.8135823429541595, - "grad_norm": 0.1769515872001648, + "grad_norm": 0.0852126032114029, "learning_rate": 4.146659433603637e-05, - "loss": 0.0268, - "num_input_tokens_seen": 22574688, + "loss": 0.0243, + "num_input_tokens_seen": 22779680, "step": 2995 }, { "epoch": 0.8149405772495756, - "grad_norm": 0.8368432521820068, + "grad_norm": 0.5707030296325684, "learning_rate": 4.143982033117155e-05, - "loss": 0.0221, - "num_input_tokens_seen": 22609072, + "loss": 0.0238, + "num_input_tokens_seen": 22814400, "step": 3000 }, { "epoch": 0.8162988115449915, - "grad_norm": 0.31787559390068054, + "grad_norm": 0.13374647498130798, "learning_rate": 4.1413013063215784e-05, - "loss": 0.0241, - "num_input_tokens_seen": 22654464, + "loss": 0.0228, + "num_input_tokens_seen": 22859760, "step": 3005 }, { "epoch": 0.8176570458404074, - "grad_norm": 0.14264586567878723, + "grad_norm": 0.26835396885871887, "learning_rate": 4.1386172586408855e-05, - "loss": 0.0255, - "num_input_tokens_seen": 22687408, + "loss": 0.0216, + "num_input_tokens_seen": 22893376, "step": 3010 }, { "epoch": 0.8190152801358235, - "grad_norm": 0.2657187581062317, + "grad_norm": 0.17906704545021057, "learning_rate": 4.1359298955057766e-05, - "loss": 0.0356, - "num_input_tokens_seen": 22722816, + "loss": 0.0244, + "num_input_tokens_seen": 22929440, "step": 3015 }, { "epoch": 0.8203735144312394, - "grad_norm": 0.170993834733963, + "grad_norm": 0.11421557515859604, "learning_rate": 4.13323922235366e-05, - "loss": 0.038, - "num_input_tokens_seen": 22763552, + "loss": 0.0304, + "num_input_tokens_seen": 22970736, "step": 3020 }, { "epoch": 0.8217317487266553, - "grad_norm": 0.3268808424472809, + "grad_norm": 0.3831428289413452, "learning_rate": 4.130545244628639e-05, - "loss": 0.0279, - "num_input_tokens_seen": 22801760, + "loss": 0.0274, + "num_input_tokens_seen": 23009328, "step": 3025 }, { "epoch": 0.8230899830220713, - "grad_norm": 0.22961309552192688, + "grad_norm": 0.2644425332546234, "learning_rate": 4.1278479677815054e-05, - "loss": 0.0278, - "num_input_tokens_seen": 22844208, + "loss": 0.0268, + "num_input_tokens_seen": 23052112, "step": 3030 }, { "epoch": 0.8244482173174873, - "grad_norm": 0.07827386260032654, + "grad_norm": 0.07142507284879684, "learning_rate": 4.125147397269725e-05, - "loss": 0.023, - "num_input_tokens_seen": 22882384, + "loss": 0.0211, + "num_input_tokens_seen": 23090720, "step": 3035 }, { "epoch": 0.8258064516129032, - "grad_norm": 0.09792392700910568, + "grad_norm": 0.511797308921814, "learning_rate": 4.122443538557429e-05, - "loss": 0.0285, - "num_input_tokens_seen": 22919680, + "loss": 0.0278, + "num_input_tokens_seen": 23128560, "step": 3040 }, { "epoch": 0.8271646859083192, - "grad_norm": 0.2871212363243103, + "grad_norm": 0.2325364351272583, "learning_rate": 4.1197363971154e-05, - "loss": 0.0262, - "num_input_tokens_seen": 22956560, + "loss": 0.0226, + "num_input_tokens_seen": 23165968, "step": 3045 }, { "epoch": 0.8285229202037352, - "grad_norm": 0.13483591377735138, + "grad_norm": 0.09653971344232559, "learning_rate": 4.1170259784210644e-05, - "loss": 0.0292, - "num_input_tokens_seen": 22986832, + "loss": 0.0271, + "num_input_tokens_seen": 23196896, "step": 3050 }, { "epoch": 0.8298811544991511, - "grad_norm": 0.3542821407318115, + "grad_norm": 0.09324201941490173, "learning_rate": 4.114312287958479e-05, - "loss": 0.0303, - "num_input_tokens_seen": 23028560, + "loss": 0.0282, + "num_input_tokens_seen": 23238688, "step": 3055 }, { "epoch": 0.831239388794567, - "grad_norm": 0.15552017092704773, + "grad_norm": 0.15973159670829773, "learning_rate": 4.1115953312183183e-05, - "loss": 0.0286, - "num_input_tokens_seen": 23070976, + "loss": 0.028, + "num_input_tokens_seen": 23281696, "step": 3060 }, { "epoch": 0.832597623089983, - "grad_norm": 0.4257282018661499, + "grad_norm": 0.10325570404529572, "learning_rate": 4.10887511369787e-05, - "loss": 0.0244, - "num_input_tokens_seen": 23112496, + "loss": 0.0227, + "num_input_tokens_seen": 23323264, "step": 3065 }, { "epoch": 0.833955857385399, - "grad_norm": 0.21572604775428772, + "grad_norm": 0.0978979840874672, "learning_rate": 4.106151640901015e-05, - "loss": 0.0274, - "num_input_tokens_seen": 23142816, + "loss": 0.0257, + "num_input_tokens_seen": 23354080, "step": 3070 }, { "epoch": 0.8353140916808149, - "grad_norm": 0.1560722291469574, + "grad_norm": 0.2833722233772278, "learning_rate": 4.1034249183382235e-05, - "loss": 0.0265, - "num_input_tokens_seen": 23182784, + "loss": 0.0261, + "num_input_tokens_seen": 23393760, "step": 3075 }, { "epoch": 0.8366723259762309, - "grad_norm": 0.1308613121509552, + "grad_norm": 0.12695789337158203, "learning_rate": 4.100694951526538e-05, - "loss": 0.0224, - "num_input_tokens_seen": 23220512, + "loss": 0.0207, + "num_input_tokens_seen": 23431856, "step": 3080 }, { "epoch": 0.8380305602716469, - "grad_norm": 0.1546028107404709, + "grad_norm": 0.2918871343135834, "learning_rate": 4.09796174598957e-05, - "loss": 0.0272, - "num_input_tokens_seen": 23257456, + "loss": 0.0255, + "num_input_tokens_seen": 23469040, "step": 3085 }, { "epoch": 0.8393887945670628, - "grad_norm": 0.16179592907428741, + "grad_norm": 0.12460378557443619, "learning_rate": 4.0952253072574795e-05, - "loss": 0.0283, - "num_input_tokens_seen": 23297808, + "loss": 0.027, + "num_input_tokens_seen": 23510448, "step": 3090 }, { "epoch": 0.8407470288624788, - "grad_norm": 0.16771242022514343, + "grad_norm": 0.189557746052742, "learning_rate": 4.09248564086697e-05, - "loss": 0.0223, - "num_input_tokens_seen": 23334864, + "loss": 0.0217, + "num_input_tokens_seen": 23547808, "step": 3095 }, { "epoch": 0.8421052631578947, - "grad_norm": 0.10512012988328934, + "grad_norm": 0.19312237203121185, "learning_rate": 4.089742752361276e-05, - "loss": 0.0293, - "num_input_tokens_seen": 23369248, + "loss": 0.0273, + "num_input_tokens_seen": 23582864, "step": 3100 }, { "epoch": 0.8434634974533107, - "grad_norm": 0.22870028018951416, + "grad_norm": 0.08195208758115768, "learning_rate": 4.086996647290151e-05, - "loss": 0.0289, - "num_input_tokens_seen": 23410064, + "loss": 0.0252, + "num_input_tokens_seen": 23623936, "step": 3105 }, { "epoch": 0.8448217317487267, - "grad_norm": 0.09854773432016373, + "grad_norm": 0.15256577730178833, "learning_rate": 4.084247331209857e-05, - "loss": 0.0242, - "num_input_tokens_seen": 23455856, + "loss": 0.0233, + "num_input_tokens_seen": 23670160, "step": 3110 }, { "epoch": 0.8461799660441426, - "grad_norm": 0.10869055241346359, + "grad_norm": 0.09262879937887192, "learning_rate": 4.081494809683151e-05, - "loss": 0.023, - "num_input_tokens_seen": 23495344, + "loss": 0.0231, + "num_input_tokens_seen": 23709680, "step": 3115 }, { "epoch": 0.8475382003395586, - "grad_norm": 0.3154117465019226, + "grad_norm": 0.07214374840259552, "learning_rate": 4.078739088279279e-05, - "loss": 0.0257, - "num_input_tokens_seen": 23534704, + "loss": 0.0232, + "num_input_tokens_seen": 23749552, "step": 3120 }, { "epoch": 0.8488964346349746, - "grad_norm": 0.43066495656967163, + "grad_norm": 0.4541591703891754, "learning_rate": 4.075980172573959e-05, - "loss": 0.0205, - "num_input_tokens_seen": 23568768, + "loss": 0.0181, + "num_input_tokens_seen": 23783616, "step": 3125 }, { "epoch": 0.8502546689303905, - "grad_norm": 0.09728015959262848, + "grad_norm": 0.0776195302605629, "learning_rate": 4.0732180681493715e-05, - "loss": 0.0238, - "num_input_tokens_seen": 23612992, + "loss": 0.0225, + "num_input_tokens_seen": 23827920, "step": 3130 }, { "epoch": 0.8516129032258064, - "grad_norm": 0.1603078693151474, + "grad_norm": 0.220168799161911, "learning_rate": 4.070452780594152e-05, - "loss": 0.0249, - "num_input_tokens_seen": 23656320, + "loss": 0.0211, + "num_input_tokens_seen": 23870960, "step": 3135 }, { "epoch": 0.8529711375212224, - "grad_norm": 0.14865460991859436, + "grad_norm": 0.29644858837127686, "learning_rate": 4.067684315503373e-05, - "loss": 0.0261, - "num_input_tokens_seen": 23698512, + "loss": 0.0265, + "num_input_tokens_seen": 23913632, "step": 3140 }, { "epoch": 0.8543293718166384, - "grad_norm": 0.3723386228084564, + "grad_norm": 0.10122260451316833, "learning_rate": 4.064912678478537e-05, - "loss": 0.0272, - "num_input_tokens_seen": 23733616, + "loss": 0.0237, + "num_input_tokens_seen": 23949328, "step": 3145 }, { "epoch": 0.8556876061120543, - "grad_norm": 0.18173463642597198, + "grad_norm": 0.10225596278905869, "learning_rate": 4.0621378751275686e-05, - "loss": 0.0268, - "num_input_tokens_seen": 23769200, + "loss": 0.0244, + "num_input_tokens_seen": 23985872, "step": 3150 }, { "epoch": 0.8570458404074703, - "grad_norm": 0.31310731172561646, + "grad_norm": 0.08008911460638046, "learning_rate": 4.0593599110647915e-05, - "loss": 0.0266, - "num_input_tokens_seen": 23805488, + "loss": 0.0233, + "num_input_tokens_seen": 24022288, "step": 3155 }, { "epoch": 0.8584040747028863, - "grad_norm": 0.3578246533870697, + "grad_norm": 0.2684822082519531, "learning_rate": 4.0565787919109307e-05, - "loss": 0.0268, - "num_input_tokens_seen": 23840880, + "loss": 0.0221, + "num_input_tokens_seen": 24058240, "step": 3160 }, { "epoch": 0.8597623089983022, - "grad_norm": 0.08418084681034088, + "grad_norm": 0.08221771568059921, "learning_rate": 4.053794523293091e-05, - "loss": 0.0247, - "num_input_tokens_seen": 23880288, + "loss": 0.0228, + "num_input_tokens_seen": 24098064, "step": 3165 }, { "epoch": 0.8611205432937181, - "grad_norm": 0.3498416543006897, + "grad_norm": 0.2744966745376587, "learning_rate": 4.051007110844752e-05, - "loss": 0.0256, - "num_input_tokens_seen": 23916720, + "loss": 0.0238, + "num_input_tokens_seen": 24134448, "step": 3170 }, { "epoch": 0.8624787775891342, - "grad_norm": 0.10751106590032578, + "grad_norm": 0.08680921047925949, "learning_rate": 4.048216560205755e-05, - "loss": 0.0257, - "num_input_tokens_seen": 23956640, + "loss": 0.0252, + "num_input_tokens_seen": 24174608, "step": 3175 }, { "epoch": 0.8638370118845501, - "grad_norm": 0.24830509722232819, + "grad_norm": 0.12440554052591324, "learning_rate": 4.045422877022287e-05, - "loss": 0.03, - "num_input_tokens_seen": 23993616, + "loss": 0.028, + "num_input_tokens_seen": 24211904, "step": 3180 }, { "epoch": 0.865195246179966, - "grad_norm": 0.15149499475955963, + "grad_norm": 0.09384052455425262, "learning_rate": 4.0426260669468767e-05, - "loss": 0.0271, - "num_input_tokens_seen": 24033536, + "loss": 0.0255, + "num_input_tokens_seen": 24252272, "step": 3185 }, { "epoch": 0.866553480475382, - "grad_norm": 0.1268114149570465, + "grad_norm": 0.09361809492111206, "learning_rate": 4.0398261356383796e-05, - "loss": 0.0234, - "num_input_tokens_seen": 24071184, + "loss": 0.0225, + "num_input_tokens_seen": 24289760, "step": 3190 }, { "epoch": 0.867911714770798, - "grad_norm": 0.21822918951511383, + "grad_norm": 0.4511979818344116, "learning_rate": 4.037023088761963e-05, - "loss": 0.0247, - "num_input_tokens_seen": 24108176, + "loss": 0.0239, + "num_input_tokens_seen": 24326416, "step": 3195 }, { "epoch": 0.8692699490662139, - "grad_norm": 0.3256119191646576, + "grad_norm": 0.14989538490772247, "learning_rate": 4.034216931989101e-05, - "loss": 0.0241, - "num_input_tokens_seen": 24141904, + "loss": 0.0246, + "num_input_tokens_seen": 24360160, "step": 3200 }, { "epoch": 0.8706281833616298, - "grad_norm": 0.2524334490299225, + "grad_norm": 0.09322134405374527, "learning_rate": 4.03140767099756e-05, - "loss": 0.0321, - "num_input_tokens_seen": 24177344, + "loss": 0.0268, + "num_input_tokens_seen": 24396208, "step": 3205 }, { "epoch": 0.8719864176570459, - "grad_norm": 0.15473684668540955, + "grad_norm": 0.1411059945821762, "learning_rate": 4.028595311471386e-05, - "loss": 0.022, - "num_input_tokens_seen": 24212704, + "loss": 0.0213, + "num_input_tokens_seen": 24431856, "step": 3210 }, { "epoch": 0.8733446519524618, - "grad_norm": 0.286958783864975, + "grad_norm": 0.08727291226387024, "learning_rate": 4.025779859100895e-05, - "loss": 0.0262, - "num_input_tokens_seen": 24248784, + "loss": 0.0241, + "num_input_tokens_seen": 24468208, "step": 3215 }, { "epoch": 0.8747028862478777, - "grad_norm": 0.1472889482975006, + "grad_norm": 0.21357126533985138, "learning_rate": 4.022961319582662e-05, - "loss": 0.0272, - "num_input_tokens_seen": 24285584, + "loss": 0.0267, + "num_input_tokens_seen": 24505344, "step": 3220 }, { "epoch": 0.8760611205432938, - "grad_norm": 0.20608732104301453, + "grad_norm": 0.1623540073633194, "learning_rate": 4.0201396986195046e-05, - "loss": 0.0254, - "num_input_tokens_seen": 24316048, + "loss": 0.0244, + "num_input_tokens_seen": 24536688, "step": 3225 }, { "epoch": 0.8774193548387097, - "grad_norm": 0.08930043131113052, + "grad_norm": 0.10114876925945282, "learning_rate": 4.0173150019204785e-05, - "loss": 0.0276, - "num_input_tokens_seen": 24358560, + "loss": 0.0239, + "num_input_tokens_seen": 24579952, "step": 3230 }, { "epoch": 0.8787775891341256, - "grad_norm": 0.30763623118400574, + "grad_norm": 0.38358286023139954, "learning_rate": 4.014487235200862e-05, - "loss": 0.0264, - "num_input_tokens_seen": 24393280, + "loss": 0.0243, + "num_input_tokens_seen": 24615280, "step": 3235 }, { "epoch": 0.8801358234295416, - "grad_norm": 0.19571910798549652, + "grad_norm": 0.17811471223831177, "learning_rate": 4.0116564041821455e-05, - "loss": 0.028, - "num_input_tokens_seen": 24427744, + "loss": 0.0258, + "num_input_tokens_seen": 24650976, "step": 3240 }, { "epoch": 0.8814940577249576, - "grad_norm": 0.3469042479991913, + "grad_norm": 0.10775120556354523, "learning_rate": 4.008822514592017e-05, - "loss": 0.0286, - "num_input_tokens_seen": 24468528, + "loss": 0.0268, + "num_input_tokens_seen": 24692016, "step": 3245 }, { "epoch": 0.8828522920203735, - "grad_norm": 0.12331614643335342, + "grad_norm": 0.11703498661518097, "learning_rate": 4.005985572164356e-05, - "loss": 0.0252, - "num_input_tokens_seen": 24506448, + "loss": 0.0259, + "num_input_tokens_seen": 24729984, "step": 3250 }, { "epoch": 0.8842105263157894, - "grad_norm": 0.24299688637256622, + "grad_norm": 0.11841461062431335, "learning_rate": 4.003145582639217e-05, - "loss": 0.0271, - "num_input_tokens_seen": 24546720, + "loss": 0.0222, + "num_input_tokens_seen": 24771200, "step": 3255 }, { "epoch": 0.8855687606112055, - "grad_norm": 0.09995856881141663, + "grad_norm": 0.39829981327056885, "learning_rate": 4.000302551762821e-05, - "loss": 0.0229, - "num_input_tokens_seen": 24580816, + "loss": 0.0231, + "num_input_tokens_seen": 24806000, "step": 3260 }, { "epoch": 0.8869269949066214, - "grad_norm": 0.07987441122531891, + "grad_norm": 0.07647863030433655, "learning_rate": 3.9974564852875404e-05, - "loss": 0.0234, - "num_input_tokens_seen": 24622128, + "loss": 0.0245, + "num_input_tokens_seen": 24847568, "step": 3265 }, { "epoch": 0.8882852292020373, - "grad_norm": 0.1700727790594101, + "grad_norm": 0.2620609998703003, "learning_rate": 3.994607388971893e-05, - "loss": 0.0231, - "num_input_tokens_seen": 24663424, + "loss": 0.0236, + "num_input_tokens_seen": 24889200, "step": 3270 }, { "epoch": 0.8896434634974533, - "grad_norm": 0.4049353003501892, + "grad_norm": 0.32725203037261963, "learning_rate": 3.9917552685805234e-05, - "loss": 0.0236, - "num_input_tokens_seen": 24700560, + "loss": 0.0225, + "num_input_tokens_seen": 24926880, "step": 3275 }, { "epoch": 0.8910016977928693, - "grad_norm": 0.07362957298755646, + "grad_norm": 0.1986543983221054, "learning_rate": 3.9889001298841985e-05, - "loss": 0.0278, - "num_input_tokens_seen": 24737920, + "loss": 0.0254, + "num_input_tokens_seen": 24964656, "step": 3280 }, { "epoch": 0.8923599320882852, - "grad_norm": 0.1000157818198204, + "grad_norm": 0.20958147943019867, "learning_rate": 3.9860419786597886e-05, - "loss": 0.0277, - "num_input_tokens_seen": 24775584, + "loss": 0.0246, + "num_input_tokens_seen": 25002464, "step": 3285 }, { "epoch": 0.8937181663837012, - "grad_norm": 0.3297850489616394, + "grad_norm": 0.5910336971282959, "learning_rate": 3.9831808206902626e-05, - "loss": 0.0236, - "num_input_tokens_seen": 24813360, + "loss": 0.0254, + "num_input_tokens_seen": 25040752, "step": 3290 }, { "epoch": 0.8950764006791172, - "grad_norm": 0.17710565030574799, + "grad_norm": 0.5625556111335754, "learning_rate": 3.98031666176467e-05, - "loss": 0.0218, - "num_input_tokens_seen": 24848784, + "loss": 0.0217, + "num_input_tokens_seen": 25076480, "step": 3295 }, { "epoch": 0.8964346349745331, - "grad_norm": 0.11080403625965118, + "grad_norm": 0.18064579367637634, "learning_rate": 3.977449507678135e-05, - "loss": 0.0236, - "num_input_tokens_seen": 24887392, + "loss": 0.0238, + "num_input_tokens_seen": 25115968, "step": 3300 }, { "epoch": 0.8977928692699491, - "grad_norm": 0.07551813870668411, + "grad_norm": 0.11882834881544113, "learning_rate": 3.9745793642318395e-05, - "loss": 0.0221, - "num_input_tokens_seen": 24926336, + "loss": 0.0244, + "num_input_tokens_seen": 25156048, "step": 3305 }, { "epoch": 0.899151103565365, - "grad_norm": 0.11275363713502884, + "grad_norm": 0.28816017508506775, "learning_rate": 3.9717062372330146e-05, - "loss": 0.0264, - "num_input_tokens_seen": 24964240, + "loss": 0.0304, + "num_input_tokens_seen": 25193696, "step": 3310 }, { "epoch": 0.900509337860781, - "grad_norm": 0.08940527588129044, + "grad_norm": 0.2769068777561188, "learning_rate": 3.968830132494931e-05, - "loss": 0.0249, - "num_input_tokens_seen": 25002016, + "loss": 0.0262, + "num_input_tokens_seen": 25231840, "step": 3315 }, { "epoch": 0.901867572156197, - "grad_norm": 0.2024724781513214, + "grad_norm": 0.15474078059196472, "learning_rate": 3.96595105583688e-05, - "loss": 0.0271, - "num_input_tokens_seen": 25032720, + "loss": 0.026, + "num_input_tokens_seen": 25262960, "step": 3320 }, { "epoch": 0.9032258064516129, - "grad_norm": 0.18724653124809265, + "grad_norm": 0.23395748436450958, "learning_rate": 3.963069013084167e-05, - "loss": 0.0248, - "num_input_tokens_seen": 25071392, + "loss": 0.0256, + "num_input_tokens_seen": 25302224, "step": 3325 }, { "epoch": 0.9045840407470289, - "grad_norm": 0.26380112767219543, + "grad_norm": 0.1987045854330063, "learning_rate": 3.960184010068102e-05, - "loss": 0.0289, - "num_input_tokens_seen": 25116864, + "loss": 0.0267, + "num_input_tokens_seen": 25347264, "step": 3330 }, { "epoch": 0.9059422750424448, - "grad_norm": 0.08949603885412216, + "grad_norm": 0.38707950711250305, "learning_rate": 3.957296052625981e-05, - "loss": 0.0257, - "num_input_tokens_seen": 25151280, + "loss": 0.0277, + "num_input_tokens_seen": 25382480, "step": 3335 }, { "epoch": 0.9073005093378608, - "grad_norm": 0.11599620431661606, + "grad_norm": 0.2877531945705414, "learning_rate": 3.954405146601079e-05, - "loss": 0.0239, - "num_input_tokens_seen": 25185936, + "loss": 0.0238, + "num_input_tokens_seen": 25417328, "step": 3340 }, { "epoch": 0.9086587436332767, - "grad_norm": 0.1264057606458664, + "grad_norm": 0.11280710995197296, "learning_rate": 3.951511297842636e-05, - "loss": 0.0253, - "num_input_tokens_seen": 25220880, + "loss": 0.024, + "num_input_tokens_seen": 25452736, "step": 3345 }, { "epoch": 0.9100169779286927, - "grad_norm": 0.08667906373739243, + "grad_norm": 0.08446758985519409, "learning_rate": 3.948614512205848e-05, - "loss": 0.0241, - "num_input_tokens_seen": 25254848, + "loss": 0.0227, + "num_input_tokens_seen": 25487184, "step": 3350 }, { "epoch": 0.9113752122241087, - "grad_norm": 0.10110685229301453, + "grad_norm": 0.144576296210289, "learning_rate": 3.9457147955518536e-05, - "loss": 0.0276, - "num_input_tokens_seen": 25291520, + "loss": 0.0271, + "num_input_tokens_seen": 25524400, "step": 3355 }, { "epoch": 0.9127334465195246, - "grad_norm": 0.20827028155326843, + "grad_norm": 0.07253797352313995, "learning_rate": 3.942812153747718e-05, - "loss": 0.0245, - "num_input_tokens_seen": 25330880, + "loss": 0.0242, + "num_input_tokens_seen": 25564272, "step": 3360 }, { "epoch": 0.9140916808149406, - "grad_norm": 0.10603941977024078, + "grad_norm": 0.09933298826217651, "learning_rate": 3.9399065926664295e-05, - "loss": 0.0247, - "num_input_tokens_seen": 25371456, + "loss": 0.0248, + "num_input_tokens_seen": 25605536, "step": 3365 }, { "epoch": 0.9154499151103566, - "grad_norm": 0.08557388186454773, + "grad_norm": 0.1270226687192917, "learning_rate": 3.93699811818688e-05, - "loss": 0.0251, - "num_input_tokens_seen": 25404448, + "loss": 0.0309, + "num_input_tokens_seen": 25638784, "step": 3370 }, { "epoch": 0.9168081494057725, - "grad_norm": 0.10554657876491547, + "grad_norm": 0.2065504491329193, "learning_rate": 3.934086736193856e-05, - "loss": 0.0261, - "num_input_tokens_seen": 25438496, + "loss": 0.0255, + "num_input_tokens_seen": 25673376, "step": 3375 }, { "epoch": 0.9181663837011884, - "grad_norm": 0.09735367447137833, + "grad_norm": 0.09200476109981537, "learning_rate": 3.9311724525780304e-05, - "loss": 0.0239, - "num_input_tokens_seen": 25477056, + "loss": 0.023, + "num_input_tokens_seen": 25711824, "step": 3380 }, { "epoch": 0.9195246179966045, - "grad_norm": 0.13253183662891388, + "grad_norm": 0.33107006549835205, "learning_rate": 3.928255273235943e-05, - "loss": 0.0248, - "num_input_tokens_seen": 25513472, + "loss": 0.0253, + "num_input_tokens_seen": 25748384, "step": 3385 }, { "epoch": 0.9208828522920204, - "grad_norm": 0.2867678105831146, + "grad_norm": 0.2066636085510254, "learning_rate": 3.925335204069995e-05, - "loss": 0.0254, - "num_input_tokens_seen": 25549856, + "loss": 0.0246, + "num_input_tokens_seen": 25785520, "step": 3390 }, { "epoch": 0.9222410865874363, - "grad_norm": 0.3077731728553772, + "grad_norm": 0.08885836601257324, "learning_rate": 3.9224122509884327e-05, - "loss": 0.0242, - "num_input_tokens_seen": 25589632, + "loss": 0.0227, + "num_input_tokens_seen": 25825520, "step": 3395 }, { "epoch": 0.9235993208828522, - "grad_norm": 0.3050723969936371, + "grad_norm": 0.12466949224472046, "learning_rate": 3.9194864199053394e-05, - "loss": 0.0235, - "num_input_tokens_seen": 25627664, + "loss": 0.0246, + "num_input_tokens_seen": 25863760, "step": 3400 }, { "epoch": 0.9249575551782683, - "grad_norm": 0.3298235833644867, + "grad_norm": 0.32585999369621277, "learning_rate": 3.916557716740621e-05, - "loss": 0.0241, - "num_input_tokens_seen": 25664144, + "loss": 0.0264, + "num_input_tokens_seen": 25901056, "step": 3405 }, { "epoch": 0.9263157894736842, - "grad_norm": 0.19131943583488464, + "grad_norm": 0.08928327262401581, "learning_rate": 3.913626147419993e-05, - "loss": 0.0237, - "num_input_tokens_seen": 25697728, + "loss": 0.0261, + "num_input_tokens_seen": 25935584, "step": 3410 }, { "epoch": 0.9276740237691001, - "grad_norm": 0.28251913189888, + "grad_norm": 0.18055592477321625, "learning_rate": 3.910691717874974e-05, - "loss": 0.0253, - "num_input_tokens_seen": 25737984, + "loss": 0.0262, + "num_input_tokens_seen": 25976320, "step": 3415 }, { "epoch": 0.9290322580645162, - "grad_norm": 0.0789017602801323, + "grad_norm": 0.17600642144680023, "learning_rate": 3.907754434042864e-05, - "loss": 0.0224, - "num_input_tokens_seen": 25776256, + "loss": 0.0236, + "num_input_tokens_seen": 26014528, "step": 3420 }, { "epoch": 0.9303904923599321, - "grad_norm": 0.20686453580856323, + "grad_norm": 0.1978112906217575, "learning_rate": 3.904814301866744e-05, - "loss": 0.0279, - "num_input_tokens_seen": 25807776, + "loss": 0.0298, + "num_input_tokens_seen": 26046480, "step": 3425 }, { "epoch": 0.931748726655348, - "grad_norm": 0.18918631970882416, + "grad_norm": 0.08275721222162247, "learning_rate": 3.901871327295453e-05, - "loss": 0.0252, - "num_input_tokens_seen": 25846592, + "loss": 0.0229, + "num_input_tokens_seen": 26085408, "step": 3430 }, { "epoch": 0.933106960950764, - "grad_norm": 0.1975216418504715, + "grad_norm": 0.11931162327528, "learning_rate": 3.898925516283585e-05, - "loss": 0.0257, - "num_input_tokens_seen": 25884528, + "loss": 0.0238, + "num_input_tokens_seen": 26123984, "step": 3435 }, { "epoch": 0.93446519524618, - "grad_norm": 0.10534167289733887, + "grad_norm": 0.16882748901844025, "learning_rate": 3.8959768747914715e-05, - "loss": 0.0269, - "num_input_tokens_seen": 25918000, + "loss": 0.0289, + "num_input_tokens_seen": 26157968, "step": 3440 }, { "epoch": 0.9358234295415959, - "grad_norm": 0.25008442997932434, + "grad_norm": 0.09522487223148346, "learning_rate": 3.8930254087851706e-05, - "loss": 0.0268, - "num_input_tokens_seen": 25954352, + "loss": 0.0249, + "num_input_tokens_seen": 26194640, "step": 3445 }, { "epoch": 0.9371816638370118, - "grad_norm": 0.07646093517541885, + "grad_norm": 0.07781017571687698, "learning_rate": 3.890071124236456e-05, - "loss": 0.0248, - "num_input_tokens_seen": 25986544, + "loss": 0.0234, + "num_input_tokens_seen": 26227328, "step": 3450 }, { "epoch": 0.9385398981324279, - "grad_norm": 0.29700377583503723, + "grad_norm": 0.34776604175567627, "learning_rate": 3.8871140271228045e-05, - "loss": 0.0265, - "num_input_tokens_seen": 26025344, + "loss": 0.0262, + "num_input_tokens_seen": 26266752, "step": 3455 }, { "epoch": 0.9398981324278438, - "grad_norm": 0.15192335844039917, + "grad_norm": 0.44109052419662476, "learning_rate": 3.884154123427381e-05, - "loss": 0.028, - "num_input_tokens_seen": 26063712, + "loss": 0.0262, + "num_input_tokens_seen": 26305392, "step": 3460 }, { "epoch": 0.9412563667232597, - "grad_norm": 0.08836814016103745, + "grad_norm": 0.08484750986099243, "learning_rate": 3.881191419139034e-05, - "loss": 0.0271, - "num_input_tokens_seen": 26094736, + "loss": 0.0259, + "num_input_tokens_seen": 26337264, "step": 3465 }, { "epoch": 0.9426146010186757, - "grad_norm": 0.1890663504600525, + "grad_norm": 0.12495789676904678, "learning_rate": 3.878225920252273e-05, - "loss": 0.027, - "num_input_tokens_seen": 26127936, + "loss": 0.026, + "num_input_tokens_seen": 26371264, "step": 3470 }, { "epoch": 0.9439728353140917, - "grad_norm": 0.10624194890260696, + "grad_norm": 0.13088493049144745, "learning_rate": 3.875257632767266e-05, - "loss": 0.0242, - "num_input_tokens_seen": 26168720, + "loss": 0.0255, + "num_input_tokens_seen": 26412496, "step": 3475 }, { "epoch": 0.9453310696095076, - "grad_norm": 0.21465562283992767, + "grad_norm": 0.32497847080230713, "learning_rate": 3.872286562689821e-05, - "loss": 0.0263, - "num_input_tokens_seen": 26207280, + "loss": 0.0278, + "num_input_tokens_seen": 26451472, "step": 3480 }, { "epoch": 0.9466893039049236, - "grad_norm": 0.11618019640445709, + "grad_norm": 0.27053210139274597, "learning_rate": 3.869312716031378e-05, - "loss": 0.0223, - "num_input_tokens_seen": 26248176, + "loss": 0.0231, + "num_input_tokens_seen": 26492736, "step": 3485 }, { "epoch": 0.9480475382003396, - "grad_norm": 0.2576579749584198, + "grad_norm": 0.26849424839019775, "learning_rate": 3.866336098808993e-05, - "loss": 0.0227, - "num_input_tokens_seen": 26284592, + "loss": 0.0228, + "num_input_tokens_seen": 26529120, "step": 3490 }, { "epoch": 0.9494057724957555, - "grad_norm": 0.10191572457551956, + "grad_norm": 0.11046132445335388, "learning_rate": 3.8633567170453284e-05, - "loss": 0.0234, - "num_input_tokens_seen": 26318832, + "loss": 0.0258, + "num_input_tokens_seen": 26563952, "step": 3495 }, { "epoch": 0.9507640067911715, - "grad_norm": 0.12467186897993088, + "grad_norm": 0.08443234860897064, "learning_rate": 3.860374576768639e-05, - "loss": 0.0248, - "num_input_tokens_seen": 26355376, + "loss": 0.0243, + "num_input_tokens_seen": 26600816, "step": 3500 }, { "epoch": 0.9521222410865874, - "grad_norm": 0.08407965302467346, + "grad_norm": 0.2568768858909607, "learning_rate": 3.857389684012765e-05, - "loss": 0.0246, - "num_input_tokens_seen": 26389200, + "loss": 0.0248, + "num_input_tokens_seen": 26634384, "step": 3505 }, { "epoch": 0.9534804753820034, - "grad_norm": 0.11814547330141068, + "grad_norm": 0.19267386198043823, "learning_rate": 3.854402044817111e-05, - "loss": 0.024, - "num_input_tokens_seen": 26422448, + "loss": 0.0238, + "num_input_tokens_seen": 26667328, "step": 3510 }, { "epoch": 0.9548387096774194, - "grad_norm": 0.4267798066139221, + "grad_norm": 0.1544167846441269, "learning_rate": 3.8514116652266417e-05, - "loss": 0.0239, - "num_input_tokens_seen": 26457888, + "loss": 0.0238, + "num_input_tokens_seen": 26702896, "step": 3515 }, { "epoch": 0.9561969439728353, - "grad_norm": 0.12411440908908844, + "grad_norm": 0.09675651043653488, "learning_rate": 3.848418551291864e-05, - "loss": 0.0253, - "num_input_tokens_seen": 26496128, + "loss": 0.0243, + "num_input_tokens_seen": 26741504, "step": 3520 }, { "epoch": 0.9575551782682513, - "grad_norm": 0.10156453400850296, + "grad_norm": 0.21343913674354553, "learning_rate": 3.84542270906882e-05, - "loss": 0.0257, - "num_input_tokens_seen": 26537168, + "loss": 0.0263, + "num_input_tokens_seen": 26782944, "step": 3525 }, { "epoch": 0.9589134125636672, - "grad_norm": 0.10094168037176132, + "grad_norm": 0.09185512363910675, "learning_rate": 3.8424241446190695e-05, - "loss": 0.0288, - "num_input_tokens_seen": 26575600, + "loss": 0.0267, + "num_input_tokens_seen": 26821616, "step": 3530 }, { "epoch": 0.9602716468590832, - "grad_norm": 0.15422159433364868, + "grad_norm": 0.36283716559410095, "learning_rate": 3.839422864009682e-05, - "loss": 0.0258, - "num_input_tokens_seen": 26619936, + "loss": 0.0246, + "num_input_tokens_seen": 26865984, "step": 3535 }, { "epoch": 0.9616298811544991, - "grad_norm": 0.09068328887224197, + "grad_norm": 0.09579097479581833, "learning_rate": 3.836418873313221e-05, - "loss": 0.022, - "num_input_tokens_seen": 26657680, + "loss": 0.0217, + "num_input_tokens_seen": 26903760, "step": 3540 }, { "epoch": 0.9629881154499151, - "grad_norm": 0.2723589837551117, + "grad_norm": 0.08224410563707352, "learning_rate": 3.833412178607736e-05, - "loss": 0.0263, - "num_input_tokens_seen": 26700656, + "loss": 0.0245, + "num_input_tokens_seen": 26947152, "step": 3545 }, { "epoch": 0.9643463497453311, - "grad_norm": 0.21860073506832123, + "grad_norm": 0.15407314896583557, "learning_rate": 3.8304027859767456e-05, - "loss": 0.025, - "num_input_tokens_seen": 26738256, + "loss": 0.0241, + "num_input_tokens_seen": 26985296, "step": 3550 }, { "epoch": 0.965704584040747, - "grad_norm": 0.5195443630218506, + "grad_norm": 0.09518634527921677, "learning_rate": 3.827390701509226e-05, - "loss": 0.0269, - "num_input_tokens_seen": 26771760, + "loss": 0.0249, + "num_input_tokens_seen": 27018416, "step": 3555 }, { "epoch": 0.967062818336163, - "grad_norm": 0.08357148617506027, + "grad_norm": 0.15789446234703064, "learning_rate": 3.8243759312996044e-05, - "loss": 0.0244, - "num_input_tokens_seen": 26809456, + "loss": 0.0227, + "num_input_tokens_seen": 27056448, "step": 3560 }, { "epoch": 0.968421052631579, - "grad_norm": 0.08939073234796524, + "grad_norm": 0.07335119694471359, "learning_rate": 3.8213584814477364e-05, - "loss": 0.0224, - "num_input_tokens_seen": 26850512, + "loss": 0.0244, + "num_input_tokens_seen": 27097840, "step": 3565 }, { "epoch": 0.9697792869269949, - "grad_norm": 0.16334711015224457, + "grad_norm": 0.0730302631855011, "learning_rate": 3.8183383580589035e-05, - "loss": 0.0264, - "num_input_tokens_seen": 26887440, + "loss": 0.0261, + "num_input_tokens_seen": 27135632, "step": 3570 }, { "epoch": 0.9711375212224108, - "grad_norm": 0.25211113691329956, + "grad_norm": 0.281752347946167, "learning_rate": 3.8153155672437945e-05, - "loss": 0.0229, - "num_input_tokens_seen": 26927984, + "loss": 0.0214, + "num_input_tokens_seen": 27176560, "step": 3575 }, { "epoch": 0.9724957555178269, - "grad_norm": 0.07389184087514877, + "grad_norm": 0.0961889773607254, "learning_rate": 3.812290115118497e-05, - "loss": 0.0236, - "num_input_tokens_seen": 26968768, + "loss": 0.0235, + "num_input_tokens_seen": 27217184, "step": 3580 }, { "epoch": 0.9738539898132428, - "grad_norm": 0.08744581788778305, + "grad_norm": 0.07744790613651276, "learning_rate": 3.809262007804482e-05, - "loss": 0.0201, - "num_input_tokens_seen": 27005456, + "loss": 0.0203, + "num_input_tokens_seen": 27254384, "step": 3585 }, { "epoch": 0.9752122241086587, - "grad_norm": 0.42951634526252747, + "grad_norm": 0.16246242821216583, "learning_rate": 3.8062312514285936e-05, - "loss": 0.0231, - "num_input_tokens_seen": 27039488, + "loss": 0.0213, + "num_input_tokens_seen": 27288848, "step": 3590 }, { "epoch": 0.9765704584040747, - "grad_norm": 1.0128060579299927, + "grad_norm": 0.3341984152793884, "learning_rate": 3.803197852123034e-05, "loss": 0.0262, - "num_input_tokens_seen": 27074944, + "num_input_tokens_seen": 27323872, "step": 3595 }, { "epoch": 0.9779286926994907, - "grad_norm": 0.11034579575061798, + "grad_norm": 0.08802422881126404, "learning_rate": 3.8001618160253546e-05, - "loss": 0.0257, - "num_input_tokens_seen": 27111840, + "loss": 0.0251, + "num_input_tokens_seen": 27360656, "step": 3600 }, { "epoch": 0.9792869269949066, - "grad_norm": 0.840183675289154, + "grad_norm": 0.18551838397979736, "learning_rate": 3.7971231492784434e-05, - "loss": 0.0249, - "num_input_tokens_seen": 27149856, + "loss": 0.0238, + "num_input_tokens_seen": 27398496, "step": 3605 }, { "epoch": 0.9806451612903225, - "grad_norm": 0.08251819759607315, + "grad_norm": 0.0802384540438652, "learning_rate": 3.794081858030507e-05, - "loss": 0.0241, - "num_input_tokens_seen": 27188048, + "loss": 0.0222, + "num_input_tokens_seen": 27437312, "step": 3610 }, { "epoch": 0.9820033955857386, - "grad_norm": 0.24625907838344574, + "grad_norm": 0.1064113974571228, "learning_rate": 3.791037948435066e-05, - "loss": 0.0307, - "num_input_tokens_seen": 27223936, + "loss": 0.027, + "num_input_tokens_seen": 27473136, "step": 3615 }, { "epoch": 0.9833616298811545, - "grad_norm": 0.15371429920196533, + "grad_norm": 0.2235875129699707, "learning_rate": 3.7879914266509364e-05, - "loss": 0.0236, - "num_input_tokens_seen": 27261120, + "loss": 0.0218, + "num_input_tokens_seen": 27510800, "step": 3620 }, { "epoch": 0.9847198641765704, - "grad_norm": 0.256661981344223, + "grad_norm": 0.07036598771810532, "learning_rate": 3.784942298842221e-05, - "loss": 0.0238, - "num_input_tokens_seen": 27299920, + "loss": 0.0212, + "num_input_tokens_seen": 27549872, "step": 3625 }, { "epoch": 0.9860780984719865, - "grad_norm": 0.26132458448410034, + "grad_norm": 0.1253422051668167, "learning_rate": 3.781890571178294e-05, - "loss": 0.0318, - "num_input_tokens_seen": 27333584, + "loss": 0.0259, + "num_input_tokens_seen": 27584128, "step": 3630 }, { "epoch": 0.9874363327674024, - "grad_norm": 0.16090653836727142, + "grad_norm": 0.09603866189718246, "learning_rate": 3.7788362498337904e-05, - "loss": 0.0266, - "num_input_tokens_seen": 27373920, + "loss": 0.0239, + "num_input_tokens_seen": 27624480, "step": 3635 }, { "epoch": 0.9887945670628183, - "grad_norm": 0.1401430070400238, + "grad_norm": 0.09933606535196304, "learning_rate": 3.775779340988594e-05, - "loss": 0.0277, - "num_input_tokens_seen": 27407488, + "loss": 0.0257, + "num_input_tokens_seen": 27658656, "step": 3640 }, { "epoch": 0.9901528013582342, - "grad_norm": 0.09232348203659058, + "grad_norm": 0.10294133424758911, "learning_rate": 3.7727198508278235e-05, - "loss": 0.0264, - "num_input_tokens_seen": 27444864, + "loss": 0.0228, + "num_input_tokens_seen": 27696384, "step": 3645 }, { "epoch": 0.9915110356536503, - "grad_norm": 0.08758342266082764, + "grad_norm": 0.06922685354948044, "learning_rate": 3.769657785541819e-05, - "loss": 0.0254, - "num_input_tokens_seen": 27478448, + "loss": 0.0225, + "num_input_tokens_seen": 27730400, "step": 3650 }, { "epoch": 0.9928692699490662, - "grad_norm": 0.22011806070804596, + "grad_norm": 0.10030097514390945, "learning_rate": 3.766593151326134e-05, - "loss": 0.0286, - "num_input_tokens_seen": 27517920, + "loss": 0.0264, + "num_input_tokens_seen": 27769888, "step": 3655 }, { "epoch": 0.9942275042444821, - "grad_norm": 0.1175965666770935, + "grad_norm": 0.077107734978199, "learning_rate": 3.763525954381517e-05, - "loss": 0.0225, - "num_input_tokens_seen": 27552256, + "loss": 0.0222, + "num_input_tokens_seen": 27804560, "step": 3660 }, { "epoch": 0.9955857385398982, - "grad_norm": 0.21363253891468048, + "grad_norm": 0.088266521692276, "learning_rate": 3.760456200913903e-05, - "loss": 0.0227, - "num_input_tokens_seen": 27588992, + "loss": 0.0221, + "num_input_tokens_seen": 27842032, "step": 3665 }, { "epoch": 0.9969439728353141, - "grad_norm": 0.07457183301448822, + "grad_norm": 0.07193887233734131, "learning_rate": 3.757383897134399e-05, - "loss": 0.0314, - "num_input_tokens_seen": 27625024, + "loss": 0.0267, + "num_input_tokens_seen": 27878992, "step": 3670 }, { "epoch": 0.99830220713073, - "grad_norm": 0.08248250186443329, + "grad_norm": 0.07234394550323486, "learning_rate": 3.754309049259271e-05, - "loss": 0.023, - "num_input_tokens_seen": 27659792, + "loss": 0.0225, + "num_input_tokens_seen": 27913568, "step": 3675 }, { "epoch": 0.999660441426146, - "grad_norm": 0.09152770042419434, + "grad_norm": 0.09619062393903732, "learning_rate": 3.751231663509937e-05, - "loss": 0.0249, - "num_input_tokens_seen": 27699088, + "loss": 0.0236, + "num_input_tokens_seen": 27953072, "step": 3680 }, { "epoch": 1.0008149405772495, - "grad_norm": 0.06874056160449982, + "grad_norm": 0.2007799595594406, "learning_rate": 3.748151746112945e-05, - "loss": 0.023, - "num_input_tokens_seen": 27733216, + "loss": 0.0231, + "num_input_tokens_seen": 27987936, "step": 3685 }, { "epoch": 1.0021731748726654, - "grad_norm": 0.10735509544610977, + "grad_norm": 0.09904617816209793, "learning_rate": 3.745069303299968e-05, - "loss": 0.0226, - "num_input_tokens_seen": 27772736, + "loss": 0.0219, + "num_input_tokens_seen": 28027680, "step": 3690 }, { "epoch": 1.0035314091680816, - "grad_norm": 0.08760124444961548, + "grad_norm": 0.08135796338319778, "learning_rate": 3.741984341307788e-05, - "loss": 0.0249, - "num_input_tokens_seen": 27812464, + "loss": 0.0239, + "num_input_tokens_seen": 28068144, "step": 3695 }, { "epoch": 1.0048896434634975, - "grad_norm": 0.43399128317832947, + "grad_norm": 0.10360568761825562, "learning_rate": 3.738896866378283e-05, - "loss": 0.0291, - "num_input_tokens_seen": 27848528, + "loss": 0.0248, + "num_input_tokens_seen": 28104816, "step": 3700 }, { "epoch": 1.0062478777589134, - "grad_norm": 0.07515572756528854, + "grad_norm": 0.08884073048830032, "learning_rate": 3.735806884758417e-05, - "loss": 0.0235, - "num_input_tokens_seen": 27885344, + "loss": 0.0228, + "num_input_tokens_seen": 28142000, "step": 3705 }, { "epoch": 1.0076061120543294, - "grad_norm": 0.42366108298301697, + "grad_norm": 0.20642374455928802, "learning_rate": 3.732714402700226e-05, - "loss": 0.0228, - "num_input_tokens_seen": 27920752, + "loss": 0.0223, + "num_input_tokens_seen": 28178176, "step": 3710 }, { "epoch": 1.0089643463497453, - "grad_norm": 0.29840391874313354, + "grad_norm": 0.07251933962106705, "learning_rate": 3.729619426460805e-05, - "loss": 0.022, - "num_input_tokens_seen": 27962368, + "loss": 0.0208, + "num_input_tokens_seen": 28220544, "step": 3715 }, { "epoch": 1.0103225806451612, - "grad_norm": 0.08226434141397476, + "grad_norm": 0.07380595058202744, "learning_rate": 3.726521962302295e-05, - "loss": 0.0201, - "num_input_tokens_seen": 27999552, + "loss": 0.0186, + "num_input_tokens_seen": 28257856, "step": 3720 }, { "epoch": 1.0116808149405772, - "grad_norm": 0.06982453167438507, + "grad_norm": 0.06861138343811035, "learning_rate": 3.7234220164918716e-05, - "loss": 0.0235, - "num_input_tokens_seen": 28039184, + "loss": 0.0236, + "num_input_tokens_seen": 28297664, "step": 3725 }, { "epoch": 1.0130390492359933, - "grad_norm": 0.09503108263015747, + "grad_norm": 0.08345624804496765, "learning_rate": 3.720319595301729e-05, - "loss": 0.0247, - "num_input_tokens_seen": 28075072, + "loss": 0.0234, + "num_input_tokens_seen": 28333744, "step": 3730 }, { "epoch": 1.0143972835314092, - "grad_norm": 0.14893609285354614, + "grad_norm": 0.08169401437044144, "learning_rate": 3.7172147050090735e-05, - "loss": 0.03, - "num_input_tokens_seen": 28112880, + "loss": 0.0234, + "num_input_tokens_seen": 28372016, "step": 3735 }, { "epoch": 1.0157555178268252, - "grad_norm": 0.08982798457145691, + "grad_norm": 0.0744674876332283, "learning_rate": 3.714107351896105e-05, - "loss": 0.0241, - "num_input_tokens_seen": 28157824, + "loss": 0.0224, + "num_input_tokens_seen": 28417376, "step": 3740 }, { "epoch": 1.017113752122241, - "grad_norm": 0.0861971452832222, + "grad_norm": 0.16714537143707275, "learning_rate": 3.7109975422500075e-05, - "loss": 0.0208, - "num_input_tokens_seen": 28200656, + "loss": 0.0204, + "num_input_tokens_seen": 28460544, "step": 3745 }, { "epoch": 1.018471986417657, - "grad_norm": 0.09259362518787384, + "grad_norm": 0.08339356631040573, "learning_rate": 3.7078852823629356e-05, - "loss": 0.0282, - "num_input_tokens_seen": 28237856, + "loss": 0.0263, + "num_input_tokens_seen": 28498432, "step": 3750 }, { "epoch": 1.019830220713073, - "grad_norm": 0.10196099430322647, + "grad_norm": 0.09094273298978806, "learning_rate": 3.704770578532e-05, - "loss": 0.0261, - "num_input_tokens_seen": 28273408, + "loss": 0.025, + "num_input_tokens_seen": 28534560, "step": 3755 }, { "epoch": 1.0211884550084889, - "grad_norm": 0.06037357822060585, + "grad_norm": 0.05246599763631821, "learning_rate": 3.7016534370592574e-05, - "loss": 0.0187, - "num_input_tokens_seen": 28307872, + "loss": 0.0184, + "num_input_tokens_seen": 28569440, "step": 3760 }, { "epoch": 1.022546689303905, - "grad_norm": 0.0861475020647049, + "grad_norm": 0.06984378397464752, "learning_rate": 3.698533864251697e-05, - "loss": 0.021, - "num_input_tokens_seen": 28339424, + "loss": 0.0193, + "num_input_tokens_seen": 28601104, "step": 3765 }, { "epoch": 1.023904923599321, - "grad_norm": 0.07410363107919693, + "grad_norm": 0.06280329823493958, "learning_rate": 3.695411866421226e-05, - "loss": 0.0238, - "num_input_tokens_seen": 28378752, + "loss": 0.0214, + "num_input_tokens_seen": 28640336, "step": 3770 }, { "epoch": 1.0252631578947369, - "grad_norm": 0.0657435804605484, + "grad_norm": 0.06340809911489487, "learning_rate": 3.692287449884659e-05, - "loss": 0.026, - "num_input_tokens_seen": 28412544, + "loss": 0.0246, + "num_input_tokens_seen": 28674704, "step": 3775 }, { "epoch": 1.0266213921901528, - "grad_norm": 0.10810165107250214, + "grad_norm": 0.09807753562927246, "learning_rate": 3.689160620963706e-05, - "loss": 0.0218, - "num_input_tokens_seen": 28456576, + "loss": 0.0217, + "num_input_tokens_seen": 28719024, "step": 3780 }, { "epoch": 1.0279796264855687, - "grad_norm": 0.09696964919567108, + "grad_norm": 0.16712361574172974, "learning_rate": 3.6860313859849544e-05, - "loss": 0.0247, - "num_input_tokens_seen": 28489952, + "loss": 0.0235, + "num_input_tokens_seen": 28752496, "step": 3785 }, { "epoch": 1.0293378607809847, - "grad_norm": 0.0995006188750267, + "grad_norm": 0.13929122686386108, "learning_rate": 3.682899751279863e-05, - "loss": 0.0239, - "num_input_tokens_seen": 28531104, + "loss": 0.0236, + "num_input_tokens_seen": 28794464, "step": 3790 }, { "epoch": 1.0306960950764006, - "grad_norm": 0.07536152005195618, + "grad_norm": 0.08720802515745163, "learning_rate": 3.679765723184746e-05, - "loss": 0.0223, - "num_input_tokens_seen": 28566912, + "loss": 0.0222, + "num_input_tokens_seen": 28830768, "step": 3795 }, { "epoch": 1.0320543293718167, - "grad_norm": 0.06820250302553177, + "grad_norm": 0.061560824513435364, "learning_rate": 3.676629308040758e-05, - "loss": 0.0201, - "num_input_tokens_seen": 28604736, + "loss": 0.0198, + "num_input_tokens_seen": 28868912, "step": 3800 }, { "epoch": 1.0334125636672327, - "grad_norm": 0.28411903977394104, + "grad_norm": 0.1294608861207962, "learning_rate": 3.6734905121938836e-05, - "loss": 0.0245, - "num_input_tokens_seen": 28647376, + "loss": 0.0234, + "num_input_tokens_seen": 28912176, "step": 3805 }, { "epoch": 1.0347707979626486, - "grad_norm": 0.0722360610961914, + "grad_norm": 0.06444506347179413, "learning_rate": 3.6703493419949265e-05, - "loss": 0.0235, - "num_input_tokens_seen": 28682656, + "loss": 0.0217, + "num_input_tokens_seen": 28948208, "step": 3810 }, { "epoch": 1.0361290322580645, - "grad_norm": 0.098227858543396, + "grad_norm": 0.14500606060028076, "learning_rate": 3.667205803799494e-05, - "loss": 0.0245, - "num_input_tokens_seen": 28716320, + "loss": 0.0244, + "num_input_tokens_seen": 28982560, "step": 3815 }, { "epoch": 1.0374872665534804, - "grad_norm": 0.08983462303876877, + "grad_norm": 0.08217432349920273, "learning_rate": 3.664059903967982e-05, - "loss": 0.025, - "num_input_tokens_seen": 28749600, + "loss": 0.0236, + "num_input_tokens_seen": 29016528, "step": 3820 }, { "epoch": 1.0388455008488964, - "grad_norm": 0.09873993694782257, + "grad_norm": 0.10487231612205505, "learning_rate": 3.6609116488655684e-05, - "loss": 0.0241, - "num_input_tokens_seen": 28782720, + "loss": 0.0238, + "num_input_tokens_seen": 29050144, "step": 3825 }, { "epoch": 1.0402037351443123, - "grad_norm": 0.07854326069355011, + "grad_norm": 0.2495347112417221, "learning_rate": 3.6577610448621935e-05, - "loss": 0.0251, - "num_input_tokens_seen": 28816336, + "loss": 0.0252, + "num_input_tokens_seen": 29084064, "step": 3830 }, { "epoch": 1.0415619694397285, - "grad_norm": 0.08286363631486893, + "grad_norm": 0.09312200546264648, "learning_rate": 3.6546080983325527e-05, - "loss": 0.0241, - "num_input_tokens_seen": 28849584, + "loss": 0.0236, + "num_input_tokens_seen": 29117920, "step": 3835 }, { "epoch": 1.0429202037351444, - "grad_norm": 0.06731267273426056, + "grad_norm": 0.1875970959663391, "learning_rate": 3.651452815656079e-05, - "loss": 0.0205, - "num_input_tokens_seen": 28889232, + "loss": 0.0194, + "num_input_tokens_seen": 29157984, "step": 3840 }, { "epoch": 1.0442784380305603, - "grad_norm": 0.07566747069358826, + "grad_norm": 0.08533722907304764, "learning_rate": 3.648295203216933e-05, - "loss": 0.0246, - "num_input_tokens_seen": 28925984, + "loss": 0.0237, + "num_input_tokens_seen": 29194560, "step": 3845 }, { "epoch": 1.0456366723259762, - "grad_norm": 0.088098905980587, + "grad_norm": 0.17633217573165894, "learning_rate": 3.6451352674039895e-05, - "loss": 0.0212, - "num_input_tokens_seen": 28961984, + "loss": 0.0207, + "num_input_tokens_seen": 29230496, "step": 3850 }, { "epoch": 1.0469949066213922, - "grad_norm": 0.07702454179525375, + "grad_norm": 0.11769000440835953, "learning_rate": 3.6419730146108245e-05, - "loss": 0.0225, - "num_input_tokens_seen": 29001632, + "loss": 0.0224, + "num_input_tokens_seen": 29270448, "step": 3855 }, { "epoch": 1.048353140916808, - "grad_norm": 0.20206433534622192, + "grad_norm": 0.07671354711055756, "learning_rate": 3.638808451235702e-05, - "loss": 0.0246, - "num_input_tokens_seen": 29041344, + "loss": 0.024, + "num_input_tokens_seen": 29310464, "step": 3860 }, { "epoch": 1.049711375212224, - "grad_norm": 0.07146162539720535, + "grad_norm": 0.08134815841913223, "learning_rate": 3.6356415836815586e-05, - "loss": 0.0227, - "num_input_tokens_seen": 29082480, + "loss": 0.0219, + "num_input_tokens_seen": 29351536, "step": 3865 }, { "epoch": 1.0510696095076402, - "grad_norm": 0.08371078968048096, + "grad_norm": 0.10614635795354843, "learning_rate": 3.632472418355996e-05, - "loss": 0.0222, - "num_input_tokens_seen": 29123488, + "loss": 0.0209, + "num_input_tokens_seen": 29392688, "step": 3870 }, { "epoch": 1.052427843803056, - "grad_norm": 0.09149320423603058, + "grad_norm": 0.0913131907582283, "learning_rate": 3.6293009616712645e-05, - "loss": 0.0256, - "num_input_tokens_seen": 29160608, + "loss": 0.0247, + "num_input_tokens_seen": 29430016, "step": 3875 }, { "epoch": 1.053786078098472, - "grad_norm": 0.07225057482719421, + "grad_norm": 0.12118882685899734, "learning_rate": 3.626127220044249e-05, - "loss": 0.0236, - "num_input_tokens_seen": 29195840, + "loss": 0.0239, + "num_input_tokens_seen": 29465616, "step": 3880 }, { "epoch": 1.055144312393888, - "grad_norm": 0.07430986315011978, + "grad_norm": 0.06921906769275665, "learning_rate": 3.6229511998964596e-05, "loss": 0.0205, - "num_input_tokens_seen": 29236224, + "num_input_tokens_seen": 29506656, "step": 3885 }, { "epoch": 1.0565025466893039, - "grad_norm": 0.2048701047897339, + "grad_norm": 0.08091718703508377, "learning_rate": 3.619772907654014e-05, - "loss": 0.0233, - "num_input_tokens_seen": 29271328, + "loss": 0.0234, + "num_input_tokens_seen": 29542320, "step": 3890 }, { "epoch": 1.0578607809847198, - "grad_norm": 0.08069735765457153, + "grad_norm": 0.10159622877836227, "learning_rate": 3.6165923497476316e-05, - "loss": 0.0219, - "num_input_tokens_seen": 29307520, + "loss": 0.0209, + "num_input_tokens_seen": 29578912, "step": 3895 }, { "epoch": 1.0592190152801357, - "grad_norm": 0.18664628267288208, + "grad_norm": 0.09191789478063583, "learning_rate": 3.6134095326126106e-05, - "loss": 0.0253, - "num_input_tokens_seen": 29347200, + "loss": 0.023, + "num_input_tokens_seen": 29618736, "step": 3900 }, { "epoch": 1.0605772495755519, - "grad_norm": 0.1084238812327385, + "grad_norm": 0.08836675435304642, "learning_rate": 3.610224462688824e-05, - "loss": 0.022, - "num_input_tokens_seen": 29386512, + "loss": 0.021, + "num_input_tokens_seen": 29658128, "step": 3905 }, { "epoch": 1.0619354838709678, - "grad_norm": 0.07478206604719162, + "grad_norm": 0.07996409386396408, "learning_rate": 3.6070371464207015e-05, "loss": 0.0219, - "num_input_tokens_seen": 29428176, + "num_input_tokens_seen": 29700496, "step": 3910 }, { "epoch": 1.0632937181663837, - "grad_norm": 0.09718232601881027, + "grad_norm": 0.08598874509334564, "learning_rate": 3.6038475902572175e-05, - "loss": 0.0244, - "num_input_tokens_seen": 29466432, + "loss": 0.023, + "num_input_tokens_seen": 29739248, "step": 3915 }, { "epoch": 1.0646519524617997, - "grad_norm": 0.08483519405126572, + "grad_norm": 0.08084091544151306, "learning_rate": 3.60065580065188e-05, - "loss": 0.0249, - "num_input_tokens_seen": 29504096, + "loss": 0.0237, + "num_input_tokens_seen": 29777200, "step": 3920 }, { "epoch": 1.0660101867572156, - "grad_norm": 0.12965719401836395, + "grad_norm": 0.10807369649410248, "learning_rate": 3.597461784062715e-05, - "loss": 0.0234, - "num_input_tokens_seen": 29542512, + "loss": 0.0206, + "num_input_tokens_seen": 29815744, "step": 3925 }, { "epoch": 1.0673684210526315, - "grad_norm": 0.1676773875951767, + "grad_norm": 0.22470815479755402, "learning_rate": 3.594265546952253e-05, - "loss": 0.0216, - "num_input_tokens_seen": 29579248, + "loss": 0.0225, + "num_input_tokens_seen": 29852736, "step": 3930 }, { "epoch": 1.0687266553480474, - "grad_norm": 0.0902162715792656, + "grad_norm": 0.08870202302932739, "learning_rate": 3.5910670957875204e-05, - "loss": 0.0251, - "num_input_tokens_seen": 29617120, + "loss": 0.0233, + "num_input_tokens_seen": 29890688, "step": 3935 }, { "epoch": 1.0700848896434636, - "grad_norm": 0.08212680369615555, + "grad_norm": 0.07169273495674133, "learning_rate": 3.587866437040021e-05, - "loss": 0.0212, - "num_input_tokens_seen": 29656688, + "loss": 0.0223, + "num_input_tokens_seen": 29930352, "step": 3940 }, { "epoch": 1.0714431239388795, - "grad_norm": 0.18601396679878235, + "grad_norm": 0.29948797821998596, "learning_rate": 3.584663577185727e-05, - "loss": 0.0285, - "num_input_tokens_seen": 29688096, + "loss": 0.0248, + "num_input_tokens_seen": 29962368, "step": 3945 }, { "epoch": 1.0728013582342955, - "grad_norm": 0.3284305930137634, + "grad_norm": 0.08686472475528717, "learning_rate": 3.581458522705062e-05, - "loss": 0.0215, - "num_input_tokens_seen": 29721792, + "loss": 0.0205, + "num_input_tokens_seen": 29996528, "step": 3950 }, { "epoch": 1.0741595925297114, - "grad_norm": 0.08925528079271317, + "grad_norm": 0.1167028546333313, "learning_rate": 3.578251280082892e-05, - "loss": 0.0233, - "num_input_tokens_seen": 29761712, + "loss": 0.0211, + "num_input_tokens_seen": 30036912, "step": 3955 }, { "epoch": 1.0755178268251273, - "grad_norm": 0.15745700895786285, + "grad_norm": 0.073319211602211, "learning_rate": 3.5750418558085084e-05, - "loss": 0.0288, - "num_input_tokens_seen": 29801664, + "loss": 0.0256, + "num_input_tokens_seen": 30077248, "step": 3960 }, { "epoch": 1.0768760611205432, - "grad_norm": 0.0709497258067131, + "grad_norm": 0.06311651319265366, "learning_rate": 3.571830256375619e-05, - "loss": 0.0208, - "num_input_tokens_seen": 29838336, + "loss": 0.0206, + "num_input_tokens_seen": 30114496, "step": 3965 }, { "epoch": 1.0782342954159592, - "grad_norm": 0.09036830812692642, + "grad_norm": 0.06357723474502563, "learning_rate": 3.5686164882823314e-05, - "loss": 0.0206, - "num_input_tokens_seen": 29880336, + "loss": 0.02, + "num_input_tokens_seen": 30156800, "step": 3970 }, { "epoch": 1.0795925297113753, - "grad_norm": 0.07453510165214539, + "grad_norm": 0.08202455192804337, "learning_rate": 3.565400558031141e-05, - "loss": 0.0231, - "num_input_tokens_seen": 29911536, + "loss": 0.0208, + "num_input_tokens_seen": 30188256, "step": 3975 }, { "epoch": 1.0809507640067912, - "grad_norm": 0.18060772120952606, + "grad_norm": 0.0838884562253952, "learning_rate": 3.5621824721289176e-05, - "loss": 0.0281, - "num_input_tokens_seen": 29947296, + "loss": 0.0257, + "num_input_tokens_seen": 30224576, "step": 3980 }, { "epoch": 1.0823089983022072, - "grad_norm": 0.08471189439296722, + "grad_norm": 0.0790356770157814, "learning_rate": 3.558962237086894e-05, - "loss": 0.0231, - "num_input_tokens_seen": 29991440, + "loss": 0.0217, + "num_input_tokens_seen": 30269232, "step": 3985 }, { "epoch": 1.083667232597623, - "grad_norm": 0.08320647478103638, + "grad_norm": 0.08136173337697983, "learning_rate": 3.555739859420651e-05, - "loss": 0.0291, - "num_input_tokens_seen": 30031600, + "loss": 0.0268, + "num_input_tokens_seen": 30309680, "step": 3990 }, { "epoch": 1.085025466893039, - "grad_norm": 0.11337485909461975, + "grad_norm": 0.07531648874282837, "learning_rate": 3.5525153456501036e-05, - "loss": 0.0256, - "num_input_tokens_seen": 30071568, + "loss": 0.0236, + "num_input_tokens_seen": 30350128, "step": 3995 }, { "epoch": 1.086383701188455, - "grad_norm": 0.08390840142965317, + "grad_norm": 0.08252733200788498, "learning_rate": 3.5492887022994896e-05, - "loss": 0.0245, - "num_input_tokens_seen": 30114080, + "loss": 0.024, + "num_input_tokens_seen": 30393136, "step": 4000 }, { "epoch": 1.0877419354838709, - "grad_norm": 0.08553528040647507, + "grad_norm": 0.08160780370235443, "learning_rate": 3.5460599358973554e-05, - "loss": 0.0236, - "num_input_tokens_seen": 30154608, + "loss": 0.0229, + "num_input_tokens_seen": 30433952, "step": 4005 }, { "epoch": 1.089100169779287, - "grad_norm": 0.09116280823945999, + "grad_norm": 0.08231871575117111, "learning_rate": 3.542829052976543e-05, - "loss": 0.0278, - "num_input_tokens_seen": 30187824, + "loss": 0.0255, + "num_input_tokens_seen": 30467488, "step": 4010 }, { "epoch": 1.090458404074703, - "grad_norm": 0.09407933056354523, + "grad_norm": 0.07797353714704514, "learning_rate": 3.539596060074177e-05, - "loss": 0.0223, - "num_input_tokens_seen": 30225536, + "loss": 0.0213, + "num_input_tokens_seen": 30505776, "step": 4015 }, { "epoch": 1.0918166383701189, - "grad_norm": 0.16944824159145355, + "grad_norm": 0.08830048143863678, "learning_rate": 3.5363609637316506e-05, - "loss": 0.0251, - "num_input_tokens_seen": 30262320, + "loss": 0.024, + "num_input_tokens_seen": 30542832, "step": 4020 }, { "epoch": 1.0931748726655348, - "grad_norm": 0.086267851293087, + "grad_norm": 0.07271230965852737, "learning_rate": 3.5331237704946144e-05, - "loss": 0.0234, - "num_input_tokens_seen": 30297968, + "loss": 0.0221, + "num_input_tokens_seen": 30578704, "step": 4025 }, { "epoch": 1.0945331069609507, - "grad_norm": 0.08400574326515198, + "grad_norm": 0.08650582283735275, "learning_rate": 3.529884486912961e-05, - "loss": 0.0227, - "num_input_tokens_seen": 30338368, + "loss": 0.0225, + "num_input_tokens_seen": 30619152, "step": 4030 }, { "epoch": 1.0958913412563667, - "grad_norm": 0.09039977192878723, + "grad_norm": 0.08701939880847931, "learning_rate": 3.5266431195408105e-05, - "loss": 0.0245, - "num_input_tokens_seen": 30380272, + "loss": 0.0236, + "num_input_tokens_seen": 30661632, "step": 4035 }, { "epoch": 1.0972495755517826, - "grad_norm": 0.11315695196390152, + "grad_norm": 0.08764513581991196, "learning_rate": 3.5233996749365025e-05, - "loss": 0.0226, - "num_input_tokens_seen": 30418384, + "loss": 0.0231, + "num_input_tokens_seen": 30700016, "step": 4040 }, { "epoch": 1.0986078098471985, - "grad_norm": 0.1361483335494995, + "grad_norm": 0.06451082974672318, "learning_rate": 3.5201541596625766e-05, - "loss": 0.0213, - "num_input_tokens_seen": 30450944, + "loss": 0.0201, + "num_input_tokens_seen": 30733024, "step": 4045 }, { "epoch": 1.0999660441426147, - "grad_norm": 0.0778949037194252, + "grad_norm": 0.07159922271966934, "learning_rate": 3.5169065802857644e-05, - "loss": 0.0232, - "num_input_tokens_seen": 30485280, + "loss": 0.022, + "num_input_tokens_seen": 30768000, "step": 4050 }, { "epoch": 1.1013242784380306, - "grad_norm": 0.08951839804649353, + "grad_norm": 0.0796465054154396, "learning_rate": 3.513656943376972e-05, - "loss": 0.0221, - "num_input_tokens_seen": 30524416, + "loss": 0.0216, + "num_input_tokens_seen": 30807456, "step": 4055 }, { "epoch": 1.1026825127334465, - "grad_norm": 0.07097277045249939, + "grad_norm": 0.0680486336350441, "learning_rate": 3.51040525551127e-05, - "loss": 0.0214, - "num_input_tokens_seen": 30565936, + "loss": 0.021, + "num_input_tokens_seen": 30849552, "step": 4060 }, { "epoch": 1.1040407470288625, - "grad_norm": 0.07842448353767395, + "grad_norm": 0.07398637384176254, "learning_rate": 3.507151523267878e-05, - "loss": 0.0224, - "num_input_tokens_seen": 30601376, + "loss": 0.0213, + "num_input_tokens_seen": 30885088, "step": 4065 }, { "epoch": 1.1053989813242784, - "grad_norm": 0.0863199308514595, + "grad_norm": 0.25612425804138184, "learning_rate": 3.50389575323015e-05, - "loss": 0.0217, - "num_input_tokens_seen": 30641520, + "loss": 0.0224, + "num_input_tokens_seen": 30925488, "step": 4070 }, { "epoch": 1.1067572156196943, - "grad_norm": 0.06002289429306984, + "grad_norm": 0.0579836368560791, "learning_rate": 3.5006379519855684e-05, - "loss": 0.0222, - "num_input_tokens_seen": 30680128, + "loss": 0.0206, + "num_input_tokens_seen": 30963488, "step": 4075 }, { "epoch": 1.1081154499151102, - "grad_norm": 0.0673811063170433, + "grad_norm": 0.0656048059463501, "learning_rate": 3.4973781261257204e-05, - "loss": 0.0227, - "num_input_tokens_seen": 30717248, + "loss": 0.0218, + "num_input_tokens_seen": 31001040, "step": 4080 }, { "epoch": 1.1094736842105264, - "grad_norm": 0.07987792044878006, + "grad_norm": 0.0953967273235321, "learning_rate": 3.494116282246292e-05, - "loss": 0.0225, - "num_input_tokens_seen": 30748240, + "loss": 0.0224, + "num_input_tokens_seen": 31032720, "step": 4085 }, { "epoch": 1.1108319185059423, - "grad_norm": 0.08532380312681198, + "grad_norm": 0.08868949860334396, "learning_rate": 3.4908524269470514e-05, "loss": 0.0227, - "num_input_tokens_seen": 30785168, + "num_input_tokens_seen": 31070400, "step": 4090 }, { "epoch": 1.1121901528013582, - "grad_norm": 0.09270403534173965, + "grad_norm": 0.16424648463726044, "learning_rate": 3.4875865668318374e-05, - "loss": 0.0192, - "num_input_tokens_seen": 30830752, + "loss": 0.0201, + "num_input_tokens_seen": 31116640, "step": 4095 }, { "epoch": 1.1135483870967742, - "grad_norm": 0.06957785040140152, + "grad_norm": 0.07035622000694275, "learning_rate": 3.484318708508545e-05, - "loss": 0.0219, - "num_input_tokens_seen": 30868048, + "loss": 0.0208, + "num_input_tokens_seen": 31154240, "step": 4100 }, { "epoch": 1.11490662139219, - "grad_norm": 0.12577490508556366, + "grad_norm": 0.08726867288351059, "learning_rate": 3.4810488585891106e-05, - "loss": 0.0255, - "num_input_tokens_seen": 30904384, + "loss": 0.0251, + "num_input_tokens_seen": 31190416, "step": 4105 }, { "epoch": 1.116264855687606, - "grad_norm": 0.0762590691447258, + "grad_norm": 0.0733194425702095, "learning_rate": 3.4777770236895026e-05, - "loss": 0.0243, - "num_input_tokens_seen": 30937936, + "loss": 0.0233, + "num_input_tokens_seen": 31224656, "step": 4110 }, { "epoch": 1.117623089983022, - "grad_norm": 0.09059552848339081, + "grad_norm": 0.11096934229135513, "learning_rate": 3.474503210429705e-05, - "loss": 0.0217, - "num_input_tokens_seen": 30972208, + "loss": 0.0211, + "num_input_tokens_seen": 31259040, "step": 4115 }, { "epoch": 1.118981324278438, - "grad_norm": 0.08072424679994583, + "grad_norm": 0.08438804000616074, "learning_rate": 3.471227425433703e-05, - "loss": 0.0241, - "num_input_tokens_seen": 31006176, + "loss": 0.0225, + "num_input_tokens_seen": 31293536, "step": 4120 }, { "epoch": 1.120339558573854, - "grad_norm": 0.07660835236310959, + "grad_norm": 0.22982807457447052, "learning_rate": 3.4679496753294746e-05, - "loss": 0.0206, - "num_input_tokens_seen": 31042960, + "loss": 0.0218, + "num_input_tokens_seen": 31331024, "step": 4125 }, { "epoch": 1.12169779286927, - "grad_norm": 0.07504867762327194, + "grad_norm": 0.12264291942119598, "learning_rate": 3.46466996674897e-05, - "loss": 0.0197, - "num_input_tokens_seen": 31083104, + "loss": 0.0195, + "num_input_tokens_seen": 31371136, "step": 4130 }, { "epoch": 1.1230560271646859, - "grad_norm": 0.09652435779571533, + "grad_norm": 0.07922866940498352, "learning_rate": 3.461388306328104e-05, - "loss": 0.0213, - "num_input_tokens_seen": 31116976, + "loss": 0.022, + "num_input_tokens_seen": 31405056, "step": 4135 }, { "epoch": 1.1244142614601018, - "grad_norm": 0.0947829857468605, + "grad_norm": 0.0855497345328331, "learning_rate": 3.458104700706742e-05, - "loss": 0.0227, - "num_input_tokens_seen": 31158496, + "loss": 0.0215, + "num_input_tokens_seen": 31447184, "step": 4140 }, { "epoch": 1.1257724957555177, - "grad_norm": 0.08102014660835266, + "grad_norm": 0.06249786168336868, "learning_rate": 3.454819156528682e-05, - "loss": 0.0239, - "num_input_tokens_seen": 31194672, + "loss": 0.0238, + "num_input_tokens_seen": 31483744, "step": 4145 }, { "epoch": 1.1271307300509337, - "grad_norm": 0.0878044068813324, + "grad_norm": 0.06895138323307037, "learning_rate": 3.4515316804416465e-05, - "loss": 0.0226, - "num_input_tokens_seen": 31233568, + "loss": 0.0231, + "num_input_tokens_seen": 31522880, "step": 4150 }, { "epoch": 1.1284889643463498, - "grad_norm": 0.07816632091999054, + "grad_norm": 0.06270083785057068, "learning_rate": 3.448242279097267e-05, - "loss": 0.0233, - "num_input_tokens_seen": 31276192, + "loss": 0.0231, + "num_input_tokens_seen": 31565936, "step": 4155 }, { "epoch": 1.1298471986417657, - "grad_norm": 0.06844462454319, + "grad_norm": 0.06782715767621994, "learning_rate": 3.444950959151068e-05, - "loss": 0.0207, - "num_input_tokens_seen": 31315392, + "loss": 0.0187, + "num_input_tokens_seen": 31605536, "step": 4160 }, { "epoch": 1.1312054329371817, - "grad_norm": 0.05639954283833504, + "grad_norm": 0.05129249021410942, "learning_rate": 3.4416577272624604e-05, - "loss": 0.0237, - "num_input_tokens_seen": 31355760, + "loss": 0.0224, + "num_input_tokens_seen": 31646080, "step": 4165 }, { "epoch": 1.1325636672325976, - "grad_norm": 0.08227114379405975, + "grad_norm": 0.1041538193821907, "learning_rate": 3.438362590094719e-05, - "loss": 0.0212, - "num_input_tokens_seen": 31391136, + "loss": 0.0209, + "num_input_tokens_seen": 31681472, "step": 4170 }, { "epoch": 1.1339219015280135, - "grad_norm": 0.08707451075315475, + "grad_norm": 0.08165684342384338, "learning_rate": 3.4350655543149754e-05, - "loss": 0.0244, - "num_input_tokens_seen": 31430480, + "loss": 0.0232, + "num_input_tokens_seen": 31721120, "step": 4175 }, { "epoch": 1.1352801358234295, - "grad_norm": 0.07988372445106506, + "grad_norm": 0.059737276285886765, "learning_rate": 3.431766626594204e-05, - "loss": 0.0227, - "num_input_tokens_seen": 31464544, + "loss": 0.0211, + "num_input_tokens_seen": 31755712, "step": 4180 }, { "epoch": 1.1366383701188454, - "grad_norm": 0.07727611809968948, + "grad_norm": 0.09786748141050339, "learning_rate": 3.4284658136072054e-05, - "loss": 0.0259, - "num_input_tokens_seen": 31503728, + "loss": 0.0246, + "num_input_tokens_seen": 31795376, "step": 4185 }, { "epoch": 1.1379966044142615, - "grad_norm": 0.14389005303382874, + "grad_norm": 0.1641320139169693, "learning_rate": 3.425163122032595e-05, - "loss": 0.0205, - "num_input_tokens_seen": 31545168, + "loss": 0.0203, + "num_input_tokens_seen": 31836928, "step": 4190 }, { "epoch": 1.1393548387096775, - "grad_norm": 0.12024204432964325, + "grad_norm": 0.07092893123626709, "learning_rate": 3.421858558552789e-05, - "loss": 0.0223, - "num_input_tokens_seen": 31587264, + "loss": 0.0205, + "num_input_tokens_seen": 31879424, "step": 4195 }, { "epoch": 1.1407130730050934, - "grad_norm": 0.0743388682603836, + "grad_norm": 0.0681806355714798, "learning_rate": 3.418552129853994e-05, - "loss": 0.0231, - "num_input_tokens_seen": 31625376, + "loss": 0.0221, + "num_input_tokens_seen": 31917488, "step": 4200 }, { "epoch": 1.1420713073005093, - "grad_norm": 0.10065428167581558, + "grad_norm": 0.11347433179616928, "learning_rate": 3.4152438426261856e-05, - "loss": 0.0255, - "num_input_tokens_seen": 31663200, + "loss": 0.0256, + "num_input_tokens_seen": 31955568, "step": 4205 }, { "epoch": 1.1434295415959252, - "grad_norm": 0.08210783451795578, + "grad_norm": 0.07658606767654419, "learning_rate": 3.4119337035631036e-05, - "loss": 0.0239, - "num_input_tokens_seen": 31702640, + "loss": 0.0225, + "num_input_tokens_seen": 31995088, "step": 4210 }, { "epoch": 1.1447877758913412, - "grad_norm": 0.08425233513116837, + "grad_norm": 0.10462809354066849, "learning_rate": 3.408621719362233e-05, - "loss": 0.0257, - "num_input_tokens_seen": 31740672, + "loss": 0.0255, + "num_input_tokens_seen": 32034016, "step": 4215 }, { "epoch": 1.146146010186757, - "grad_norm": 0.08219177275896072, + "grad_norm": 0.07898668199777603, "learning_rate": 3.405307896724792e-05, - "loss": 0.0211, - "num_input_tokens_seen": 31775904, + "loss": 0.0207, + "num_input_tokens_seen": 32069776, "step": 4220 }, { "epoch": 1.1475042444821733, - "grad_norm": 0.07964393496513367, + "grad_norm": 0.07820317894220352, "learning_rate": 3.401992242355721e-05, - "loss": 0.0237, - "num_input_tokens_seen": 31815056, + "loss": 0.0228, + "num_input_tokens_seen": 32109216, "step": 4225 }, { "epoch": 1.1488624787775892, - "grad_norm": 0.09721174836158752, + "grad_norm": 0.10197337716817856, "learning_rate": 3.3986747629636635e-05, - "loss": 0.0221, - "num_input_tokens_seen": 31853360, + "loss": 0.0208, + "num_input_tokens_seen": 32148640, "step": 4230 }, { "epoch": 1.150220713073005, - "grad_norm": 0.0736435204744339, + "grad_norm": 0.08106466382741928, "learning_rate": 3.395355465260955e-05, - "loss": 0.0234, - "num_input_tokens_seen": 31887968, + "loss": 0.0222, + "num_input_tokens_seen": 32183632, "step": 4235 }, { "epoch": 1.151578947368421, - "grad_norm": 0.08828839659690857, + "grad_norm": 0.07396421581506729, "learning_rate": 3.3920343559636144e-05, - "loss": 0.0217, - "num_input_tokens_seen": 31923312, + "loss": 0.0214, + "num_input_tokens_seen": 32219104, "step": 4240 }, { "epoch": 1.152937181663837, - "grad_norm": 0.10559400171041489, + "grad_norm": 0.08965013176202774, "learning_rate": 3.388711441791322e-05, - "loss": 0.024, - "num_input_tokens_seen": 31954160, + "loss": 0.0232, + "num_input_tokens_seen": 32250560, "step": 4245 }, { "epoch": 1.154295415959253, - "grad_norm": 0.08508122712373734, + "grad_norm": 0.09419962018728256, "learning_rate": 3.385386729467413e-05, - "loss": 0.024, - "num_input_tokens_seen": 31988080, + "loss": 0.0234, + "num_input_tokens_seen": 32284896, "step": 4250 }, { "epoch": 1.1556536502546688, - "grad_norm": 0.08687109500169754, + "grad_norm": 0.07916060090065002, "learning_rate": 3.382060225718857e-05, - "loss": 0.0251, - "num_input_tokens_seen": 32025920, + "loss": 0.0241, + "num_input_tokens_seen": 32323024, "step": 4255 }, { "epoch": 1.157011884550085, - "grad_norm": 0.07195305824279785, + "grad_norm": 0.0666729286313057, "learning_rate": 3.378731937276252e-05, - "loss": 0.0221, - "num_input_tokens_seen": 32066400, + "loss": 0.0217, + "num_input_tokens_seen": 32363872, "step": 4260 }, { "epoch": 1.158370118845501, - "grad_norm": 0.08253200352191925, + "grad_norm": 0.07671894878149033, "learning_rate": 3.375401870873805e-05, - "loss": 0.0242, - "num_input_tokens_seen": 32102272, + "loss": 0.0241, + "num_input_tokens_seen": 32400448, "step": 4265 }, { "epoch": 1.1597283531409168, - "grad_norm": 0.07737712562084198, + "grad_norm": 0.06984855979681015, "learning_rate": 3.372070033249321e-05, - "loss": 0.0241, - "num_input_tokens_seen": 32138880, + "loss": 0.0227, + "num_input_tokens_seen": 32437456, "step": 4270 }, { "epoch": 1.1610865874363328, - "grad_norm": 0.06543577462434769, + "grad_norm": 0.06606384366750717, "learning_rate": 3.3687364311441885e-05, - "loss": 0.0215, - "num_input_tokens_seen": 32176208, + "loss": 0.0227, + "num_input_tokens_seen": 32474752, "step": 4275 }, { "epoch": 1.1624448217317487, - "grad_norm": 0.06374019384384155, + "grad_norm": 0.06127290800213814, "learning_rate": 3.365401071303367e-05, - "loss": 0.0222, - "num_input_tokens_seen": 32215888, + "loss": 0.0214, + "num_input_tokens_seen": 32515216, "step": 4280 }, { "epoch": 1.1638030560271646, - "grad_norm": 0.0881289541721344, + "grad_norm": 0.0846126452088356, "learning_rate": 3.36206396047537e-05, - "loss": 0.0263, - "num_input_tokens_seen": 32251712, + "loss": 0.0251, + "num_input_tokens_seen": 32551328, "step": 4285 }, { "epoch": 1.1651612903225805, - "grad_norm": 0.0876409113407135, + "grad_norm": 0.11235252022743225, "learning_rate": 3.358725105412257e-05, - "loss": 0.0237, - "num_input_tokens_seen": 32293696, + "loss": 0.0227, + "num_input_tokens_seen": 32593440, "step": 4290 }, { "epoch": 1.1665195246179967, - "grad_norm": 0.07405570894479752, + "grad_norm": 0.0739407017827034, "learning_rate": 3.3553845128696153e-05, - "loss": 0.0217, - "num_input_tokens_seen": 32336640, + "loss": 0.0211, + "num_input_tokens_seen": 32635824, "step": 4295 }, { "epoch": 1.1678777589134126, - "grad_norm": 0.09775377064943314, + "grad_norm": 0.08186372369527817, "learning_rate": 3.352042189606547e-05, - "loss": 0.0231, - "num_input_tokens_seen": 32367760, + "loss": 0.0218, + "num_input_tokens_seen": 32667264, "step": 4300 }, { "epoch": 1.1692359932088285, - "grad_norm": 0.10404365509748459, + "grad_norm": 0.14341872930526733, "learning_rate": 3.348698142385657e-05, - "loss": 0.0222, - "num_input_tokens_seen": 32404416, + "loss": 0.0233, + "num_input_tokens_seen": 32704656, "step": 4305 }, { "epoch": 1.1705942275042445, - "grad_norm": 0.09576475620269775, + "grad_norm": 0.12739844620227814, "learning_rate": 3.345352377973037e-05, - "loss": 0.0235, - "num_input_tokens_seen": 32441200, + "loss": 0.0231, + "num_input_tokens_seen": 32741616, "step": 4310 }, { "epoch": 1.1719524617996604, - "grad_norm": 0.07835312932729721, + "grad_norm": 0.07624967396259308, "learning_rate": 3.342004903138254e-05, - "loss": 0.022, - "num_input_tokens_seen": 32482528, + "loss": 0.0214, + "num_input_tokens_seen": 32783488, "step": 4315 }, { "epoch": 1.1733106960950763, - "grad_norm": 0.07558625191450119, + "grad_norm": 0.07121070474386215, "learning_rate": 3.338655724654337e-05, - "loss": 0.0235, - "num_input_tokens_seen": 32516576, + "loss": 0.0223, + "num_input_tokens_seen": 32817568, "step": 4320 }, { "epoch": 1.1746689303904922, - "grad_norm": 0.09693360328674316, + "grad_norm": 0.08026662468910217, "learning_rate": 3.33530484929776e-05, - "loss": 0.0241, - "num_input_tokens_seen": 32553840, + "loss": 0.0229, + "num_input_tokens_seen": 32855280, "step": 4325 }, { "epoch": 1.1760271646859084, - "grad_norm": 0.0833030492067337, + "grad_norm": 0.08224073797464371, "learning_rate": 3.3319522838484317e-05, - "loss": 0.0235, - "num_input_tokens_seen": 32595136, + "loss": 0.0229, + "num_input_tokens_seen": 32896624, "step": 4330 }, { "epoch": 1.1773853989813243, - "grad_norm": 0.08042357861995697, + "grad_norm": 0.07047872990369797, "learning_rate": 3.328598035089679e-05, - "loss": 0.023, - "num_input_tokens_seen": 32636624, + "loss": 0.0224, + "num_input_tokens_seen": 32938336, "step": 4335 }, { "epoch": 1.1787436332767403, - "grad_norm": 0.08195077627897263, + "grad_norm": 0.07714972645044327, "learning_rate": 3.325242109808237e-05, - "loss": 0.0267, - "num_input_tokens_seen": 32674848, + "loss": 0.0268, + "num_input_tokens_seen": 32977424, "step": 4340 }, { "epoch": 1.1801018675721562, - "grad_norm": 0.08427385985851288, + "grad_norm": 0.08624029904603958, "learning_rate": 3.3218845147942305e-05, - "loss": 0.0234, - "num_input_tokens_seen": 32710944, + "loss": 0.0225, + "num_input_tokens_seen": 33013696, "step": 4345 }, { "epoch": 1.181460101867572, - "grad_norm": 0.07809655368328094, + "grad_norm": 0.06990043818950653, "learning_rate": 3.318525256841163e-05, - "loss": 0.0226, - "num_input_tokens_seen": 32752080, + "loss": 0.0228, + "num_input_tokens_seen": 33054928, "step": 4350 }, { "epoch": 1.182818336162988, - "grad_norm": 0.07341457158327103, + "grad_norm": 0.06852797418832779, "learning_rate": 3.315164342745904e-05, - "loss": 0.0225, - "num_input_tokens_seen": 32787312, + "loss": 0.0219, + "num_input_tokens_seen": 33091536, "step": 4355 }, { "epoch": 1.184176570458404, - "grad_norm": 0.08349103480577469, + "grad_norm": 0.11895160377025604, "learning_rate": 3.311801779308674e-05, - "loss": 0.0226, - "num_input_tokens_seen": 32827120, + "loss": 0.0218, + "num_input_tokens_seen": 33131824, "step": 4360 }, { "epoch": 1.1855348047538201, - "grad_norm": 0.08617093414068222, + "grad_norm": 0.08010344207286835, "learning_rate": 3.308437573333028e-05, - "loss": 0.0236, - "num_input_tokens_seen": 32866240, + "loss": 0.0218, + "num_input_tokens_seen": 33171488, "step": 4365 }, { "epoch": 1.186893039049236, - "grad_norm": 0.08082292973995209, + "grad_norm": 0.4691314101219177, "learning_rate": 3.305071731625849e-05, - "loss": 0.0232, - "num_input_tokens_seen": 32906288, + "loss": 0.0239, + "num_input_tokens_seen": 33211872, "step": 4370 }, { "epoch": 1.188251273344652, - "grad_norm": 0.08806200325489044, + "grad_norm": 0.07850786298513412, "learning_rate": 3.3017042609973254e-05, - "loss": 0.0203, - "num_input_tokens_seen": 32942688, + "loss": 0.0207, + "num_input_tokens_seen": 33249024, "step": 4375 }, { "epoch": 1.189609507640068, - "grad_norm": 0.2596215605735779, + "grad_norm": 0.11123881489038467, "learning_rate": 3.298335168260943e-05, - "loss": 0.0216, - "num_input_tokens_seen": 32976688, + "loss": 0.0208, + "num_input_tokens_seen": 33283472, "step": 4380 }, { "epoch": 1.1909677419354838, - "grad_norm": 0.06558528542518616, + "grad_norm": 0.06519312411546707, "learning_rate": 3.29496446023347e-05, - "loss": 0.0217, - "num_input_tokens_seen": 33014640, + "loss": 0.0218, + "num_input_tokens_seen": 33321856, "step": 4385 }, { "epoch": 1.1923259762308998, - "grad_norm": 0.0638236254453659, + "grad_norm": 0.09924015402793884, "learning_rate": 3.2915921437349415e-05, - "loss": 0.0239, - "num_input_tokens_seen": 33052944, + "loss": 0.0237, + "num_input_tokens_seen": 33359744, "step": 4390 }, { "epoch": 1.1936842105263157, - "grad_norm": 0.0699969157576561, + "grad_norm": 0.06818155199289322, "learning_rate": 3.288218225588649e-05, - "loss": 0.0216, - "num_input_tokens_seen": 33096576, + "loss": 0.0202, + "num_input_tokens_seen": 33403776, "step": 4395 }, { "epoch": 1.1950424448217318, - "grad_norm": 0.09398026764392853, + "grad_norm": 0.09042264521121979, "learning_rate": 3.284842712621123e-05, - "loss": 0.0253, - "num_input_tokens_seen": 33142736, + "loss": 0.0247, + "num_input_tokens_seen": 33449840, "step": 4400 }, { "epoch": 1.1964006791171478, - "grad_norm": 0.07301928102970123, + "grad_norm": 0.07173609733581543, "learning_rate": 3.281465611662122e-05, - "loss": 0.0284, - "num_input_tokens_seen": 33182752, + "loss": 0.0276, + "num_input_tokens_seen": 33490064, "step": 4405 }, { "epoch": 1.1977589134125637, - "grad_norm": 0.078556589782238, + "grad_norm": 0.07773589342832565, "learning_rate": 3.278086929544618e-05, - "loss": 0.0271, - "num_input_tokens_seen": 33214416, + "loss": 0.0256, + "num_input_tokens_seen": 33522352, "step": 4410 }, { "epoch": 1.1991171477079796, - "grad_norm": 0.10160329192876816, + "grad_norm": 0.10161643475294113, "learning_rate": 3.2747066731047795e-05, - "loss": 0.0233, - "num_input_tokens_seen": 33253968, + "loss": 0.0216, + "num_input_tokens_seen": 33562240, "step": 4415 }, { "epoch": 1.2004753820033955, - "grad_norm": 0.08325953781604767, + "grad_norm": 0.1709461808204651, "learning_rate": 3.271324849181963e-05, - "loss": 0.0225, - "num_input_tokens_seen": 33289776, + "loss": 0.0224, + "num_input_tokens_seen": 33597888, "step": 4420 }, { "epoch": 1.2018336162988115, - "grad_norm": 0.08193095028400421, + "grad_norm": 0.1386878937482834, "learning_rate": 3.267941464618695e-05, "loss": 0.0265, - "num_input_tokens_seen": 33327056, + "num_input_tokens_seen": 33635376, "step": 4425 }, { "epoch": 1.2031918505942274, - "grad_norm": 0.074809230864048, + "grad_norm": 0.06184517592191696, "learning_rate": 3.2645565262606616e-05, - "loss": 0.0227, - "num_input_tokens_seen": 33363488, + "loss": 0.0213, + "num_input_tokens_seen": 33671616, "step": 4430 }, { "epoch": 1.2045500848896435, - "grad_norm": 0.06793966889381409, + "grad_norm": 0.06430092453956604, "learning_rate": 3.26117004095669e-05, - "loss": 0.0208, - "num_input_tokens_seen": 33403392, + "loss": 0.0204, + "num_input_tokens_seen": 33711760, "step": 4435 }, { "epoch": 1.2059083191850595, - "grad_norm": 0.07700396329164505, + "grad_norm": 0.25074464082717896, "learning_rate": 3.2577820155587396e-05, - "loss": 0.0235, - "num_input_tokens_seen": 33440976, + "loss": 0.023, + "num_input_tokens_seen": 33749312, "step": 4440 }, { "epoch": 1.2072665534804754, - "grad_norm": 0.0702715665102005, + "grad_norm": 0.06976979970932007, "learning_rate": 3.254392456921885e-05, - "loss": 0.0208, - "num_input_tokens_seen": 33482096, + "loss": 0.0199, + "num_input_tokens_seen": 33791232, "step": 4445 }, { "epoch": 1.2086247877758913, - "grad_norm": 0.0778246819972992, + "grad_norm": 0.12315336614847183, "learning_rate": 3.251001371904303e-05, - "loss": 0.0235, - "num_input_tokens_seen": 33518560, + "loss": 0.023, + "num_input_tokens_seen": 33827984, "step": 4450 }, { "epoch": 1.2099830220713073, - "grad_norm": 0.08308003097772598, + "grad_norm": 0.08373142033815384, "learning_rate": 3.2476087673672594e-05, - "loss": 0.0249, - "num_input_tokens_seen": 33558240, + "loss": 0.023, + "num_input_tokens_seen": 33868528, "step": 4455 }, { "epoch": 1.2113412563667232, - "grad_norm": 0.06911133229732513, + "grad_norm": 0.06499312072992325, "learning_rate": 3.244214650175094e-05, - "loss": 0.0239, - "num_input_tokens_seen": 33593744, + "loss": 0.0233, + "num_input_tokens_seen": 33904544, "step": 4460 }, { "epoch": 1.2126994906621391, - "grad_norm": 0.07607311755418777, + "grad_norm": 0.07466400414705276, "learning_rate": 3.240819027195207e-05, - "loss": 0.0269, - "num_input_tokens_seen": 33631024, + "loss": 0.0264, + "num_input_tokens_seen": 33942384, "step": 4465 }, { "epoch": 1.2140577249575553, - "grad_norm": 0.065288245677948, + "grad_norm": 0.12799307703971863, "learning_rate": 3.237421905298045e-05, - "loss": 0.0234, - "num_input_tokens_seen": 33672848, + "loss": 0.023, + "num_input_tokens_seen": 33984704, "step": 4470 }, { "epoch": 1.2154159592529712, - "grad_norm": 0.07402539253234863, + "grad_norm": 0.07542445510625839, "learning_rate": 3.234023291357089e-05, - "loss": 0.021, - "num_input_tokens_seen": 33713648, + "loss": 0.0198, + "num_input_tokens_seen": 34025920, "step": 4475 }, { "epoch": 1.2167741935483871, - "grad_norm": 0.0946425348520279, + "grad_norm": 0.09491270780563354, "learning_rate": 3.230623192248836e-05, - "loss": 0.023, - "num_input_tokens_seen": 33753344, + "loss": 0.022, + "num_input_tokens_seen": 34065200, "step": 4480 }, { "epoch": 1.218132427843803, - "grad_norm": 0.09493868052959442, + "grad_norm": 0.39163219928741455, "learning_rate": 3.2272216148527915e-05, - "loss": 0.0211, - "num_input_tokens_seen": 33795872, + "loss": 0.0222, + "num_input_tokens_seen": 34108176, "step": 4485 }, { "epoch": 1.219490662139219, - "grad_norm": 0.09150122851133347, + "grad_norm": 0.07612796872854233, "learning_rate": 3.2238185660514495e-05, - "loss": 0.0188, - "num_input_tokens_seen": 33829360, + "loss": 0.0184, + "num_input_tokens_seen": 34141968, "step": 4490 }, { "epoch": 1.220848896434635, - "grad_norm": 0.07608231902122498, + "grad_norm": 0.10126037895679474, "learning_rate": 3.2204140527302826e-05, - "loss": 0.0202, - "num_input_tokens_seen": 33869264, + "loss": 0.0198, + "num_input_tokens_seen": 34182016, "step": 4495 }, { "epoch": 1.2222071307300508, - "grad_norm": 0.07710189372301102, + "grad_norm": 0.21762922406196594, "learning_rate": 3.217008081777726e-05, - "loss": 0.0219, - "num_input_tokens_seen": 33904160, + "loss": 0.0215, + "num_input_tokens_seen": 34218256, "step": 4500 }, { "epoch": 1.223565365025467, - "grad_norm": 0.08437623828649521, + "grad_norm": 0.07319148629903793, "learning_rate": 3.213600660085163e-05, - "loss": 0.0217, - "num_input_tokens_seen": 33942064, + "loss": 0.0218, + "num_input_tokens_seen": 34256608, "step": 4505 }, { "epoch": 1.224923599320883, - "grad_norm": 0.12986433506011963, + "grad_norm": 0.08190125226974487, "learning_rate": 3.210191794546914e-05, - "loss": 0.0238, - "num_input_tokens_seen": 33978832, + "loss": 0.0242, + "num_input_tokens_seen": 34293488, "step": 4510 }, { "epoch": 1.2262818336162988, - "grad_norm": 0.08053214848041534, + "grad_norm": 0.07490704953670502, "learning_rate": 3.20678149206022e-05, - "loss": 0.0248, - "num_input_tokens_seen": 34015440, + "loss": 0.0246, + "num_input_tokens_seen": 34330480, "step": 4515 }, { "epoch": 1.2276400679117148, - "grad_norm": 0.07427144795656204, + "grad_norm": 0.0755556970834732, "learning_rate": 3.20336975952523e-05, - "loss": 0.0229, - "num_input_tokens_seen": 34049456, + "loss": 0.0251, + "num_input_tokens_seen": 34364656, "step": 4520 }, { "epoch": 1.2289983022071307, - "grad_norm": 0.07779482752084732, + "grad_norm": 0.18192873895168304, "learning_rate": 3.1999566038449846e-05, - "loss": 0.0217, - "num_input_tokens_seen": 34091008, + "loss": 0.0214, + "num_input_tokens_seen": 34406288, "step": 4525 }, { "epoch": 1.2303565365025466, - "grad_norm": 0.06058354675769806, + "grad_norm": 0.06207035854458809, "learning_rate": 3.196542031925407e-05, - "loss": 0.025, - "num_input_tokens_seen": 34128528, + "loss": 0.0248, + "num_input_tokens_seen": 34444704, "step": 4530 }, { "epoch": 1.2317147707979625, - "grad_norm": 0.08317749202251434, + "grad_norm": 0.0937003344297409, "learning_rate": 3.1931260506752844e-05, - "loss": 0.0239, - "num_input_tokens_seen": 34170848, + "loss": 0.0244, + "num_input_tokens_seen": 34487392, "step": 4535 }, { "epoch": 1.2330730050933787, - "grad_norm": 0.08458400517702103, + "grad_norm": 0.10253455489873886, "learning_rate": 3.189708667006253e-05, - "loss": 0.0265, - "num_input_tokens_seen": 34211760, + "loss": 0.0271, + "num_input_tokens_seen": 34528848, "step": 4540 }, { "epoch": 1.2344312393887946, - "grad_norm": 0.06555715948343277, + "grad_norm": 0.36243322491645813, "learning_rate": 3.18628988783279e-05, - "loss": 0.0221, - "num_input_tokens_seen": 34249504, + "loss": 0.0227, + "num_input_tokens_seen": 34566624, "step": 4545 }, { "epoch": 1.2357894736842105, - "grad_norm": 0.0697271004319191, + "grad_norm": 0.07938670367002487, "learning_rate": 3.1828697200721954e-05, - "loss": 0.0223, - "num_input_tokens_seen": 34284432, + "loss": 0.0214, + "num_input_tokens_seen": 34601664, "step": 4550 }, { "epoch": 1.2371477079796265, - "grad_norm": 0.06767825782299042, + "grad_norm": 0.11234685778617859, "learning_rate": 3.1794481706445776e-05, - "loss": 0.0242, - "num_input_tokens_seen": 34320736, + "loss": 0.0232, + "num_input_tokens_seen": 34637920, "step": 4555 }, { "epoch": 1.2385059422750424, - "grad_norm": 0.4156624376773834, + "grad_norm": 0.13407264649868011, "learning_rate": 3.1760252464728425e-05, - "loss": 0.0242, - "num_input_tokens_seen": 34357680, + "loss": 0.0246, + "num_input_tokens_seen": 34674976, "step": 4560 }, { "epoch": 1.2398641765704583, - "grad_norm": 0.09499278664588928, + "grad_norm": 0.3402995467185974, "learning_rate": 3.1726009544826756e-05, - "loss": 0.0227, - "num_input_tokens_seen": 34397088, + "loss": 0.0242, + "num_input_tokens_seen": 34715008, "step": 4565 }, { "epoch": 1.2412224108658743, - "grad_norm": 0.0653809905052185, + "grad_norm": 0.06808853894472122, "learning_rate": 3.1691753016025306e-05, - "loss": 0.0212, - "num_input_tokens_seen": 34433472, + "loss": 0.0221, + "num_input_tokens_seen": 34751632, "step": 4570 }, { "epoch": 1.2425806451612904, - "grad_norm": 0.0883389562368393, + "grad_norm": 0.16082559525966644, "learning_rate": 3.165748294763615e-05, - "loss": 0.0271, - "num_input_tokens_seen": 34473232, + "loss": 0.0294, + "num_input_tokens_seen": 34792512, "step": 4575 }, { "epoch": 1.2439388794567063, - "grad_norm": 0.29173022508621216, + "grad_norm": 0.1406860500574112, "learning_rate": 3.162319940899876e-05, - "loss": 0.0226, - "num_input_tokens_seen": 34509584, + "loss": 0.0216, + "num_input_tokens_seen": 34829344, "step": 4580 }, { "epoch": 1.2452971137521223, - "grad_norm": 0.11234007775783539, + "grad_norm": 0.7184796929359436, "learning_rate": 3.158890246947985e-05, - "loss": 0.0239, - "num_input_tokens_seen": 34548752, + "loss": 0.0272, + "num_input_tokens_seen": 34868992, "step": 4585 }, { "epoch": 1.2466553480475382, - "grad_norm": 0.1039336696267128, + "grad_norm": 0.19610410928726196, "learning_rate": 3.1554592198473266e-05, - "loss": 0.0218, - "num_input_tokens_seen": 34586288, + "loss": 0.0314, + "num_input_tokens_seen": 34906688, "step": 4590 }, { "epoch": 1.2480135823429541, - "grad_norm": 0.08585210889577866, + "grad_norm": 0.1024790033698082, "learning_rate": 3.152026866539981e-05, - "loss": 0.0237, - "num_input_tokens_seen": 34623504, + "loss": 0.0259, + "num_input_tokens_seen": 34944160, "step": 4595 }, { "epoch": 1.24937181663837, - "grad_norm": 0.13560838997364044, + "grad_norm": 0.08354140818119049, "learning_rate": 3.1485931939707155e-05, - "loss": 0.0204, - "num_input_tokens_seen": 34658960, + "loss": 0.0211, + "num_input_tokens_seen": 34979648, "step": 4600 }, { "epoch": 1.250730050933786, - "grad_norm": 0.07716051489114761, + "grad_norm": 0.12200751900672913, "learning_rate": 3.14515820908696e-05, - "loss": 0.0244, - "num_input_tokens_seen": 34695728, + "loss": 0.0256, + "num_input_tokens_seen": 35016944, "step": 4605 }, { "epoch": 1.2520882852292021, - "grad_norm": 0.08421866595745087, + "grad_norm": 0.14204563200473785, "learning_rate": 3.1417219188388056e-05, "loss": 0.0249, - "num_input_tokens_seen": 34731024, + "num_input_tokens_seen": 35052448, "step": 4610 }, { "epoch": 1.253446519524618, - "grad_norm": 0.07156237214803696, + "grad_norm": 0.08100078999996185, "learning_rate": 3.1382843301789814e-05, "loss": 0.0213, - "num_input_tokens_seen": 34766976, + "num_input_tokens_seen": 35088624, "step": 4615 }, { "epoch": 1.254804753820034, - "grad_norm": 0.08946319669485092, + "grad_norm": 0.32276859879493713, "learning_rate": 3.134845450062846e-05, - "loss": 0.0212, - "num_input_tokens_seen": 34803680, + "loss": 0.0227, + "num_input_tokens_seen": 35125520, "step": 4620 }, { "epoch": 1.25616298811545, - "grad_norm": 0.25089654326438904, + "grad_norm": 0.09568215161561966, "learning_rate": 3.131405285448369e-05, "loss": 0.026, - "num_input_tokens_seen": 34842416, + "num_input_tokens_seen": 35164784, "step": 4625 }, { "epoch": 1.2575212224108658, - "grad_norm": 0.0892956480383873, + "grad_norm": 0.17259132862091064, "learning_rate": 3.12796384329612e-05, - "loss": 0.025, - "num_input_tokens_seen": 34882272, + "loss": 0.0246, + "num_input_tokens_seen": 35204976, "step": 4630 }, { "epoch": 1.2588794567062818, - "grad_norm": 0.070553719997406, + "grad_norm": 0.20614652335643768, "learning_rate": 3.124521130569253e-05, - "loss": 0.0226, - "num_input_tokens_seen": 34920832, + "loss": 0.0252, + "num_input_tokens_seen": 35243648, "step": 4635 }, { "epoch": 1.2602376910016977, - "grad_norm": 0.09762918204069138, + "grad_norm": 0.09791337698698044, "learning_rate": 3.1210771542334925e-05, - "loss": 0.0223, - "num_input_tokens_seen": 34955600, + "loss": 0.0237, + "num_input_tokens_seen": 35278752, "step": 4640 }, { "epoch": 1.2615959252971138, - "grad_norm": 0.09758062660694122, + "grad_norm": 0.18192966282367706, "learning_rate": 3.1176319212571206e-05, - "loss": 0.0216, - "num_input_tokens_seen": 34992816, + "loss": 0.0254, + "num_input_tokens_seen": 35315984, "step": 4645 }, { "epoch": 1.2629541595925298, - "grad_norm": 0.09502190351486206, + "grad_norm": 0.3575439751148224, "learning_rate": 3.1141854386109617e-05, - "loss": 0.0233, - "num_input_tokens_seen": 35029440, + "loss": 0.0256, + "num_input_tokens_seen": 35353040, "step": 4650 }, { "epoch": 1.2643123938879457, - "grad_norm": 0.0648174062371254, + "grad_norm": 0.07534375041723251, "learning_rate": 3.1107377132683676e-05, - "loss": 0.0234, - "num_input_tokens_seen": 35065360, + "loss": 0.0238, + "num_input_tokens_seen": 35389600, "step": 4655 }, { "epoch": 1.2656706281833616, - "grad_norm": 0.11082839220762253, + "grad_norm": 0.10303278267383575, "learning_rate": 3.107288752205208e-05, - "loss": 0.0195, - "num_input_tokens_seen": 35099488, + "loss": 0.0199, + "num_input_tokens_seen": 35423856, "step": 4660 }, { "epoch": 1.2670288624787776, - "grad_norm": 0.06944713741540909, + "grad_norm": 0.13352227210998535, "learning_rate": 3.103838562399846e-05, - "loss": 0.0224, - "num_input_tokens_seen": 35141584, + "loss": 0.0229, + "num_input_tokens_seen": 35466304, "step": 4665 }, { "epoch": 1.2683870967741935, - "grad_norm": 0.11591637134552002, + "grad_norm": 0.09006495028734207, "learning_rate": 3.10038715083314e-05, - "loss": 0.0209, - "num_input_tokens_seen": 35181056, + "loss": 0.0213, + "num_input_tokens_seen": 35506144, "step": 4670 }, { "epoch": 1.2697453310696094, - "grad_norm": 0.13415029644966125, + "grad_norm": 0.10887410491704941, "learning_rate": 3.096934524488411e-05, - "loss": 0.0251, - "num_input_tokens_seen": 35219776, + "loss": 0.0232, + "num_input_tokens_seen": 35544960, "step": 4675 }, { "epoch": 1.2711035653650256, - "grad_norm": 0.06338277459144592, + "grad_norm": 0.069489486515522, "learning_rate": 3.093480690351444e-05, - "loss": 0.0227, - "num_input_tokens_seen": 35265536, + "loss": 0.0229, + "num_input_tokens_seen": 35591680, "step": 4680 }, { "epoch": 1.2724617996604415, - "grad_norm": 0.0781097263097763, + "grad_norm": 0.07250406593084335, "learning_rate": 3.090025655410468e-05, - "loss": 0.021, - "num_input_tokens_seen": 35305904, + "loss": 0.0208, + "num_input_tokens_seen": 35632528, "step": 4685 }, { "epoch": 1.2738200339558574, - "grad_norm": 0.09402786940336227, + "grad_norm": 0.24474364519119263, "learning_rate": 3.086569426656137e-05, - "loss": 0.0236, - "num_input_tokens_seen": 35342736, + "loss": 0.0245, + "num_input_tokens_seen": 35669760, "step": 4690 }, { "epoch": 1.2751782682512733, - "grad_norm": 0.10017075389623642, + "grad_norm": 0.09078273177146912, "learning_rate": 3.083112011081526e-05, - "loss": 0.024, - "num_input_tokens_seen": 35374048, + "loss": 0.0237, + "num_input_tokens_seen": 35702128, "step": 4695 }, { "epoch": 1.2765365025466893, - "grad_norm": 0.06926335394382477, + "grad_norm": 0.18370278179645538, "learning_rate": 3.0796534156821064e-05, - "loss": 0.0217, - "num_input_tokens_seen": 35407824, + "loss": 0.0226, + "num_input_tokens_seen": 35736304, "step": 4700 }, { "epoch": 1.2778947368421052, - "grad_norm": 0.10419530421495438, + "grad_norm": 0.30150753259658813, "learning_rate": 3.07619364745574e-05, - "loss": 0.0228, - "num_input_tokens_seen": 35444416, + "loss": 0.0227, + "num_input_tokens_seen": 35773376, "step": 4705 }, { "epoch": 1.2792529711375211, - "grad_norm": 0.07367909699678421, + "grad_norm": 0.11203360557556152, "learning_rate": 3.072732713402659e-05, - "loss": 0.0257, - "num_input_tokens_seen": 35485056, + "loss": 0.0289, + "num_input_tokens_seen": 35814544, "step": 4710 }, { "epoch": 1.2806112054329373, - "grad_norm": 0.07311346381902695, + "grad_norm": 0.07778805494308472, "learning_rate": 3.069270620525459e-05, - "loss": 0.0234, - "num_input_tokens_seen": 35526864, + "loss": 0.0247, + "num_input_tokens_seen": 35856944, "step": 4715 }, { "epoch": 1.2819694397283532, - "grad_norm": 0.05829387903213501, + "grad_norm": 0.14443954825401306, "learning_rate": 3.065807375829075e-05, - "loss": 0.0205, - "num_input_tokens_seen": 35565744, + "loss": 0.0235, + "num_input_tokens_seen": 35895856, "step": 4720 }, { "epoch": 1.2833276740237691, - "grad_norm": 0.07078757882118225, + "grad_norm": 0.12875258922576904, "learning_rate": 3.062342986320775e-05, - "loss": 0.0252, - "num_input_tokens_seen": 35597056, + "loss": 0.0251, + "num_input_tokens_seen": 35927408, "step": 4725 }, { "epoch": 1.284685908319185, - "grad_norm": 0.08261621743440628, + "grad_norm": 0.11114617437124252, "learning_rate": 3.058877459010143e-05, - "loss": 0.0254, - "num_input_tokens_seen": 35633552, + "loss": 0.026, + "num_input_tokens_seen": 35964848, "step": 4730 }, { "epoch": 1.286044142614601, - "grad_norm": 0.06505361199378967, + "grad_norm": 0.06739533692598343, "learning_rate": 3.055410800909067e-05, - "loss": 0.0268, - "num_input_tokens_seen": 35670032, + "loss": 0.0261, + "num_input_tokens_seen": 36002208, "step": 4735 }, { "epoch": 1.287402376910017, - "grad_norm": 0.06634906679391861, + "grad_norm": 0.2025603950023651, "learning_rate": 3.051943019031719e-05, - "loss": 0.0226, - "num_input_tokens_seen": 35713824, + "loss": 0.0246, + "num_input_tokens_seen": 36046192, "step": 4740 }, { "epoch": 1.2887606112054328, - "grad_norm": 0.07260783761739731, + "grad_norm": 0.10321507602930069, "learning_rate": 3.0484741203945478e-05, - "loss": 0.024, - "num_input_tokens_seen": 35747808, + "loss": 0.0242, + "num_input_tokens_seen": 36080720, "step": 4745 }, { "epoch": 1.290118845500849, - "grad_norm": 0.546751081943512, + "grad_norm": 0.2463771104812622, "learning_rate": 3.0450041120162615e-05, - "loss": 0.0228, - "num_input_tokens_seen": 35784368, + "loss": 0.0238, + "num_input_tokens_seen": 36117904, "step": 4750 }, { "epoch": 1.291477079796265, - "grad_norm": 0.07333123683929443, + "grad_norm": 0.3788740038871765, "learning_rate": 3.0415330009178116e-05, - "loss": 0.0239, - "num_input_tokens_seen": 35826704, + "loss": 0.0248, + "num_input_tokens_seen": 36161072, "step": 4755 }, { "epoch": 1.2928353140916808, - "grad_norm": 0.08109811693429947, + "grad_norm": 0.11892661452293396, "learning_rate": 3.038060794122384e-05, "loss": 0.0239, - "num_input_tokens_seen": 35862048, + "num_input_tokens_seen": 36196512, "step": 4760 }, { "epoch": 1.2941935483870968, - "grad_norm": 0.07651472836732864, + "grad_norm": 0.0829126238822937, "learning_rate": 3.034587498655378e-05, - "loss": 0.0203, - "num_input_tokens_seen": 35897600, + "loss": 0.0202, + "num_input_tokens_seen": 36233024, "step": 4765 }, { "epoch": 1.2955517826825127, - "grad_norm": 0.07519669830799103, + "grad_norm": 0.08492469787597656, "learning_rate": 3.0311131215443996e-05, - "loss": 0.0227, - "num_input_tokens_seen": 35934912, + "loss": 0.0222, + "num_input_tokens_seen": 36270416, "step": 4770 }, { "epoch": 1.2969100169779286, - "grad_norm": 0.14508254826068878, + "grad_norm": 0.07208302617073059, "learning_rate": 3.0276376698192384e-05, - "loss": 0.0228, - "num_input_tokens_seen": 35973552, + "loss": 0.0222, + "num_input_tokens_seen": 36309408, "step": 4775 }, { "epoch": 1.2982682512733446, - "grad_norm": 0.08745962381362915, + "grad_norm": 0.1342477947473526, "learning_rate": 3.0241611505118617e-05, - "loss": 0.0245, - "num_input_tokens_seen": 36009872, + "loss": 0.0235, + "num_input_tokens_seen": 36346096, "step": 4780 }, { "epoch": 1.2996264855687607, - "grad_norm": 0.07654950022697449, + "grad_norm": 0.09654390811920166, "learning_rate": 3.0206835706563974e-05, - "loss": 0.028, - "num_input_tokens_seen": 36047872, + "loss": 0.027, + "num_input_tokens_seen": 36384464, "step": 4785 }, { "epoch": 1.3009847198641766, - "grad_norm": 0.06679020822048187, + "grad_norm": 0.07336565107107162, "learning_rate": 3.017204937289117e-05, - "loss": 0.029, - "num_input_tokens_seen": 36085856, + "loss": 0.0259, + "num_input_tokens_seen": 36422800, "step": 4790 }, { "epoch": 1.3023429541595926, - "grad_norm": 0.1162283793091774, + "grad_norm": 0.07291150093078613, "learning_rate": 3.0137252574484242e-05, - "loss": 0.0221, - "num_input_tokens_seen": 36122016, + "loss": 0.0217, + "num_input_tokens_seen": 36459248, "step": 4795 }, { "epoch": 1.3037011884550085, - "grad_norm": 0.10590288043022156, + "grad_norm": 0.30311334133148193, "learning_rate": 3.0102445381748413e-05, - "loss": 0.0222, - "num_input_tokens_seen": 36160352, + "loss": 0.0211, + "num_input_tokens_seen": 36497120, "step": 4800 }, { "epoch": 1.3050594227504244, - "grad_norm": 0.07276478409767151, + "grad_norm": 0.3032911717891693, "learning_rate": 3.0067627865109925e-05, - "loss": 0.0254, - "num_input_tokens_seen": 36199232, + "loss": 0.024, + "num_input_tokens_seen": 36536048, "step": 4805 }, { "epoch": 1.3064176570458403, - "grad_norm": 0.0886823832988739, + "grad_norm": 0.3467036783695221, "learning_rate": 3.0032800095015916e-05, - "loss": 0.0219, - "num_input_tokens_seen": 36236640, + "loss": 0.0226, + "num_input_tokens_seen": 36573696, "step": 4810 }, { "epoch": 1.3077758913412563, - "grad_norm": 0.06619875878095627, + "grad_norm": 0.06864161789417267, "learning_rate": 2.9997962141934254e-05, - "loss": 0.0259, - "num_input_tokens_seen": 36268192, + "loss": 0.0275, + "num_input_tokens_seen": 36606352, "step": 4815 }, { "epoch": 1.3091341256366724, - "grad_norm": 0.07683831453323364, + "grad_norm": 0.07447998970746994, "learning_rate": 2.9963114076353433e-05, - "loss": 0.0257, - "num_input_tokens_seen": 36307744, + "loss": 0.0274, + "num_input_tokens_seen": 36645936, "step": 4820 }, { "epoch": 1.3104923599320883, - "grad_norm": 0.08424662798643112, + "grad_norm": 0.08919472992420197, "learning_rate": 2.9928255968782393e-05, - "loss": 0.0219, - "num_input_tokens_seen": 36348240, + "loss": 0.0216, + "num_input_tokens_seen": 36686912, "step": 4825 }, { "epoch": 1.3118505942275043, - "grad_norm": 0.07858245819807053, + "grad_norm": 0.07332591712474823, "learning_rate": 2.9893387889750396e-05, - "loss": 0.0227, - "num_input_tokens_seen": 36390400, + "loss": 0.0225, + "num_input_tokens_seen": 36729200, "step": 4830 }, { "epoch": 1.3132088285229202, - "grad_norm": 0.06554257124662399, + "grad_norm": 0.15131472051143646, "learning_rate": 2.985850990980688e-05, - "loss": 0.0247, - "num_input_tokens_seen": 36425584, + "loss": 0.024, + "num_input_tokens_seen": 36764752, "step": 4835 }, { "epoch": 1.3145670628183361, - "grad_norm": 0.23156918585300446, + "grad_norm": 0.07137975096702576, "learning_rate": 2.982362209952132e-05, - "loss": 0.0277, - "num_input_tokens_seen": 36458848, + "loss": 0.0263, + "num_input_tokens_seen": 36798128, "step": 4840 }, { "epoch": 1.315925297113752, - "grad_norm": 0.06257003545761108, + "grad_norm": 0.12470098584890366, "learning_rate": 2.978872452948307e-05, - "loss": 0.0258, - "num_input_tokens_seen": 36498800, + "loss": 0.0255, + "num_input_tokens_seen": 36838256, "step": 4845 }, { "epoch": 1.317283531409168, - "grad_norm": 0.09758147597312927, + "grad_norm": 0.14696817100048065, "learning_rate": 2.9753817270301244e-05, - "loss": 0.0232, - "num_input_tokens_seen": 36532912, + "loss": 0.0196, + "num_input_tokens_seen": 36872240, "step": 4850 }, { "epoch": 1.3186417657045841, - "grad_norm": 0.19192613661289215, + "grad_norm": 0.12289588898420334, "learning_rate": 2.9718900392604553e-05, - "loss": 0.0207, - "num_input_tokens_seen": 36569376, + "loss": 0.0196, + "num_input_tokens_seen": 36909568, "step": 4855 }, { "epoch": 1.32, - "grad_norm": 0.07223650813102722, + "grad_norm": 0.09164445847272873, "learning_rate": 2.9683973967041167e-05, - "loss": 0.0224, - "num_input_tokens_seen": 36601808, + "loss": 0.0227, + "num_input_tokens_seen": 36942512, "step": 4860 }, { "epoch": 1.321358234295416, - "grad_norm": 0.1284734308719635, + "grad_norm": 0.08058694750070572, "learning_rate": 2.9649038064278583e-05, - "loss": 0.0215, - "num_input_tokens_seen": 36649152, + "loss": 0.0214, + "num_input_tokens_seen": 36990240, "step": 4865 }, { "epoch": 1.322716468590832, - "grad_norm": 0.05682654306292534, + "grad_norm": 0.05822000280022621, "learning_rate": 2.9614092755003458e-05, - "loss": 0.0254, - "num_input_tokens_seen": 36688928, + "loss": 0.024, + "num_input_tokens_seen": 37029952, "step": 4870 }, { "epoch": 1.3240747028862478, - "grad_norm": 0.08648929744958878, + "grad_norm": 0.0980505719780922, "learning_rate": 2.9579138109921505e-05, "loss": 0.0238, - "num_input_tokens_seen": 36726960, + "num_input_tokens_seen": 37068688, "step": 4875 }, { "epoch": 1.3254329371816638, - "grad_norm": 0.0657745748758316, + "grad_norm": 0.07445904612541199, "learning_rate": 2.95441741997573e-05, - "loss": 0.0232, - "num_input_tokens_seen": 36766592, + "loss": 0.0235, + "num_input_tokens_seen": 37108896, "step": 4880 }, { "epoch": 1.3267911714770797, - "grad_norm": 0.07110031694173813, + "grad_norm": 0.0844590812921524, "learning_rate": 2.9509201095254196e-05, "loss": 0.0245, - "num_input_tokens_seen": 36802816, + "num_input_tokens_seen": 37145376, "step": 4885 }, { "epoch": 1.3281494057724959, - "grad_norm": 0.13777613639831543, + "grad_norm": 0.07354225218296051, "learning_rate": 2.9474218867174112e-05, - "loss": 0.0224, - "num_input_tokens_seen": 36845952, + "loss": 0.0227, + "num_input_tokens_seen": 37188768, "step": 4890 }, { "epoch": 1.3295076400679118, - "grad_norm": 0.07218018174171448, + "grad_norm": 0.07825478911399841, "learning_rate": 2.9439227586297464e-05, - "loss": 0.0248, - "num_input_tokens_seen": 36888768, + "loss": 0.0241, + "num_input_tokens_seen": 37231776, "step": 4895 }, { "epoch": 1.3308658743633277, - "grad_norm": 0.11915044486522675, + "grad_norm": 0.08993028849363327, "learning_rate": 2.940422732342295e-05, - "loss": 0.0234, - "num_input_tokens_seen": 36930912, + "loss": 0.0225, + "num_input_tokens_seen": 37274144, "step": 4900 }, { "epoch": 1.3322241086587436, - "grad_norm": 0.21373873949050903, + "grad_norm": 0.09865900874137878, "learning_rate": 2.9369218149367476e-05, - "loss": 0.0218, - "num_input_tokens_seen": 36974448, + "loss": 0.0221, + "num_input_tokens_seen": 37317936, "step": 4905 }, { "epoch": 1.3335823429541596, - "grad_norm": 0.07199373096227646, + "grad_norm": 0.07177091389894485, "learning_rate": 2.933420013496595e-05, - "loss": 0.0195, - "num_input_tokens_seen": 37010224, + "loss": 0.0193, + "num_input_tokens_seen": 37354416, "step": 4910 }, { "epoch": 1.3349405772495755, - "grad_norm": 0.06123640388250351, + "grad_norm": 0.06367233395576477, "learning_rate": 2.929917335107118e-05, - "loss": 0.022, - "num_input_tokens_seen": 37048496, + "loss": 0.0211, + "num_input_tokens_seen": 37392768, "step": 4915 }, { "epoch": 1.3362988115449914, - "grad_norm": 0.07031592726707458, + "grad_norm": 0.07192708551883698, "learning_rate": 2.9264137868553714e-05, - "loss": 0.0212, - "num_input_tokens_seen": 37089312, + "loss": 0.0204, + "num_input_tokens_seen": 37434000, "step": 4920 }, { "epoch": 1.3376570458404076, - "grad_norm": 0.15033331513404846, + "grad_norm": 0.11259570717811584, "learning_rate": 2.92290937583017e-05, - "loss": 0.0237, - "num_input_tokens_seen": 37126944, + "loss": 0.0222, + "num_input_tokens_seen": 37471984, "step": 4925 }, { "epoch": 1.3390152801358235, - "grad_norm": 0.05869729444384575, + "grad_norm": 0.08926200866699219, "learning_rate": 2.9194041091220753e-05, - "loss": 0.0235, - "num_input_tokens_seen": 37171504, + "loss": 0.023, + "num_input_tokens_seen": 37517120, "step": 4930 }, { "epoch": 1.3403735144312394, - "grad_norm": 0.0648658350110054, + "grad_norm": 0.06523467600345612, "learning_rate": 2.9158979938233793e-05, - "loss": 0.023, - "num_input_tokens_seen": 37209408, + "loss": 0.0218, + "num_input_tokens_seen": 37555088, "step": 4935 }, { "epoch": 1.3417317487266553, - "grad_norm": 0.06643757969141006, + "grad_norm": 0.09050670266151428, "learning_rate": 2.9123910370280895e-05, - "loss": 0.0226, - "num_input_tokens_seen": 37244864, + "loss": 0.0252, + "num_input_tokens_seen": 37589936, "step": 4940 }, { "epoch": 1.3430899830220713, - "grad_norm": 0.06915071606636047, + "grad_norm": 0.08106379210948944, "learning_rate": 2.9088832458319183e-05, - "loss": 0.0227, - "num_input_tokens_seen": 37285200, + "loss": 0.0224, + "num_input_tokens_seen": 37630320, "step": 4945 }, { "epoch": 1.3444482173174872, - "grad_norm": 0.0631452426314354, + "grad_norm": 0.06575528532266617, "learning_rate": 2.9053746273322657e-05, - "loss": 0.026, - "num_input_tokens_seen": 37320944, + "loss": 0.0257, + "num_input_tokens_seen": 37666144, "step": 4950 }, { "epoch": 1.3458064516129031, - "grad_norm": 0.06560534238815308, + "grad_norm": 0.06512979418039322, "learning_rate": 2.901865188628205e-05, - "loss": 0.0223, - "num_input_tokens_seen": 37355328, + "loss": 0.023, + "num_input_tokens_seen": 37701392, "step": 4955 }, { "epoch": 1.3471646859083193, - "grad_norm": 0.06297048926353455, + "grad_norm": 0.062344662845134735, "learning_rate": 2.89835493682047e-05, - "loss": 0.0261, - "num_input_tokens_seen": 37393936, + "loss": 0.0252, + "num_input_tokens_seen": 37740384, "step": 4960 }, { "epoch": 1.3485229202037352, - "grad_norm": 0.07204825431108475, + "grad_norm": 0.1797160655260086, "learning_rate": 2.894843879011438e-05, - "loss": 0.0203, - "num_input_tokens_seen": 37430992, + "loss": 0.02, + "num_input_tokens_seen": 37777712, "step": 4965 }, { "epoch": 1.3498811544991511, - "grad_norm": 0.0705382451415062, + "grad_norm": 0.07120561599731445, "learning_rate": 2.89133202230512e-05, - "loss": 0.0198, - "num_input_tokens_seen": 37471536, + "loss": 0.0193, + "num_input_tokens_seen": 37818624, "step": 4970 }, { "epoch": 1.351239388794567, - "grad_norm": 0.16061186790466309, + "grad_norm": 0.0576791875064373, "learning_rate": 2.8878193738071403e-05, - "loss": 0.0245, - "num_input_tokens_seen": 37511744, + "loss": 0.0232, + "num_input_tokens_seen": 37858880, "step": 4975 }, { "epoch": 1.352597623089983, - "grad_norm": 0.2348761260509491, + "grad_norm": 0.2292705774307251, "learning_rate": 2.8843059406247264e-05, - "loss": 0.0257, - "num_input_tokens_seen": 37549024, + "loss": 0.0249, + "num_input_tokens_seen": 37895776, "step": 4980 }, { "epoch": 1.353955857385399, - "grad_norm": 0.07010455429553986, + "grad_norm": 0.07271228730678558, "learning_rate": 2.8807917298666943e-05, - "loss": 0.0251, - "num_input_tokens_seen": 37589312, + "loss": 0.0242, + "num_input_tokens_seen": 37936416, "step": 4985 }, { "epoch": 1.3553140916808148, - "grad_norm": 0.07429832965135574, + "grad_norm": 0.0720776617527008, "learning_rate": 2.8772767486434328e-05, - "loss": 0.0253, - "num_input_tokens_seen": 37629344, + "loss": 0.0238, + "num_input_tokens_seen": 37976640, "step": 4990 }, { "epoch": 1.356672325976231, - "grad_norm": 0.26207640767097473, + "grad_norm": 0.08328072726726532, "learning_rate": 2.8737610040668893e-05, - "loss": 0.0228, - "num_input_tokens_seen": 37667392, + "loss": 0.0211, + "num_input_tokens_seen": 38013984, "step": 4995 }, { "epoch": 1.358030560271647, - "grad_norm": 0.06545676290988922, + "grad_norm": 0.06632568687200546, "learning_rate": 2.8702445032505555e-05, - "loss": 0.0233, - "num_input_tokens_seen": 37705728, + "loss": 0.0224, + "num_input_tokens_seen": 38052864, "step": 5000 }, { "epoch": 1.3593887945670629, - "grad_norm": 0.0644008219242096, + "grad_norm": 0.0645761787891388, "learning_rate": 2.8667272533094547e-05, - "loss": 0.0222, - "num_input_tokens_seen": 37743984, + "loss": 0.0217, + "num_input_tokens_seen": 38091024, "step": 5005 }, { "epoch": 1.3607470288624788, - "grad_norm": 0.08785516023635864, + "grad_norm": 0.08532677590847015, "learning_rate": 2.8632092613601236e-05, - "loss": 0.0281, - "num_input_tokens_seen": 37776448, + "loss": 0.028, + "num_input_tokens_seen": 38123696, "step": 5010 }, { "epoch": 1.3621052631578947, - "grad_norm": 0.07046717405319214, + "grad_norm": 0.07145427912473679, "learning_rate": 2.8596905345206015e-05, - "loss": 0.0211, - "num_input_tokens_seen": 37816128, + "loss": 0.0202, + "num_input_tokens_seen": 38164000, "step": 5015 }, { "epoch": 1.3634634974533106, - "grad_norm": 0.07592365145683289, + "grad_norm": 0.07960017025470734, "learning_rate": 2.8561710799104152e-05, - "loss": 0.0226, - "num_input_tokens_seen": 37856272, + "loss": 0.0225, + "num_input_tokens_seen": 38204496, "step": 5020 }, { "epoch": 1.3648217317487266, - "grad_norm": 0.07739284634590149, + "grad_norm": 0.07290725409984589, "learning_rate": 2.852650904650563e-05, - "loss": 0.0209, - "num_input_tokens_seen": 37893664, + "loss": 0.0204, + "num_input_tokens_seen": 38241904, "step": 5025 }, { "epoch": 1.3661799660441427, - "grad_norm": 0.06753869354724884, + "grad_norm": 0.08071789145469666, "learning_rate": 2.8491300158635013e-05, - "loss": 0.026, - "num_input_tokens_seen": 37934752, + "loss": 0.0266, + "num_input_tokens_seen": 38283488, "step": 5030 }, { "epoch": 1.3675382003395586, - "grad_norm": 0.08525117486715317, + "grad_norm": 0.08796056360006332, "learning_rate": 2.8456084206731326e-05, - "loss": 0.0225, - "num_input_tokens_seen": 37973840, + "loss": 0.0223, + "num_input_tokens_seen": 38322960, "step": 5035 }, { "epoch": 1.3688964346349746, - "grad_norm": 0.0558052584528923, + "grad_norm": 0.059795767068862915, "learning_rate": 2.8420861262047837e-05, - "loss": 0.0264, - "num_input_tokens_seen": 38011392, + "loss": 0.0258, + "num_input_tokens_seen": 38361728, "step": 5040 }, { "epoch": 1.3702546689303905, - "grad_norm": 0.05697003751993179, + "grad_norm": 0.10983120650053024, "learning_rate": 2.8385631395852008e-05, - "loss": 0.0227, - "num_input_tokens_seen": 38046048, + "loss": 0.0222, + "num_input_tokens_seen": 38396336, "step": 5045 }, { "epoch": 1.3716129032258064, - "grad_norm": 0.12278226763010025, + "grad_norm": 0.08720116317272186, "learning_rate": 2.8350394679425292e-05, - "loss": 0.0264, - "num_input_tokens_seen": 38083888, + "loss": 0.0247, + "num_input_tokens_seen": 38434112, "step": 5050 }, { "epoch": 1.3729711375212224, - "grad_norm": 0.06974665820598602, + "grad_norm": 0.06760919839143753, "learning_rate": 2.8315151184062992e-05, - "loss": 0.0212, - "num_input_tokens_seen": 38121520, + "loss": 0.0207, + "num_input_tokens_seen": 38472272, "step": 5055 }, { "epoch": 1.3743293718166383, - "grad_norm": 0.07815779000520706, + "grad_norm": 0.0700531154870987, "learning_rate": 2.8279900981074143e-05, - "loss": 0.0236, - "num_input_tokens_seen": 38162752, + "loss": 0.0227, + "num_input_tokens_seen": 38514144, "step": 5060 }, { "epoch": 1.3756876061120544, - "grad_norm": 0.07380659133195877, + "grad_norm": 0.07428401708602905, "learning_rate": 2.824464414178134e-05, - "loss": 0.0225, - "num_input_tokens_seen": 38201024, + "loss": 0.0219, + "num_input_tokens_seen": 38552864, "step": 5065 }, { "epoch": 1.3770458404074704, - "grad_norm": 0.06875959783792496, + "grad_norm": 0.06985628604888916, "learning_rate": 2.8209380737520613e-05, - "loss": 0.022, - "num_input_tokens_seen": 38235872, + "loss": 0.0212, + "num_input_tokens_seen": 38587952, "step": 5070 }, { "epoch": 1.3784040747028863, - "grad_norm": 0.3194339871406555, + "grad_norm": 0.06999427825212479, "learning_rate": 2.8174110839641266e-05, - "loss": 0.022, - "num_input_tokens_seen": 38273552, + "loss": 0.0205, + "num_input_tokens_seen": 38625808, "step": 5075 }, { "epoch": 1.3797623089983022, - "grad_norm": 0.06795497983694077, + "grad_norm": 0.07771144807338715, "learning_rate": 2.8138834519505742e-05, - "loss": 0.0239, - "num_input_tokens_seen": 38308704, + "loss": 0.0232, + "num_input_tokens_seen": 38661504, "step": 5080 }, { "epoch": 1.3811205432937181, - "grad_norm": 0.053568679839372635, + "grad_norm": 0.05541161075234413, "learning_rate": 2.810355184848949e-05, - "loss": 0.0223, - "num_input_tokens_seen": 38344672, + "loss": 0.0217, + "num_input_tokens_seen": 38697584, "step": 5085 }, { "epoch": 1.382478777589134, - "grad_norm": 0.07265440374612808, + "grad_norm": 0.0726446881890297, "learning_rate": 2.8068262897980803e-05, - "loss": 0.0229, - "num_input_tokens_seen": 38381424, + "loss": 0.0218, + "num_input_tokens_seen": 38735280, "step": 5090 }, { "epoch": 1.38383701188455, - "grad_norm": 0.12700700759887695, + "grad_norm": 0.07729476690292358, "learning_rate": 2.8032967739380678e-05, - "loss": 0.0217, - "num_input_tokens_seen": 38418240, + "loss": 0.0224, + "num_input_tokens_seen": 38772256, "step": 5095 }, { "epoch": 1.3851952461799661, - "grad_norm": 0.304447203874588, + "grad_norm": 0.069065622985363, "learning_rate": 2.7997666444102673e-05, - "loss": 0.0253, - "num_input_tokens_seen": 38454752, + "loss": 0.0229, + "num_input_tokens_seen": 38809744, "step": 5100 }, { "epoch": 1.386553480475382, - "grad_norm": 0.06393246352672577, + "grad_norm": 0.06344050914049149, "learning_rate": 2.7962359083572763e-05, - "loss": 0.0215, - "num_input_tokens_seen": 38494960, + "loss": 0.0201, + "num_input_tokens_seen": 38850224, "step": 5105 }, { "epoch": 1.387911714770798, - "grad_norm": 0.08064297586679459, + "grad_norm": 0.09007630497217178, "learning_rate": 2.7927045729229196e-05, - "loss": 0.0258, - "num_input_tokens_seen": 38538256, + "loss": 0.022, + "num_input_tokens_seen": 38893072, "step": 5110 }, { "epoch": 1.389269949066214, - "grad_norm": 0.07285715639591217, + "grad_norm": 0.060988880693912506, "learning_rate": 2.7891726452522338e-05, - "loss": 0.0223, - "num_input_tokens_seen": 38571968, + "loss": 0.0211, + "num_input_tokens_seen": 38926800, "step": 5115 }, { "epoch": 1.3906281833616299, - "grad_norm": 0.08592525869607925, + "grad_norm": 0.06315572559833527, "learning_rate": 2.7856401324914557e-05, - "loss": 0.0251, - "num_input_tokens_seen": 38608352, + "loss": 0.0235, + "num_input_tokens_seen": 38963632, "step": 5120 }, { "epoch": 1.3919864176570458, - "grad_norm": 0.07992704957723618, + "grad_norm": 0.1995934545993805, "learning_rate": 2.782107041788004e-05, - "loss": 0.0223, - "num_input_tokens_seen": 38649056, + "loss": 0.0209, + "num_input_tokens_seen": 39004832, "step": 5125 }, { "epoch": 1.3933446519524617, - "grad_norm": 0.09133918583393097, + "grad_norm": 0.08420293778181076, "learning_rate": 2.778573380290469e-05, - "loss": 0.0308, - "num_input_tokens_seen": 38683264, + "loss": 0.0283, + "num_input_tokens_seen": 39038880, "step": 5130 }, { "epoch": 1.3947028862478779, - "grad_norm": 0.1600920855998993, + "grad_norm": 0.0772460401058197, "learning_rate": 2.7750391551485927e-05, - "loss": 0.0242, - "num_input_tokens_seen": 38725584, + "loss": 0.0228, + "num_input_tokens_seen": 39081008, "step": 5135 }, { "epoch": 1.3960611205432938, - "grad_norm": 0.07070513069629669, + "grad_norm": 0.25229960680007935, "learning_rate": 2.7715043735132613e-05, - "loss": 0.0242, - "num_input_tokens_seen": 38764688, + "loss": 0.0234, + "num_input_tokens_seen": 39120576, "step": 5140 }, { "epoch": 1.3974193548387097, - "grad_norm": 0.07663524150848389, + "grad_norm": 0.06817038357257843, "learning_rate": 2.767969042536483e-05, - "loss": 0.0205, - "num_input_tokens_seen": 38801888, + "loss": 0.0198, + "num_input_tokens_seen": 39158320, "step": 5145 }, { "epoch": 1.3987775891341256, - "grad_norm": 0.14459607005119324, + "grad_norm": 0.07609429955482483, "learning_rate": 2.764433169371381e-05, - "loss": 0.0246, - "num_input_tokens_seen": 38839616, + "loss": 0.0222, + "num_input_tokens_seen": 39196256, "step": 5150 }, { "epoch": 1.4001358234295416, - "grad_norm": 0.0776137188076973, + "grad_norm": 0.3237346410751343, "learning_rate": 2.7608967611721732e-05, - "loss": 0.0257, - "num_input_tokens_seen": 38874160, + "loss": 0.0254, + "num_input_tokens_seen": 39230704, "step": 5155 }, { "epoch": 1.4014940577249575, - "grad_norm": 0.061741046607494354, + "grad_norm": 0.08231765031814575, "learning_rate": 2.7573598250941613e-05, - "loss": 0.0274, - "num_input_tokens_seen": 38908352, + "loss": 0.0252, + "num_input_tokens_seen": 39265280, "step": 5160 }, { "epoch": 1.4028522920203734, - "grad_norm": 0.22620877623558044, + "grad_norm": 0.2521565854549408, "learning_rate": 2.7538223682937142e-05, - "loss": 0.024, - "num_input_tokens_seen": 38944016, + "loss": 0.0242, + "num_input_tokens_seen": 39300912, "step": 5165 }, { "epoch": 1.4042105263157896, - "grad_norm": 0.07021656632423401, + "grad_norm": 0.110670305788517, "learning_rate": 2.750284397928256e-05, - "loss": 0.0271, - "num_input_tokens_seen": 38981824, + "loss": 0.0258, + "num_input_tokens_seen": 39339120, "step": 5170 }, { "epoch": 1.4055687606112055, - "grad_norm": 0.07928325980901718, + "grad_norm": 0.08039861917495728, "learning_rate": 2.7467459211562473e-05, - "loss": 0.0209, - "num_input_tokens_seen": 39023488, + "loss": 0.0207, + "num_input_tokens_seen": 39381344, "step": 5175 }, { "epoch": 1.4069269949066214, - "grad_norm": 0.11648694425821304, + "grad_norm": 0.12607116997241974, "learning_rate": 2.743206945137176e-05, - "loss": 0.0266, - "num_input_tokens_seen": 39060320, + "loss": 0.0263, + "num_input_tokens_seen": 39418288, "step": 5180 }, { "epoch": 1.4082852292020374, - "grad_norm": 0.06782970577478409, + "grad_norm": 0.06207061931490898, "learning_rate": 2.7396674770315378e-05, - "loss": 0.0207, - "num_input_tokens_seen": 39094800, + "loss": 0.0193, + "num_input_tokens_seen": 39453632, "step": 5185 }, { "epoch": 1.4096434634974533, - "grad_norm": 0.08282934874296188, + "grad_norm": 0.14125891029834747, "learning_rate": 2.736127524000826e-05, - "loss": 0.0224, - "num_input_tokens_seen": 39137216, + "loss": 0.021, + "num_input_tokens_seen": 39496272, "step": 5190 }, { "epoch": 1.4110016977928692, - "grad_norm": 0.06651698052883148, + "grad_norm": 0.18058492243289948, "learning_rate": 2.7325870932075143e-05, - "loss": 0.0228, - "num_input_tokens_seen": 39172336, + "loss": 0.0214, + "num_input_tokens_seen": 39531664, "step": 5195 }, { "epoch": 1.4123599320882851, - "grad_norm": 0.14998948574066162, + "grad_norm": 0.06172385439276695, "learning_rate": 2.7290461918150422e-05, - "loss": 0.0188, - "num_input_tokens_seen": 39206128, + "loss": 0.0193, + "num_input_tokens_seen": 39565776, "step": 5200 }, { "epoch": 1.4137181663837013, - "grad_norm": 0.084775410592556, + "grad_norm": 0.07556865364313126, "learning_rate": 2.7255048269878036e-05, - "loss": 0.0222, - "num_input_tokens_seen": 39240160, + "loss": 0.022, + "num_input_tokens_seen": 39600000, "step": 5205 }, { "epoch": 1.4150764006791172, - "grad_norm": 0.06739503890275955, + "grad_norm": 0.06654740124940872, "learning_rate": 2.7219630058911265e-05, - "loss": 0.0213, - "num_input_tokens_seen": 39278368, + "loss": 0.0202, + "num_input_tokens_seen": 39638864, "step": 5210 }, { "epoch": 1.4164346349745331, - "grad_norm": 0.08495637029409409, + "grad_norm": 0.1819053441286087, "learning_rate": 2.7184207356912662e-05, - "loss": 0.021, - "num_input_tokens_seen": 39317120, + "loss": 0.0208, + "num_input_tokens_seen": 39678096, "step": 5215 }, { "epoch": 1.417792869269949, - "grad_norm": 0.08469637483358383, + "grad_norm": 0.07664623856544495, "learning_rate": 2.714878023555384e-05, - "loss": 0.0286, - "num_input_tokens_seen": 39356624, + "loss": 0.0259, + "num_input_tokens_seen": 39717664, "step": 5220 }, { "epoch": 1.419151103565365, - "grad_norm": 0.10826413333415985, + "grad_norm": 0.06771133095026016, "learning_rate": 2.7113348766515355e-05, - "loss": 0.0193, - "num_input_tokens_seen": 39393328, + "loss": 0.018, + "num_input_tokens_seen": 39754272, "step": 5225 }, { "epoch": 1.420509337860781, - "grad_norm": 0.08049986511468887, + "grad_norm": 0.0725821778178215, "learning_rate": 2.7077913021486572e-05, - "loss": 0.0241, - "num_input_tokens_seen": 39433952, + "loss": 0.0233, + "num_input_tokens_seen": 39795520, "step": 5230 }, { "epoch": 1.4218675721561969, - "grad_norm": 0.13227759301662445, + "grad_norm": 0.058037035167217255, "learning_rate": 2.7042473072165503e-05, - "loss": 0.0231, - "num_input_tokens_seen": 39472448, + "loss": 0.0223, + "num_input_tokens_seen": 39834256, "step": 5235 }, { "epoch": 1.423225806451613, - "grad_norm": 0.06735458970069885, + "grad_norm": 0.06517830491065979, "learning_rate": 2.700702899025867e-05, - "loss": 0.0213, - "num_input_tokens_seen": 39508224, + "loss": 0.0215, + "num_input_tokens_seen": 39870272, "step": 5240 }, { "epoch": 1.424584040747029, - "grad_norm": 0.06840696185827255, + "grad_norm": 0.072090283036232, "learning_rate": 2.6971580847480936e-05, - "loss": 0.0239, - "num_input_tokens_seen": 39546528, + "loss": 0.0228, + "num_input_tokens_seen": 39909136, "step": 5245 }, { "epoch": 1.4259422750424449, - "grad_norm": 0.06488379836082458, + "grad_norm": 0.06330297142267227, "learning_rate": 2.693612871555541e-05, - "loss": 0.0213, - "num_input_tokens_seen": 39579376, + "loss": 0.0204, + "num_input_tokens_seen": 39942672, "step": 5250 }, { "epoch": 1.4273005093378608, - "grad_norm": 0.06412303447723389, + "grad_norm": 0.0643848180770874, "learning_rate": 2.690067266621325e-05, - "loss": 0.0241, - "num_input_tokens_seen": 39616976, + "loss": 0.0223, + "num_input_tokens_seen": 39980512, "step": 5255 }, { "epoch": 1.4286587436332767, - "grad_norm": 0.06902553141117096, + "grad_norm": 0.06675079464912415, "learning_rate": 2.686521277119355e-05, - "loss": 0.0237, - "num_input_tokens_seen": 39653456, + "loss": 0.023, + "num_input_tokens_seen": 40017680, "step": 5260 }, { "epoch": 1.4300169779286926, - "grad_norm": 0.08190702646970749, + "grad_norm": 0.11601167172193527, "learning_rate": 2.6829749102243184e-05, - "loss": 0.0218, - "num_input_tokens_seen": 39692144, + "loss": 0.0205, + "num_input_tokens_seen": 40056336, "step": 5265 }, { "epoch": 1.4313752122241086, - "grad_norm": 0.0807967483997345, + "grad_norm": 0.08099468052387238, "learning_rate": 2.679428173111667e-05, - "loss": 0.0233, - "num_input_tokens_seen": 39731392, + "loss": 0.0226, + "num_input_tokens_seen": 40095968, "step": 5270 }, { "epoch": 1.4327334465195247, - "grad_norm": 0.0548405647277832, + "grad_norm": 0.05389860272407532, "learning_rate": 2.6758810729575996e-05, - "loss": 0.0178, - "num_input_tokens_seen": 39770752, + "loss": 0.0174, + "num_input_tokens_seen": 40135296, "step": 5275 }, { "epoch": 1.4340916808149407, - "grad_norm": 0.061352528631687164, + "grad_norm": 0.05975969508290291, "learning_rate": 2.6723336169390517e-05, - "loss": 0.0242, - "num_input_tokens_seen": 39811360, + "loss": 0.0234, + "num_input_tokens_seen": 40176480, "step": 5280 }, { "epoch": 1.4354499151103566, - "grad_norm": 0.08337028324604034, + "grad_norm": 0.09432240575551987, "learning_rate": 2.668785812233678e-05, "loss": 0.0237, - "num_input_tokens_seen": 39846592, + "num_input_tokens_seen": 40212080, "step": 5285 }, { "epoch": 1.4368081494057725, - "grad_norm": 0.067890964448452, + "grad_norm": 0.0685882717370987, "learning_rate": 2.6652376660198374e-05, - "loss": 0.022, - "num_input_tokens_seen": 39880624, + "loss": 0.0208, + "num_input_tokens_seen": 40246672, "step": 5290 }, { "epoch": 1.4381663837011884, - "grad_norm": 0.08204153925180435, + "grad_norm": 0.07120517641305923, "learning_rate": 2.6616891854765817e-05, - "loss": 0.0235, - "num_input_tokens_seen": 39920320, + "loss": 0.0229, + "num_input_tokens_seen": 40286720, "step": 5295 }, { "epoch": 1.4395246179966044, - "grad_norm": 0.06897828727960587, + "grad_norm": 0.06598452478647232, "learning_rate": 2.658140377783639e-05, - "loss": 0.0247, - "num_input_tokens_seen": 39957392, + "loss": 0.0237, + "num_input_tokens_seen": 40324048, "step": 5300 }, { "epoch": 1.4408828522920203, - "grad_norm": 0.06335163116455078, + "grad_norm": 0.07643125206232071, "learning_rate": 2.654591250121398e-05, - "loss": 0.0216, - "num_input_tokens_seen": 39992224, + "loss": 0.021, + "num_input_tokens_seen": 40359696, "step": 5305 }, { "epoch": 1.4422410865874364, - "grad_norm": 0.11620504409074783, + "grad_norm": 0.0646776482462883, "learning_rate": 2.6510418096708962e-05, - "loss": 0.0239, - "num_input_tokens_seen": 40029776, + "loss": 0.0225, + "num_input_tokens_seen": 40397552, "step": 5310 }, { "epoch": 1.4435993208828524, - "grad_norm": 0.0801878273487091, + "grad_norm": 0.0733875036239624, "learning_rate": 2.6474920636138028e-05, - "loss": 0.0224, - "num_input_tokens_seen": 40066080, + "loss": 0.0212, + "num_input_tokens_seen": 40434592, "step": 5315 }, { "epoch": 1.4449575551782683, - "grad_norm": 0.07442334294319153, + "grad_norm": 0.0704973116517067, "learning_rate": 2.6439420191324066e-05, "loss": 0.0238, - "num_input_tokens_seen": 40101696, + "num_input_tokens_seen": 40470208, "step": 5320 }, { "epoch": 1.4463157894736842, - "grad_norm": 0.09909939020872116, + "grad_norm": 0.12577655911445618, "learning_rate": 2.6403916834095986e-05, - "loss": 0.024, - "num_input_tokens_seen": 40136656, + "loss": 0.023, + "num_input_tokens_seen": 40505520, "step": 5325 }, { "epoch": 1.4476740237691001, - "grad_norm": 0.06508640199899673, + "grad_norm": 0.05730614811182022, "learning_rate": 2.6368410636288603e-05, - "loss": 0.0217, - "num_input_tokens_seen": 40170992, + "loss": 0.0209, + "num_input_tokens_seen": 40539680, "step": 5330 }, { "epoch": 1.449032258064516, - "grad_norm": 0.0813475251197815, + "grad_norm": 0.07864313572645187, "learning_rate": 2.6332901669742483e-05, - "loss": 0.025, - "num_input_tokens_seen": 40205248, + "loss": 0.024, + "num_input_tokens_seen": 40574672, "step": 5335 }, { "epoch": 1.450390492359932, - "grad_norm": 0.07520993798971176, + "grad_norm": 0.07452428340911865, "learning_rate": 2.6297390006303785e-05, - "loss": 0.0226, - "num_input_tokens_seen": 40240896, + "loss": 0.0218, + "num_input_tokens_seen": 40609776, "step": 5340 }, { "epoch": 1.4517487266553482, - "grad_norm": 0.09048011898994446, + "grad_norm": 0.08726528286933899, "learning_rate": 2.626187571782412e-05, "loss": 0.0238, - "num_input_tokens_seen": 40281664, + "num_input_tokens_seen": 40650736, "step": 5345 }, { "epoch": 1.453106960950764, - "grad_norm": 0.06863588094711304, + "grad_norm": 0.06545647978782654, "learning_rate": 2.6226358876160423e-05, - "loss": 0.022, - "num_input_tokens_seen": 40325232, + "loss": 0.0216, + "num_input_tokens_seen": 40693776, "step": 5350 }, { "epoch": 1.45446519524618, - "grad_norm": 0.07342568784952164, + "grad_norm": 0.0651511549949646, "learning_rate": 2.6190839553174785e-05, - "loss": 0.0224, - "num_input_tokens_seen": 40362336, + "loss": 0.0219, + "num_input_tokens_seen": 40730832, "step": 5355 }, { "epoch": 1.455823429541596, - "grad_norm": 0.07908016443252563, + "grad_norm": 0.07261963188648224, "learning_rate": 2.6155317820734326e-05, - "loss": 0.0262, - "num_input_tokens_seen": 40405856, + "loss": 0.0257, + "num_input_tokens_seen": 40774656, "step": 5360 }, { "epoch": 1.4571816638370119, - "grad_norm": 0.06552285701036453, + "grad_norm": 0.06197890266776085, "learning_rate": 2.6119793750711035e-05, - "loss": 0.0225, - "num_input_tokens_seen": 40444816, + "loss": 0.0212, + "num_input_tokens_seen": 40813584, "step": 5365 }, { "epoch": 1.4585398981324278, - "grad_norm": 0.06925879418849945, + "grad_norm": 0.07417548447847366, "learning_rate": 2.6084267414981634e-05, - "loss": 0.024, - "num_input_tokens_seen": 40478032, + "loss": 0.0236, + "num_input_tokens_seen": 40847104, "step": 5370 }, { "epoch": 1.4598981324278437, - "grad_norm": 0.06812389940023422, + "grad_norm": 0.22232474386692047, "learning_rate": 2.604873888542743e-05, - "loss": 0.0239, - "num_input_tokens_seen": 40515104, + "loss": 0.0235, + "num_input_tokens_seen": 40884480, "step": 5375 }, { "epoch": 1.4612563667232599, - "grad_norm": 0.06670275330543518, + "grad_norm": 0.07234742492437363, "learning_rate": 2.6013208233934168e-05, - "loss": 0.0233, - "num_input_tokens_seen": 40558048, + "loss": 0.0235, + "num_input_tokens_seen": 40927744, "step": 5380 }, { "epoch": 1.4626146010186758, - "grad_norm": 0.07094362378120422, + "grad_norm": 0.06311970204114914, "learning_rate": 2.597767553239187e-05, - "loss": 0.0221, - "num_input_tokens_seen": 40591104, + "loss": 0.0218, + "num_input_tokens_seen": 40961168, "step": 5385 }, { "epoch": 1.4639728353140917, - "grad_norm": 0.055240336805582047, + "grad_norm": 0.050982389599084854, "learning_rate": 2.5942140852694735e-05, - "loss": 0.0201, - "num_input_tokens_seen": 40629328, + "loss": 0.0194, + "num_input_tokens_seen": 40999856, "step": 5390 }, { "epoch": 1.4653310696095077, - "grad_norm": 0.07453335076570511, + "grad_norm": 0.08263373374938965, "learning_rate": 2.5906604266740946e-05, - "loss": 0.0223, - "num_input_tokens_seen": 40665696, + "loss": 0.0214, + "num_input_tokens_seen": 41036736, "step": 5395 }, { "epoch": 1.4666893039049236, - "grad_norm": 0.07131902873516083, + "grad_norm": 0.06723923981189728, "learning_rate": 2.587106584643255e-05, - "loss": 0.0261, - "num_input_tokens_seen": 40703888, + "loss": 0.0257, + "num_input_tokens_seen": 41075488, "step": 5400 }, { "epoch": 1.4680475382003395, - "grad_norm": 0.06715118885040283, + "grad_norm": 0.07190420478582382, "learning_rate": 2.5835525663675307e-05, - "loss": 0.0279, - "num_input_tokens_seen": 40736096, + "loss": 0.0263, + "num_input_tokens_seen": 41107296, "step": 5405 }, { "epoch": 1.4694057724957554, - "grad_norm": 0.05577417463064194, + "grad_norm": 0.05527619272470474, "learning_rate": 2.579998379037853e-05, - "loss": 0.0181, - "num_input_tokens_seen": 40769344, + "loss": 0.0175, + "num_input_tokens_seen": 41140944, "step": 5410 }, { "epoch": 1.4707640067911716, - "grad_norm": 0.0784672349691391, + "grad_norm": 0.07915385812520981, "learning_rate": 2.5764440298454962e-05, - "loss": 0.0249, - "num_input_tokens_seen": 40804720, + "loss": 0.024, + "num_input_tokens_seen": 41176304, "step": 5415 }, { "epoch": 1.4721222410865875, - "grad_norm": 0.07332061231136322, + "grad_norm": 0.07157456129789352, "learning_rate": 2.5728895259820624e-05, - "loss": 0.0257, - "num_input_tokens_seen": 40845488, + "loss": 0.0249, + "num_input_tokens_seen": 41217664, "step": 5420 }, { "epoch": 1.4734804753820034, - "grad_norm": 0.10234995931386948, + "grad_norm": 0.09950970858335495, "learning_rate": 2.5693348746394664e-05, - "loss": 0.0227, - "num_input_tokens_seen": 40879520, + "loss": 0.0225, + "num_input_tokens_seen": 41252080, "step": 5425 }, { "epoch": 1.4748387096774194, - "grad_norm": 0.060830697417259216, + "grad_norm": 0.059746671468019485, "learning_rate": 2.5657800830099204e-05, - "loss": 0.0225, - "num_input_tokens_seen": 40921344, + "loss": 0.0215, + "num_input_tokens_seen": 41293968, "step": 5430 }, { "epoch": 1.4761969439728353, - "grad_norm": 0.06066186726093292, + "grad_norm": 0.06419409066438675, "learning_rate": 2.5622251582859224e-05, - "loss": 0.024, - "num_input_tokens_seen": 40961888, + "loss": 0.0256, + "num_input_tokens_seen": 41334880, "step": 5435 }, { "epoch": 1.4775551782682512, - "grad_norm": 0.07994074374437332, + "grad_norm": 0.10189487040042877, "learning_rate": 2.5586701076602393e-05, - "loss": 0.0242, - "num_input_tokens_seen": 40994336, + "loss": 0.0234, + "num_input_tokens_seen": 41367840, "step": 5440 }, { "epoch": 1.4789134125636672, - "grad_norm": 0.05833202227950096, + "grad_norm": 0.05525585636496544, "learning_rate": 2.55511493832589e-05, - "loss": 0.0226, - "num_input_tokens_seen": 41030464, + "loss": 0.0221, + "num_input_tokens_seen": 41404384, "step": 5445 }, { "epoch": 1.4802716468590833, - "grad_norm": 0.07037898898124695, + "grad_norm": 0.07335818558931351, "learning_rate": 2.5515596574761365e-05, - "loss": 0.0228, - "num_input_tokens_seen": 41065552, + "loss": 0.0218, + "num_input_tokens_seen": 41440112, "step": 5450 }, { "epoch": 1.4816298811544992, - "grad_norm": 0.06961207836866379, + "grad_norm": 0.0657782033085823, "learning_rate": 2.5480042723044656e-05, - "loss": 0.0249, - "num_input_tokens_seen": 41102784, + "loss": 0.0241, + "num_input_tokens_seen": 41477904, "step": 5455 }, { "epoch": 1.4829881154499152, - "grad_norm": 0.06795570254325867, + "grad_norm": 0.06898418068885803, "learning_rate": 2.5444487900045756e-05, - "loss": 0.0249, - "num_input_tokens_seen": 41138896, + "loss": 0.0237, + "num_input_tokens_seen": 41514096, "step": 5460 }, { "epoch": 1.484346349745331, - "grad_norm": 0.06811458617448807, + "grad_norm": 0.06857375055551529, "learning_rate": 2.5408932177703603e-05, - "loss": 0.0284, - "num_input_tokens_seen": 41175632, + "loss": 0.0266, + "num_input_tokens_seen": 41551104, "step": 5465 }, { "epoch": 1.485704584040747, - "grad_norm": 0.06638490408658981, + "grad_norm": 0.07178980112075806, "learning_rate": 2.5373375627958966e-05, - "loss": 0.0191, - "num_input_tokens_seen": 41214752, + "loss": 0.0196, + "num_input_tokens_seen": 41589856, "step": 5470 }, { "epoch": 1.487062818336163, - "grad_norm": 0.051260873675346375, + "grad_norm": 0.06357000023126602, "learning_rate": 2.5337818322754287e-05, - "loss": 0.0227, - "num_input_tokens_seen": 41257200, + "loss": 0.0214, + "num_input_tokens_seen": 41632416, "step": 5475 }, { "epoch": 1.4884210526315789, - "grad_norm": 0.15929630398750305, + "grad_norm": 0.122623510658741, "learning_rate": 2.5302260334033518e-05, - "loss": 0.0243, - "num_input_tokens_seen": 41291248, + "loss": 0.0227, + "num_input_tokens_seen": 41667104, "step": 5480 }, { "epoch": 1.489779286926995, - "grad_norm": 0.08481497317552567, + "grad_norm": 0.0813496932387352, "learning_rate": 2.5266701733742015e-05, - "loss": 0.0221, - "num_input_tokens_seen": 41328144, + "loss": 0.0208, + "num_input_tokens_seen": 41704192, "step": 5485 }, { "epoch": 1.491137521222411, - "grad_norm": 0.09031788259744644, + "grad_norm": 0.08662014454603195, "learning_rate": 2.5231142593826364e-05, - "loss": 0.0233, - "num_input_tokens_seen": 41361824, + "loss": 0.0228, + "num_input_tokens_seen": 41739504, "step": 5490 }, { "epoch": 1.4924957555178269, - "grad_norm": 0.07357614487409592, + "grad_norm": 0.08136535435914993, "learning_rate": 2.5195582986234244e-05, - "loss": 0.0205, - "num_input_tokens_seen": 41397296, + "loss": 0.0197, + "num_input_tokens_seen": 41775792, "step": 5495 }, { "epoch": 1.4938539898132428, - "grad_norm": 0.06317932903766632, + "grad_norm": 0.062364641577005386, "learning_rate": 2.5160022982914284e-05, - "loss": 0.0246, - "num_input_tokens_seen": 41432320, + "loss": 0.0241, + "num_input_tokens_seen": 41811200, "step": 5500 }, { "epoch": 1.4952122241086587, - "grad_norm": 0.07367074489593506, + "grad_norm": 0.07676199823617935, "learning_rate": 2.51244626558159e-05, - "loss": 0.0237, - "num_input_tokens_seen": 41470272, + "loss": 0.0235, + "num_input_tokens_seen": 41850064, "step": 5505 }, { "epoch": 1.4965704584040747, - "grad_norm": 0.31130602955818176, + "grad_norm": 0.056583426892757416, "learning_rate": 2.508890207688918e-05, - "loss": 0.0254, - "num_input_tokens_seen": 41509696, + "loss": 0.0227, + "num_input_tokens_seen": 41889760, "step": 5510 }, { "epoch": 1.4979286926994906, - "grad_norm": 0.06500332057476044, + "grad_norm": 0.06550715118646622, "learning_rate": 2.5053341318084712e-05, - "loss": 0.0255, - "num_input_tokens_seen": 41551024, + "loss": 0.0251, + "num_input_tokens_seen": 41931152, "step": 5515 }, { "epoch": 1.4992869269949067, - "grad_norm": 0.08249549567699432, + "grad_norm": 0.08628766983747482, "learning_rate": 2.5017780451353444e-05, - "loss": 0.0226, - "num_input_tokens_seen": 41587088, + "loss": 0.0217, + "num_input_tokens_seen": 41967728, "step": 5520 }, { "epoch": 1.5006451612903224, - "grad_norm": 0.0655369907617569, + "grad_norm": 0.06417880952358246, "learning_rate": 2.4982219548646565e-05, - "loss": 0.0291, - "num_input_tokens_seen": 41631104, + "loss": 0.0281, + "num_input_tokens_seen": 42012448, "step": 5525 }, { "epoch": 1.5020033955857386, - "grad_norm": 0.0672750473022461, + "grad_norm": 0.06570667028427124, "learning_rate": 2.49466586819153e-05, - "loss": 0.0195, - "num_input_tokens_seen": 41671376, + "loss": 0.019, + "num_input_tokens_seen": 42052944, "step": 5530 }, { "epoch": 1.5033616298811545, - "grad_norm": 0.0822894349694252, + "grad_norm": 0.07598859816789627, "learning_rate": 2.491109792311083e-05, - "loss": 0.0229, - "num_input_tokens_seen": 41709312, + "loss": 0.0221, + "num_input_tokens_seen": 42091152, "step": 5535 }, { "epoch": 1.5047198641765704, - "grad_norm": 0.06528326123952866, + "grad_norm": 0.06767242401838303, "learning_rate": 2.4875537344184107e-05, - "loss": 0.0242, - "num_input_tokens_seen": 41739936, + "loss": 0.0233, + "num_input_tokens_seen": 42122336, "step": 5540 }, { "epoch": 1.5060780984719864, - "grad_norm": 0.07415784150362015, + "grad_norm": 0.075541652739048, "learning_rate": 2.4839977017085725e-05, - "loss": 0.0235, - "num_input_tokens_seen": 41771616, + "loss": 0.0222, + "num_input_tokens_seen": 42154160, "step": 5545 }, { "epoch": 1.5074363327674023, - "grad_norm": 0.4940996766090393, + "grad_norm": 0.05461564660072327, "learning_rate": 2.480441701376576e-05, - "loss": 0.0205, - "num_input_tokens_seen": 41806352, + "loss": 0.0189, + "num_input_tokens_seen": 42189408, "step": 5550 }, { "epoch": 1.5087945670628184, - "grad_norm": 0.06652292609214783, + "grad_norm": 0.08329666405916214, "learning_rate": 2.4768857406173642e-05, - "loss": 0.0252, - "num_input_tokens_seen": 41846928, + "loss": 0.0232, + "num_input_tokens_seen": 42230288, "step": 5555 }, { "epoch": 1.5101528013582342, - "grad_norm": 0.10073066502809525, + "grad_norm": 0.058408744633197784, "learning_rate": 2.4733298266257994e-05, - "loss": 0.0232, - "num_input_tokens_seen": 41884960, + "loss": 0.021, + "num_input_tokens_seen": 42268656, "step": 5560 }, { "epoch": 1.5115110356536503, - "grad_norm": 0.06277264654636383, + "grad_norm": 0.057225123047828674, "learning_rate": 2.4697739665966488e-05, - "loss": 0.0236, - "num_input_tokens_seen": 41922608, + "loss": 0.0215, + "num_input_tokens_seen": 42306032, "step": 5565 }, { "epoch": 1.5128692699490662, - "grad_norm": 0.08400798588991165, + "grad_norm": 0.07627663016319275, "learning_rate": 2.466218167724572e-05, - "loss": 0.0229, - "num_input_tokens_seen": 41960912, + "loss": 0.0216, + "num_input_tokens_seen": 42345264, "step": 5570 }, { "epoch": 1.5142275042444822, - "grad_norm": 0.09464214742183685, + "grad_norm": 0.07789086550474167, "learning_rate": 2.4626624372041033e-05, - "loss": 0.0255, - "num_input_tokens_seen": 41995456, + "loss": 0.0229, + "num_input_tokens_seen": 42380064, "step": 5575 }, { "epoch": 1.515585738539898, - "grad_norm": 0.09425076842308044, + "grad_norm": 0.062203120440244675, "learning_rate": 2.4591067822296396e-05, - "loss": 0.0212, - "num_input_tokens_seen": 42032112, + "loss": 0.0198, + "num_input_tokens_seen": 42417024, "step": 5580 }, { "epoch": 1.516943972835314, - "grad_norm": 0.222529336810112, + "grad_norm": 0.1261635720729828, "learning_rate": 2.4555512099954243e-05, - "loss": 0.0253, - "num_input_tokens_seen": 42065376, + "loss": 0.0237, + "num_input_tokens_seen": 42450560, "step": 5585 }, { "epoch": 1.5183022071307302, - "grad_norm": 0.05081102252006531, + "grad_norm": 0.044348254799842834, "learning_rate": 2.4519957276955354e-05, - "loss": 0.0228, - "num_input_tokens_seen": 42097824, + "loss": 0.0219, + "num_input_tokens_seen": 42483360, "step": 5590 }, { "epoch": 1.5196604414261459, - "grad_norm": 0.1382300704717636, + "grad_norm": 0.0854431614279747, "learning_rate": 2.4484403425238648e-05, - "loss": 0.0241, - "num_input_tokens_seen": 42134256, + "loss": 0.0233, + "num_input_tokens_seen": 42520736, "step": 5595 }, { "epoch": 1.521018675721562, - "grad_norm": 0.21996216475963593, + "grad_norm": 0.06003598868846893, "learning_rate": 2.4448850616741113e-05, - "loss": 0.0217, - "num_input_tokens_seen": 42178880, + "loss": 0.0204, + "num_input_tokens_seen": 42565728, "step": 5600 }, { "epoch": 1.522376910016978, - "grad_norm": 0.0675797313451767, + "grad_norm": 0.06501615792512894, "learning_rate": 2.441329892339762e-05, - "loss": 0.026, - "num_input_tokens_seen": 42220848, + "loss": 0.0246, + "num_input_tokens_seen": 42608048, "step": 5605 }, { "epoch": 1.5237351443123939, - "grad_norm": 0.07650138437747955, + "grad_norm": 0.06390561163425446, "learning_rate": 2.437774841714078e-05, - "loss": 0.0206, - "num_input_tokens_seen": 42255744, + "loss": 0.0196, + "num_input_tokens_seen": 42643344, "step": 5610 }, { "epoch": 1.5250933786078098, - "grad_norm": 0.058517057448625565, + "grad_norm": 0.10138913989067078, "learning_rate": 2.4342199169900802e-05, - "loss": 0.0217, - "num_input_tokens_seen": 42298368, + "loss": 0.0198, + "num_input_tokens_seen": 42686160, "step": 5615 }, { "epoch": 1.5264516129032257, - "grad_norm": 0.08017346262931824, + "grad_norm": 0.07548526674509048, "learning_rate": 2.4306651253605342e-05, - "loss": 0.0242, - "num_input_tokens_seen": 42333920, + "loss": 0.0237, + "num_input_tokens_seen": 42722400, "step": 5620 }, { "epoch": 1.5278098471986419, - "grad_norm": 0.07034081965684891, + "grad_norm": 0.06796387583017349, "learning_rate": 2.427110474017938e-05, - "loss": 0.0224, - "num_input_tokens_seen": 42373424, + "loss": 0.0212, + "num_input_tokens_seen": 42762272, "step": 5625 }, { "epoch": 1.5291680814940576, - "grad_norm": 0.056663986295461655, + "grad_norm": 0.07166541367769241, "learning_rate": 2.423555970154504e-05, - "loss": 0.0242, - "num_input_tokens_seen": 42415792, + "loss": 0.0236, + "num_input_tokens_seen": 42804528, "step": 5630 }, { "epoch": 1.5305263157894737, - "grad_norm": 0.07644589245319366, + "grad_norm": 0.07266102731227875, "learning_rate": 2.4200016209621473e-05, - "loss": 0.0205, - "num_input_tokens_seen": 42452944, + "loss": 0.02, + "num_input_tokens_seen": 42842080, "step": 5635 }, { "epoch": 1.5318845500848897, - "grad_norm": 0.07390902191400528, + "grad_norm": 0.08697303384542465, "learning_rate": 2.4164474336324695e-05, - "loss": 0.0233, - "num_input_tokens_seen": 42490576, + "loss": 0.0223, + "num_input_tokens_seen": 42880096, "step": 5640 }, { "epoch": 1.5332427843803056, - "grad_norm": 0.059082433581352234, + "grad_norm": 0.053325824439525604, "learning_rate": 2.4128934153567448e-05, - "loss": 0.0199, - "num_input_tokens_seen": 42522224, + "loss": 0.019, + "num_input_tokens_seen": 42912064, "step": 5645 }, { "epoch": 1.5346010186757215, - "grad_norm": 0.10129754990339279, + "grad_norm": 0.0636473223567009, "learning_rate": 2.409339573325905e-05, - "loss": 0.0224, - "num_input_tokens_seen": 42563120, + "loss": 0.021, + "num_input_tokens_seen": 42953152, "step": 5650 }, { "epoch": 1.5359592529711374, - "grad_norm": 0.06713312119245529, + "grad_norm": 0.06718988716602325, "learning_rate": 2.4057859147305264e-05, - "loss": 0.0214, - "num_input_tokens_seen": 42606256, + "loss": 0.0217, + "num_input_tokens_seen": 42996688, "step": 5655 }, { "epoch": 1.5373174872665536, - "grad_norm": 0.07387067377567291, + "grad_norm": 0.07799860090017319, "learning_rate": 2.402232446760814e-05, - "loss": 0.0236, - "num_input_tokens_seen": 42638400, + "loss": 0.0222, + "num_input_tokens_seen": 43029072, "step": 5660 }, { "epoch": 1.5386757215619693, - "grad_norm": 0.06997398287057877, + "grad_norm": 0.05269554257392883, "learning_rate": 2.3986791766065848e-05, - "loss": 0.0202, - "num_input_tokens_seen": 42678880, + "loss": 0.0194, + "num_input_tokens_seen": 43069920, "step": 5665 }, { "epoch": 1.5400339558573855, - "grad_norm": 0.1527748554944992, + "grad_norm": 0.06410904228687286, "learning_rate": 2.3951261114572576e-05, - "loss": 0.0232, - "num_input_tokens_seen": 42715728, + "loss": 0.022, + "num_input_tokens_seen": 43106928, "step": 5670 }, { "epoch": 1.5413921901528014, - "grad_norm": 0.13918256759643555, + "grad_norm": 0.06798090040683746, "learning_rate": 2.3915732585018368e-05, - "loss": 0.0238, - "num_input_tokens_seen": 42748480, + "loss": 0.0228, + "num_input_tokens_seen": 43140288, "step": 5675 }, { "epoch": 1.5427504244482173, - "grad_norm": 0.07530725747346878, + "grad_norm": 0.07531072199344635, "learning_rate": 2.388020624928897e-05, - "loss": 0.0242, - "num_input_tokens_seen": 42793232, + "loss": 0.0243, + "num_input_tokens_seen": 43186192, "step": 5680 }, { "epoch": 1.5441086587436332, - "grad_norm": 0.3548333942890167, + "grad_norm": 0.07235848158597946, "learning_rate": 2.384468217926568e-05, - "loss": 0.0213, - "num_input_tokens_seen": 42833216, + "loss": 0.0208, + "num_input_tokens_seen": 43226432, "step": 5685 }, { "epoch": 1.5454668930390492, - "grad_norm": 0.07701446861028671, + "grad_norm": 0.060144148766994476, "learning_rate": 2.380916044682522e-05, - "loss": 0.0216, - "num_input_tokens_seen": 42868544, + "loss": 0.0206, + "num_input_tokens_seen": 43261952, "step": 5690 }, { "epoch": 1.5468251273344653, - "grad_norm": 0.10194521397352219, + "grad_norm": 0.08592890948057175, "learning_rate": 2.3773641123839582e-05, - "loss": 0.0227, - "num_input_tokens_seen": 42907760, + "loss": 0.0216, + "num_input_tokens_seen": 43301888, "step": 5695 }, { "epoch": 1.548183361629881, - "grad_norm": 0.06494913250207901, + "grad_norm": 0.06696067750453949, "learning_rate": 2.3738124282175885e-05, - "loss": 0.0244, - "num_input_tokens_seen": 42943776, + "loss": 0.0237, + "num_input_tokens_seen": 43338288, "step": 5700 }, { "epoch": 1.5495415959252972, - "grad_norm": 0.3170040249824524, + "grad_norm": 0.05403844267129898, "learning_rate": 2.370260999369622e-05, - "loss": 0.0216, - "num_input_tokens_seen": 42980064, + "loss": 0.0204, + "num_input_tokens_seen": 43374864, "step": 5705 }, { "epoch": 1.550899830220713, - "grad_norm": 0.060080353170633316, + "grad_norm": 0.07326831668615341, "learning_rate": 2.3667098330257516e-05, - "loss": 0.0249, - "num_input_tokens_seen": 43014064, + "loss": 0.0239, + "num_input_tokens_seen": 43409168, "step": 5710 }, { "epoch": 1.552258064516129, - "grad_norm": 0.2713087797164917, + "grad_norm": 0.05972052365541458, "learning_rate": 2.3631589363711396e-05, - "loss": 0.0222, - "num_input_tokens_seen": 43050944, + "loss": 0.0205, + "num_input_tokens_seen": 43446560, "step": 5715 }, { "epoch": 1.553616298811545, - "grad_norm": 0.05927253141999245, + "grad_norm": 0.05852935463190079, "learning_rate": 2.3596083165904016e-05, - "loss": 0.0233, - "num_input_tokens_seen": 43093424, + "loss": 0.0228, + "num_input_tokens_seen": 43489792, "step": 5720 }, { "epoch": 1.5549745331069609, - "grad_norm": 0.11192373186349869, + "grad_norm": 0.07164724171161652, "learning_rate": 2.3560579808675943e-05, - "loss": 0.0235, - "num_input_tokens_seen": 43126880, + "loss": 0.0213, + "num_input_tokens_seen": 43523456, "step": 5725 }, { "epoch": 1.556332767402377, - "grad_norm": 0.06608562171459198, + "grad_norm": 0.06650342792272568, "learning_rate": 2.3525079363861978e-05, - "loss": 0.0224, - "num_input_tokens_seen": 43164368, + "loss": 0.0209, + "num_input_tokens_seen": 43560864, "step": 5730 }, { "epoch": 1.5576910016977927, - "grad_norm": 0.06360087543725967, + "grad_norm": 0.05741402879357338, "learning_rate": 2.3489581903291044e-05, - "loss": 0.0211, - "num_input_tokens_seen": 43201264, + "loss": 0.0202, + "num_input_tokens_seen": 43598096, "step": 5735 }, { "epoch": 1.5590492359932089, - "grad_norm": 0.06886191666126251, + "grad_norm": 0.06168164685368538, "learning_rate": 2.3454087498786024e-05, - "loss": 0.026, - "num_input_tokens_seen": 43236928, + "loss": 0.0244, + "num_input_tokens_seen": 43634048, "step": 5740 }, { "epoch": 1.5604074702886248, - "grad_norm": 0.07811182737350464, + "grad_norm": 0.07885050028562546, "learning_rate": 2.3418596222163612e-05, - "loss": 0.0244, - "num_input_tokens_seen": 43272272, + "loss": 0.0234, + "num_input_tokens_seen": 43669824, "step": 5745 }, { "epoch": 1.5617657045840407, - "grad_norm": 0.06188252568244934, + "grad_norm": 0.0652555301785469, "learning_rate": 2.3383108145234185e-05, - "loss": 0.024, - "num_input_tokens_seen": 43307696, + "loss": 0.0256, + "num_input_tokens_seen": 43705904, "step": 5750 }, { "epoch": 1.5631239388794567, - "grad_norm": 0.08145466446876526, + "grad_norm": 0.05286850780248642, "learning_rate": 2.3347623339801632e-05, - "loss": 0.0224, - "num_input_tokens_seen": 43339216, + "loss": 0.0213, + "num_input_tokens_seen": 43737952, "step": 5755 }, { "epoch": 1.5644821731748726, - "grad_norm": 0.06275265663862228, + "grad_norm": 0.06385361403226852, "learning_rate": 2.3312141877663226e-05, - "loss": 0.022, - "num_input_tokens_seen": 43376608, + "loss": 0.0204, + "num_input_tokens_seen": 43775968, "step": 5760 }, { "epoch": 1.5658404074702887, - "grad_norm": 0.061310768127441406, + "grad_norm": 0.06357962638139725, "learning_rate": 2.327666383060949e-05, - "loss": 0.0254, - "num_input_tokens_seen": 43413104, + "loss": 0.0248, + "num_input_tokens_seen": 43812288, "step": 5765 }, { "epoch": 1.5671986417657044, - "grad_norm": 0.07107134163379669, + "grad_norm": 0.07855304330587387, "learning_rate": 2.3241189270424007e-05, - "loss": 0.0218, - "num_input_tokens_seen": 43449024, + "loss": 0.0214, + "num_input_tokens_seen": 43848448, "step": 5770 }, { "epoch": 1.5685568760611206, - "grad_norm": 0.05577808991074562, + "grad_norm": 0.05779190734028816, "learning_rate": 2.3205718268883338e-05, - "loss": 0.0223, - "num_input_tokens_seen": 43488016, + "loss": 0.0217, + "num_input_tokens_seen": 43887824, "step": 5775 }, { "epoch": 1.5699151103565365, - "grad_norm": 0.08125761151313782, + "grad_norm": 0.07732636481523514, "learning_rate": 2.3170250897756818e-05, - "loss": 0.0231, - "num_input_tokens_seen": 43525568, + "loss": 0.0224, + "num_input_tokens_seen": 43926160, "step": 5780 }, { "epoch": 1.5712733446519525, - "grad_norm": 0.09342998266220093, + "grad_norm": 0.07363414764404297, "learning_rate": 2.3134787228806457e-05, - "loss": 0.0207, - "num_input_tokens_seen": 43568784, + "loss": 0.0206, + "num_input_tokens_seen": 43969680, "step": 5785 }, { "epoch": 1.5726315789473684, - "grad_norm": 0.07627476006746292, + "grad_norm": 0.07191505283117294, "learning_rate": 2.3099327333786758e-05, - "loss": 0.0218, - "num_input_tokens_seen": 43602624, + "loss": 0.0214, + "num_input_tokens_seen": 44004096, "step": 5790 }, { "epoch": 1.5739898132427843, - "grad_norm": 0.06518115848302841, + "grad_norm": 0.06319862604141235, "learning_rate": 2.3063871284444602e-05, - "loss": 0.0214, - "num_input_tokens_seen": 43639856, + "loss": 0.0208, + "num_input_tokens_seen": 44041696, "step": 5795 }, { "epoch": 1.5753480475382005, - "grad_norm": 0.07762468606233597, + "grad_norm": 0.07420594990253448, "learning_rate": 2.3028419152519073e-05, - "loss": 0.0197, - "num_input_tokens_seen": 43676896, + "loss": 0.0199, + "num_input_tokens_seen": 44079152, "step": 5800 }, { "epoch": 1.5767062818336162, - "grad_norm": 0.052777402102947235, + "grad_norm": 0.05289439111948013, "learning_rate": 2.2992971009741342e-05, - "loss": 0.0214, - "num_input_tokens_seen": 43718384, + "loss": 0.0211, + "num_input_tokens_seen": 44120352, "step": 5805 }, { "epoch": 1.5780645161290323, - "grad_norm": 0.06247423216700554, + "grad_norm": 0.061246607452631, "learning_rate": 2.29575269278345e-05, - "loss": 0.0227, - "num_input_tokens_seen": 43755664, + "loss": 0.0222, + "num_input_tokens_seen": 44158208, "step": 5810 }, { "epoch": 1.5794227504244482, - "grad_norm": 0.0672045573592186, + "grad_norm": 0.06384890526533127, "learning_rate": 2.2922086978513434e-05, - "loss": 0.0237, - "num_input_tokens_seen": 43792720, + "loss": 0.0233, + "num_input_tokens_seen": 44194800, "step": 5815 }, { "epoch": 1.5807809847198642, - "grad_norm": 0.050724368542432785, + "grad_norm": 0.047608643770217896, "learning_rate": 2.288665123348465e-05, - "loss": 0.0245, - "num_input_tokens_seen": 43824432, + "loss": 0.0238, + "num_input_tokens_seen": 44226608, "step": 5820 }, { "epoch": 1.58213921901528, - "grad_norm": 0.06322595477104187, + "grad_norm": 0.0636783018708229, "learning_rate": 2.285121976444617e-05, - "loss": 0.0193, - "num_input_tokens_seen": 43856064, + "loss": 0.0188, + "num_input_tokens_seen": 44258960, "step": 5825 }, { "epoch": 1.583497453310696, - "grad_norm": 0.06533841788768768, + "grad_norm": 0.06563393026590347, "learning_rate": 2.281579264308734e-05, - "loss": 0.0223, - "num_input_tokens_seen": 43892080, + "loss": 0.0212, + "num_input_tokens_seen": 44295360, "step": 5830 }, { "epoch": 1.5848556876061122, - "grad_norm": 0.05999269708991051, + "grad_norm": 0.08919782936573029, "learning_rate": 2.2780369941088737e-05, - "loss": 0.0235, - "num_input_tokens_seen": 43930192, + "loss": 0.0231, + "num_input_tokens_seen": 44333856, "step": 5835 }, { "epoch": 1.5862139219015279, - "grad_norm": 0.05806586891412735, + "grad_norm": 0.056667882949113846, "learning_rate": 2.2744951730121973e-05, - "loss": 0.023, - "num_input_tokens_seen": 43971136, + "loss": 0.0222, + "num_input_tokens_seen": 44374448, "step": 5840 }, { "epoch": 1.587572156196944, - "grad_norm": 0.07610578835010529, + "grad_norm": 0.10093308985233307, "learning_rate": 2.2709538081849577e-05, - "loss": 0.0221, - "num_input_tokens_seen": 44008720, + "loss": 0.0216, + "num_input_tokens_seen": 44412336, "step": 5845 }, { "epoch": 1.58893039049236, - "grad_norm": 0.0748520940542221, + "grad_norm": 0.06972062587738037, "learning_rate": 2.2674129067924856e-05, - "loss": 0.024, - "num_input_tokens_seen": 44041472, + "loss": 0.023, + "num_input_tokens_seen": 44445600, "step": 5850 }, { "epoch": 1.5902886247877759, - "grad_norm": 0.06864915788173676, + "grad_norm": 0.06975811719894409, "learning_rate": 2.2638724759991736e-05, - "loss": 0.0224, - "num_input_tokens_seen": 44079024, + "loss": 0.0225, + "num_input_tokens_seen": 44483488, "step": 5855 }, { "epoch": 1.5916468590831918, - "grad_norm": 0.05537925288081169, + "grad_norm": 0.052726611495018005, "learning_rate": 2.260332522968463e-05, - "loss": 0.0226, - "num_input_tokens_seen": 44119856, + "loss": 0.0218, + "num_input_tokens_seen": 44524400, "step": 5860 }, { "epoch": 1.5930050933786077, - "grad_norm": 0.07073484361171722, + "grad_norm": 0.06838538497686386, "learning_rate": 2.256793054862825e-05, - "loss": 0.0208, - "num_input_tokens_seen": 44155648, + "loss": 0.019, + "num_input_tokens_seen": 44560656, "step": 5865 }, { "epoch": 1.594363327674024, - "grad_norm": 0.06511124968528748, + "grad_norm": 0.09711752831935883, "learning_rate": 2.2532540788437533e-05, - "loss": 0.0216, - "num_input_tokens_seen": 44194896, + "loss": 0.0215, + "num_input_tokens_seen": 44600448, "step": 5870 }, { "epoch": 1.5957215619694396, - "grad_norm": 0.19548256695270538, + "grad_norm": 0.07649100571870804, "learning_rate": 2.249715602071745e-05, - "loss": 0.0233, - "num_input_tokens_seen": 44226944, + "loss": 0.0224, + "num_input_tokens_seen": 44633088, "step": 5875 }, { "epoch": 1.5970797962648557, - "grad_norm": 0.056777697056531906, + "grad_norm": 0.06637274473905563, "learning_rate": 2.2461776317062864e-05, - "loss": 0.0191, - "num_input_tokens_seen": 44261712, + "loss": 0.0184, + "num_input_tokens_seen": 44668208, "step": 5880 }, { "epoch": 1.5984380305602717, - "grad_norm": 0.05846523493528366, + "grad_norm": 0.07350891828536987, "learning_rate": 2.2426401749058397e-05, - "loss": 0.0216, - "num_input_tokens_seen": 44305792, + "loss": 0.0213, + "num_input_tokens_seen": 44712736, "step": 5885 }, { "epoch": 1.5997962648556876, - "grad_norm": 0.05641613528132439, + "grad_norm": 0.054964981973171234, "learning_rate": 2.2391032388278277e-05, - "loss": 0.0239, - "num_input_tokens_seen": 44340032, + "loss": 0.0233, + "num_input_tokens_seen": 44747216, "step": 5890 }, { "epoch": 1.6011544991511035, - "grad_norm": 0.06615839898586273, + "grad_norm": 0.0680345967411995, "learning_rate": 2.2355668306286198e-05, - "loss": 0.0266, - "num_input_tokens_seen": 44370704, + "loss": 0.0247, + "num_input_tokens_seen": 44779232, "step": 5895 }, { "epoch": 1.6025127334465195, - "grad_norm": 0.0633646547794342, + "grad_norm": 0.06711190193891525, "learning_rate": 2.2320309574635176e-05, - "loss": 0.0237, - "num_input_tokens_seen": 44412144, + "loss": 0.0228, + "num_input_tokens_seen": 44820976, "step": 5900 }, { "epoch": 1.6038709677419356, - "grad_norm": 0.06729670614004135, + "grad_norm": 0.06518176198005676, "learning_rate": 2.2284956264867393e-05, - "loss": 0.0236, - "num_input_tokens_seen": 44453536, + "loss": 0.0226, + "num_input_tokens_seen": 44862416, "step": 5905 }, { "epoch": 1.6052292020373513, - "grad_norm": 0.0584205687046051, + "grad_norm": 0.05932782590389252, "learning_rate": 2.2249608448514072e-05, - "loss": 0.0242, - "num_input_tokens_seen": 44492192, + "loss": 0.0241, + "num_input_tokens_seen": 44901248, "step": 5910 }, { "epoch": 1.6065874363327675, - "grad_norm": 0.06782110780477524, + "grad_norm": 0.07299770414829254, "learning_rate": 2.2214266197095313e-05, - "loss": 0.0213, - "num_input_tokens_seen": 44534528, + "loss": 0.0204, + "num_input_tokens_seen": 44943152, "step": 5915 }, { "epoch": 1.6079456706281834, - "grad_norm": 0.061381708830595016, + "grad_norm": 0.053681571036577225, "learning_rate": 2.2178929582119955e-05, - "loss": 0.0219, - "num_input_tokens_seen": 44569536, + "loss": 0.0208, + "num_input_tokens_seen": 44978400, "step": 5920 }, { "epoch": 1.6093039049235993, - "grad_norm": 0.07434074580669403, + "grad_norm": 0.07506358623504639, "learning_rate": 2.2143598675085442e-05, - "loss": 0.0235, - "num_input_tokens_seen": 44611760, + "loss": 0.023, + "num_input_tokens_seen": 45020560, "step": 5925 }, { "epoch": 1.6106621392190152, - "grad_norm": 0.8118953704833984, + "grad_norm": 0.07410407811403275, "learning_rate": 2.2108273547477675e-05, - "loss": 0.0235, - "num_input_tokens_seen": 44650032, + "loss": 0.0218, + "num_input_tokens_seen": 45059616, "step": 5930 }, { "epoch": 1.6120203735144312, - "grad_norm": 0.25898486375808716, + "grad_norm": 0.06528817862272263, "learning_rate": 2.2072954270770817e-05, - "loss": 0.0245, - "num_input_tokens_seen": 44690352, + "loss": 0.0235, + "num_input_tokens_seen": 45100272, "step": 5935 }, { "epoch": 1.6133786078098473, - "grad_norm": 0.074348583817482, + "grad_norm": 0.07352499663829803, "learning_rate": 2.2037640916427246e-05, - "loss": 0.0236, - "num_input_tokens_seen": 44726176, + "loss": 0.0216, + "num_input_tokens_seen": 45136912, "step": 5940 }, { "epoch": 1.614736842105263, - "grad_norm": 0.07309360057115555, + "grad_norm": 0.06833123415708542, "learning_rate": 2.2002333555897333e-05, - "loss": 0.0219, - "num_input_tokens_seen": 44767360, + "loss": 0.021, + "num_input_tokens_seen": 45178160, "step": 5945 }, { "epoch": 1.6160950764006792, - "grad_norm": 0.26140955090522766, + "grad_norm": 0.08347856253385544, "learning_rate": 2.1967032260619328e-05, - "loss": 0.0257, - "num_input_tokens_seen": 44802368, + "loss": 0.0251, + "num_input_tokens_seen": 45213424, "step": 5950 }, { "epoch": 1.617453310696095, - "grad_norm": 0.12731550633907318, + "grad_norm": 0.07707837969064713, "learning_rate": 2.19317371020192e-05, - "loss": 0.0234, - "num_input_tokens_seen": 44845760, + "loss": 0.0227, + "num_input_tokens_seen": 45257520, "step": 5955 }, { "epoch": 1.618811544991511, - "grad_norm": 0.07852014154195786, + "grad_norm": 0.08516712486743927, "learning_rate": 2.1896448151510515e-05, - "loss": 0.0244, - "num_input_tokens_seen": 44878176, + "loss": 0.0225, + "num_input_tokens_seen": 45290480, "step": 5960 }, { "epoch": 1.620169779286927, - "grad_norm": 0.08292017877101898, + "grad_norm": 0.06930142641067505, "learning_rate": 2.1861165480494263e-05, - "loss": 0.0228, - "num_input_tokens_seen": 44915136, + "loss": 0.0225, + "num_input_tokens_seen": 45328336, "step": 5965 }, { "epoch": 1.6215280135823429, - "grad_norm": 0.05232521891593933, + "grad_norm": 0.05107520520687103, "learning_rate": 2.1825889160358743e-05, - "loss": 0.0223, - "num_input_tokens_seen": 44952112, + "loss": 0.0218, + "num_input_tokens_seen": 45365664, "step": 5970 }, { "epoch": 1.622886247877759, - "grad_norm": 0.07840275019407272, + "grad_norm": 0.05756008252501488, "learning_rate": 2.1790619262479393e-05, - "loss": 0.0233, - "num_input_tokens_seen": 44985488, + "loss": 0.0208, + "num_input_tokens_seen": 45399328, "step": 5975 }, { "epoch": 1.6242444821731747, - "grad_norm": 0.06773694604635239, + "grad_norm": 0.10355553030967712, "learning_rate": 2.175535585821866e-05, - "loss": 0.0231, - "num_input_tokens_seen": 45022704, + "loss": 0.0222, + "num_input_tokens_seen": 45437168, "step": 5980 }, { "epoch": 1.625602716468591, - "grad_norm": 0.07408016175031662, + "grad_norm": 0.07006395608186722, "learning_rate": 2.1720099018925853e-05, - "loss": 0.0209, - "num_input_tokens_seen": 45058032, + "loss": 0.0213, + "num_input_tokens_seen": 45473168, "step": 5985 }, { "epoch": 1.6269609507640068, - "grad_norm": 0.060067445039749146, + "grad_norm": 0.055388469249010086, "learning_rate": 2.1684848815937007e-05, - "loss": 0.0199, - "num_input_tokens_seen": 45098848, + "loss": 0.019, + "num_input_tokens_seen": 45514224, "step": 5990 }, { "epoch": 1.6283191850594227, - "grad_norm": 0.05388723686337471, + "grad_norm": 0.05363514646887779, "learning_rate": 2.1649605320574717e-05, - "loss": 0.0219, - "num_input_tokens_seen": 45140288, + "loss": 0.0208, + "num_input_tokens_seen": 45555824, "step": 5995 }, { "epoch": 1.6296774193548387, - "grad_norm": 0.09219494462013245, + "grad_norm": 0.08136473596096039, "learning_rate": 2.1614368604148e-05, - "loss": 0.0227, - "num_input_tokens_seen": 45176192, + "loss": 0.0216, + "num_input_tokens_seen": 45591968, "step": 6000 }, { "epoch": 1.6310356536502546, - "grad_norm": 0.06733919680118561, + "grad_norm": 0.0638800710439682, "learning_rate": 2.1579138737952172e-05, - "loss": 0.025, - "num_input_tokens_seen": 45213104, + "loss": 0.0234, + "num_input_tokens_seen": 45629648, "step": 6005 }, { "epoch": 1.6323938879456708, - "grad_norm": 0.06585469841957092, + "grad_norm": 0.05967140942811966, "learning_rate": 2.1543915793268686e-05, - "loss": 0.0251, - "num_input_tokens_seen": 45249296, + "loss": 0.0236, + "num_input_tokens_seen": 45666320, "step": 6010 }, { "epoch": 1.6337521222410865, - "grad_norm": 0.07048477977514267, + "grad_norm": 0.07135913521051407, "learning_rate": 2.150869984136499e-05, - "loss": 0.0225, - "num_input_tokens_seen": 45293792, + "loss": 0.0219, + "num_input_tokens_seen": 45710880, "step": 6015 }, { "epoch": 1.6351103565365026, - "grad_norm": 0.10251811146736145, + "grad_norm": 0.07664114981889725, "learning_rate": 2.1473490953494377e-05, - "loss": 0.0245, - "num_input_tokens_seen": 45341408, + "loss": 0.0238, + "num_input_tokens_seen": 45758736, "step": 6020 }, { "epoch": 1.6364685908319185, - "grad_norm": 0.06972771137952805, + "grad_norm": 0.0829898789525032, "learning_rate": 2.1438289200895857e-05, - "loss": 0.0229, - "num_input_tokens_seen": 45383712, + "loss": 0.0224, + "num_input_tokens_seen": 45802128, "step": 6025 }, { "epoch": 1.6378268251273345, - "grad_norm": 0.07650493830442429, + "grad_norm": 0.07322639971971512, "learning_rate": 2.140309465479399e-05, - "loss": 0.0262, - "num_input_tokens_seen": 45422800, + "loss": 0.024, + "num_input_tokens_seen": 45841600, "step": 6030 }, { "epoch": 1.6391850594227504, - "grad_norm": 0.07909869402647018, + "grad_norm": 0.07010941952466965, "learning_rate": 2.1367907386398773e-05, - "loss": 0.0243, - "num_input_tokens_seen": 45460016, + "loss": 0.0231, + "num_input_tokens_seen": 45879088, "step": 6035 }, { "epoch": 1.6405432937181663, - "grad_norm": 0.06334605067968369, + "grad_norm": 0.05733799934387207, "learning_rate": 2.133272746690546e-05, - "loss": 0.0216, - "num_input_tokens_seen": 45501568, + "loss": 0.0204, + "num_input_tokens_seen": 45920944, "step": 6040 }, { "epoch": 1.6419015280135825, - "grad_norm": 0.062005579471588135, + "grad_norm": 0.06398631632328033, "learning_rate": 2.1297554967494444e-05, - "loss": 0.0209, - "num_input_tokens_seen": 45535504, + "loss": 0.0196, + "num_input_tokens_seen": 45954592, "step": 6045 }, { "epoch": 1.6432597623089982, - "grad_norm": 0.08561499416828156, + "grad_norm": 0.06675909459590912, "learning_rate": 2.126238995933111e-05, - "loss": 0.02, - "num_input_tokens_seen": 45572624, + "loss": 0.0195, + "num_input_tokens_seen": 45992304, "step": 6050 }, { "epoch": 1.6446179966044143, - "grad_norm": 0.0670948326587677, + "grad_norm": 0.06096784770488739, "learning_rate": 2.122723251356567e-05, - "loss": 0.0227, - "num_input_tokens_seen": 45609520, + "loss": 0.022, + "num_input_tokens_seen": 46029040, "step": 6055 }, { "epoch": 1.6459762308998303, - "grad_norm": 0.0624363049864769, + "grad_norm": 0.06156138330698013, "learning_rate": 2.1192082701333056e-05, - "loss": 0.0234, - "num_input_tokens_seen": 45644288, + "loss": 0.0221, + "num_input_tokens_seen": 46064064, "step": 6060 }, { "epoch": 1.6473344651952462, - "grad_norm": 0.07170990854501724, + "grad_norm": 0.06856270879507065, "learning_rate": 2.1156940593752745e-05, - "loss": 0.0221, - "num_input_tokens_seen": 45678624, + "loss": 0.0211, + "num_input_tokens_seen": 46098864, "step": 6065 }, { "epoch": 1.648692699490662, - "grad_norm": 0.09405393898487091, + "grad_norm": 0.05802920088171959, "learning_rate": 2.1121806261928613e-05, - "loss": 0.0221, - "num_input_tokens_seen": 45717152, + "loss": 0.021, + "num_input_tokens_seen": 46138224, "step": 6070 }, { "epoch": 1.650050933786078, - "grad_norm": 0.0944899395108223, + "grad_norm": 0.06598014384508133, "learning_rate": 2.1086679776948813e-05, - "loss": 0.0234, - "num_input_tokens_seen": 45752928, + "loss": 0.0223, + "num_input_tokens_seen": 46174640, "step": 6075 }, { "epoch": 1.6514091680814942, - "grad_norm": 0.06202540919184685, + "grad_norm": 0.06200994923710823, "learning_rate": 2.1051561209885627e-05, - "loss": 0.0202, - "num_input_tokens_seen": 45781392, + "loss": 0.0191, + "num_input_tokens_seen": 46203472, "step": 6080 }, { "epoch": 1.6527674023769099, - "grad_norm": 0.11157164722681046, + "grad_norm": 0.09854763746261597, "learning_rate": 2.101645063179531e-05, - "loss": 0.0248, - "num_input_tokens_seen": 45816800, + "loss": 0.0243, + "num_input_tokens_seen": 46239200, "step": 6085 }, { "epoch": 1.654125636672326, - "grad_norm": 0.20105481147766113, + "grad_norm": 0.07024915516376495, "learning_rate": 2.0981348113717954e-05, - "loss": 0.023, - "num_input_tokens_seen": 45857296, + "loss": 0.0219, + "num_input_tokens_seen": 46280640, "step": 6090 }, { "epoch": 1.655483870967742, - "grad_norm": 0.07302982360124588, + "grad_norm": 0.07219861447811127, "learning_rate": 2.094625372667735e-05, - "loss": 0.0234, - "num_input_tokens_seen": 45897920, + "loss": 0.0219, + "num_input_tokens_seen": 46321664, "step": 6095 }, { "epoch": 1.656842105263158, - "grad_norm": 0.06398825347423553, + "grad_norm": 0.06369444727897644, "learning_rate": 2.091116754168082e-05, - "loss": 0.0207, - "num_input_tokens_seen": 45934768, + "loss": 0.0198, + "num_input_tokens_seen": 46358976, "step": 6100 }, { "epoch": 1.6582003395585738, - "grad_norm": 0.05927053466439247, + "grad_norm": 0.05098241940140724, "learning_rate": 2.0876089629719108e-05, - "loss": 0.0228, - "num_input_tokens_seen": 45975088, + "loss": 0.0218, + "num_input_tokens_seen": 46399584, "step": 6105 }, { "epoch": 1.6595585738539897, - "grad_norm": 0.06923488527536392, + "grad_norm": 0.06891950219869614, "learning_rate": 2.0841020061766216e-05, - "loss": 0.0252, - "num_input_tokens_seen": 46012384, + "loss": 0.024, + "num_input_tokens_seen": 46437216, "step": 6110 }, { "epoch": 1.660916808149406, - "grad_norm": 0.06517381966114044, + "grad_norm": 0.08127767592668533, "learning_rate": 2.0805958908779242e-05, - "loss": 0.0207, - "num_input_tokens_seen": 46055376, + "loss": 0.0206, + "num_input_tokens_seen": 46480544, "step": 6115 }, { "epoch": 1.6622750424448216, - "grad_norm": 0.06465736031532288, + "grad_norm": 0.060630496591329575, "learning_rate": 2.0770906241698295e-05, - "loss": 0.0219, - "num_input_tokens_seen": 46095408, + "loss": 0.0208, + "num_input_tokens_seen": 46520928, "step": 6120 }, { "epoch": 1.6636332767402378, - "grad_norm": 0.08297889679670334, + "grad_norm": 0.08422663062810898, "learning_rate": 2.0735862131446288e-05, - "loss": 0.0247, - "num_input_tokens_seen": 46133952, + "loss": 0.0231, + "num_input_tokens_seen": 46559840, "step": 6125 }, { "epoch": 1.6649915110356537, - "grad_norm": 0.058604106307029724, + "grad_norm": 0.056205444037914276, "learning_rate": 2.070082664892883e-05, - "loss": 0.0224, - "num_input_tokens_seen": 46173104, + "loss": 0.0214, + "num_input_tokens_seen": 46599296, "step": 6130 }, { "epoch": 1.6663497453310696, - "grad_norm": 0.06325851380825043, + "grad_norm": 0.05494374781847, "learning_rate": 2.066579986503406e-05, - "loss": 0.0212, - "num_input_tokens_seen": 46210368, + "loss": 0.02, + "num_input_tokens_seen": 46636928, "step": 6135 }, { "epoch": 1.6677079796264855, - "grad_norm": 0.10435193032026291, + "grad_norm": 0.07366279512643814, "learning_rate": 2.0630781850632533e-05, - "loss": 0.0234, - "num_input_tokens_seen": 46253168, + "loss": 0.0226, + "num_input_tokens_seen": 46680016, "step": 6140 }, { "epoch": 1.6690662139219015, - "grad_norm": 0.0600326769053936, + "grad_norm": 0.05606800317764282, "learning_rate": 2.0595772676577054e-05, - "loss": 0.0197, - "num_input_tokens_seen": 46288832, + "loss": 0.0188, + "num_input_tokens_seen": 46715824, "step": 6145 }, { "epoch": 1.6704244482173176, - "grad_norm": 0.06828299909830093, + "grad_norm": 0.06256142258644104, "learning_rate": 2.0560772413702545e-05, - "loss": 0.0226, - "num_input_tokens_seen": 46330960, + "loss": 0.0214, + "num_input_tokens_seen": 46758416, "step": 6150 }, { "epoch": 1.6717826825127333, - "grad_norm": 0.06123996898531914, + "grad_norm": 0.05996181443333626, "learning_rate": 2.052578113282589e-05, - "loss": 0.0209, - "num_input_tokens_seen": 46365296, + "loss": 0.0196, + "num_input_tokens_seen": 46793184, "step": 6155 }, { "epoch": 1.6731409168081495, - "grad_norm": 0.06523648649454117, + "grad_norm": 0.06338190287351608, "learning_rate": 2.0490798904745813e-05, - "loss": 0.0224, - "num_input_tokens_seen": 46407184, + "loss": 0.0214, + "num_input_tokens_seen": 46835456, "step": 6160 }, { "epoch": 1.6744991511035654, - "grad_norm": 0.055404286831617355, + "grad_norm": 0.05405982583761215, "learning_rate": 2.04558258002427e-05, - "loss": 0.0248, - "num_input_tokens_seen": 46447584, + "loss": 0.0242, + "num_input_tokens_seen": 46875824, "step": 6165 }, { "epoch": 1.6758573853989813, - "grad_norm": 0.06958398222923279, + "grad_norm": 0.0894760936498642, "learning_rate": 2.04208618900785e-05, - "loss": 0.0214, - "num_input_tokens_seen": 46481328, + "loss": 0.0212, + "num_input_tokens_seen": 46909904, "step": 6170 }, { "epoch": 1.6772156196943973, - "grad_norm": 0.07731959968805313, + "grad_norm": 0.07980603724718094, "learning_rate": 2.0385907244996545e-05, - "loss": 0.0247, - "num_input_tokens_seen": 46518080, + "loss": 0.024, + "num_input_tokens_seen": 46946464, "step": 6175 }, { "epoch": 1.6785738539898132, - "grad_norm": 0.06248604878783226, + "grad_norm": 0.06450074166059494, "learning_rate": 2.0350961935721426e-05, - "loss": 0.0187, - "num_input_tokens_seen": 46556576, + "loss": 0.0181, + "num_input_tokens_seen": 46985632, "step": 6180 }, { "epoch": 1.6799320882852293, - "grad_norm": 0.10714106261730194, + "grad_norm": 0.0625942125916481, "learning_rate": 2.0316026032958836e-05, - "loss": 0.0226, - "num_input_tokens_seen": 46594736, + "loss": 0.0218, + "num_input_tokens_seen": 47024048, "step": 6185 }, { "epoch": 1.681290322580645, - "grad_norm": 0.05643761530518532, + "grad_norm": 0.05469739809632301, "learning_rate": 2.028109960739545e-05, - "loss": 0.0185, - "num_input_tokens_seen": 46630992, + "loss": 0.0181, + "num_input_tokens_seen": 47060336, "step": 6190 }, { "epoch": 1.6826485568760612, - "grad_norm": 0.0658479779958725, + "grad_norm": 0.061180293560028076, "learning_rate": 2.0246182729698755e-05, - "loss": 0.0259, - "num_input_tokens_seen": 46666032, + "loss": 0.0246, + "num_input_tokens_seen": 47095440, "step": 6195 }, { "epoch": 1.6840067911714771, - "grad_norm": 0.06653279066085815, + "grad_norm": 0.06307143718004227, "learning_rate": 2.021127547051693e-05, - "loss": 0.0217, - "num_input_tokens_seen": 46699552, + "loss": 0.0207, + "num_input_tokens_seen": 47129456, "step": 6200 }, { "epoch": 1.685365025466893, - "grad_norm": 0.05504164844751358, + "grad_norm": 0.05172323063015938, "learning_rate": 2.0176377900478686e-05, - "loss": 0.0219, - "num_input_tokens_seen": 46737152, + "loss": 0.0209, + "num_input_tokens_seen": 47167840, "step": 6205 }, { "epoch": 1.686723259762309, - "grad_norm": 0.07203247398138046, + "grad_norm": 0.07302428036928177, "learning_rate": 2.0141490090193125e-05, - "loss": 0.0221, - "num_input_tokens_seen": 46774752, + "loss": 0.0214, + "num_input_tokens_seen": 47205648, "step": 6210 }, { "epoch": 1.688081494057725, - "grad_norm": 0.12581218779087067, + "grad_norm": 0.06473372876644135, "learning_rate": 2.010661211024961e-05, - "loss": 0.0223, - "num_input_tokens_seen": 46810896, + "loss": 0.0213, + "num_input_tokens_seen": 47241760, "step": 6215 }, { "epoch": 1.689439728353141, - "grad_norm": 0.07001473009586334, + "grad_norm": 0.06354104727506638, "learning_rate": 2.0071744031217613e-05, - "loss": 0.0211, - "num_input_tokens_seen": 46843328, + "loss": 0.0198, + "num_input_tokens_seen": 47274720, "step": 6220 }, { "epoch": 1.6907979626485568, - "grad_norm": 0.05632656440138817, + "grad_norm": 0.056263722479343414, "learning_rate": 2.003688592364657e-05, - "loss": 0.0207, - "num_input_tokens_seen": 46881616, + "loss": 0.0202, + "num_input_tokens_seen": 47313728, "step": 6225 }, { "epoch": 1.692156196943973, - "grad_norm": 0.0708954706788063, + "grad_norm": 0.06569759547710419, "learning_rate": 2.000203785806575e-05, - "loss": 0.0219, - "num_input_tokens_seen": 46923504, + "loss": 0.0212, + "num_input_tokens_seen": 47356160, "step": 6230 }, { "epoch": 1.6935144312393888, - "grad_norm": 0.07583509385585785, + "grad_norm": 0.07358380407094955, "learning_rate": 1.9967199904984087e-05, - "loss": 0.0235, - "num_input_tokens_seen": 46958688, + "loss": 0.0232, + "num_input_tokens_seen": 47392128, "step": 6235 }, { "epoch": 1.6948726655348048, - "grad_norm": 0.06593821942806244, + "grad_norm": 0.06224498152732849, "learning_rate": 1.9932372134890077e-05, - "loss": 0.021, - "num_input_tokens_seen": 46998960, + "loss": 0.0198, + "num_input_tokens_seen": 47432672, "step": 6240 }, { "epoch": 1.6962308998302207, - "grad_norm": 0.0587056428194046, + "grad_norm": 0.056039683520793915, "learning_rate": 1.989755461825159e-05, - "loss": 0.0253, - "num_input_tokens_seen": 47037392, + "loss": 0.0241, + "num_input_tokens_seen": 47471888, "step": 6245 }, { "epoch": 1.6975891341256366, - "grad_norm": 0.059191226959228516, + "grad_norm": 0.11455528438091278, "learning_rate": 1.986274742551576e-05, - "loss": 0.0235, - "num_input_tokens_seen": 47076912, + "loss": 0.0232, + "num_input_tokens_seen": 47511824, "step": 6250 }, { "epoch": 1.6989473684210528, - "grad_norm": 0.14773398637771606, + "grad_norm": 0.10780948400497437, "learning_rate": 1.982795062710884e-05, - "loss": 0.0235, - "num_input_tokens_seen": 47111760, + "loss": 0.0222, + "num_input_tokens_seen": 47547120, "step": 6255 }, { "epoch": 1.7003056027164685, - "grad_norm": 0.10469520092010498, + "grad_norm": 0.050327423959970474, "learning_rate": 1.9793164293436032e-05, - "loss": 0.0183, - "num_input_tokens_seen": 47147616, + "loss": 0.0168, + "num_input_tokens_seen": 47583488, "step": 6260 }, { "epoch": 1.7016638370118846, - "grad_norm": 0.09306786954402924, + "grad_norm": 0.0792776420712471, "learning_rate": 1.9758388494881392e-05, - "loss": 0.024, - "num_input_tokens_seen": 47183296, + "loss": 0.0241, + "num_input_tokens_seen": 47619312, "step": 6265 }, { "epoch": 1.7030220713073005, - "grad_norm": 0.06894327700138092, + "grad_norm": 0.07312720268964767, "learning_rate": 1.9723623301807632e-05, - "loss": 0.0242, - "num_input_tokens_seen": 47222592, + "loss": 0.0224, + "num_input_tokens_seen": 47658096, "step": 6270 }, { "epoch": 1.7043803056027165, - "grad_norm": 0.07884909957647324, + "grad_norm": 0.07055926322937012, "learning_rate": 1.968886878455602e-05, - "loss": 0.0283, - "num_input_tokens_seen": 47255856, + "loss": 0.0271, + "num_input_tokens_seen": 47691952, "step": 6275 }, { "epoch": 1.7057385398981324, - "grad_norm": 0.06651686131954193, + "grad_norm": 0.07367575913667679, "learning_rate": 1.965412501344622e-05, - "loss": 0.0228, - "num_input_tokens_seen": 47291520, + "loss": 0.0219, + "num_input_tokens_seen": 47727856, "step": 6280 }, { "epoch": 1.7070967741935483, - "grad_norm": 0.0753357782959938, + "grad_norm": 0.06040302291512489, "learning_rate": 1.9619392058776167e-05, - "loss": 0.0248, - "num_input_tokens_seen": 47332384, + "loss": 0.0239, + "num_input_tokens_seen": 47769248, "step": 6285 }, { "epoch": 1.7084550084889645, - "grad_norm": 0.04835745319724083, + "grad_norm": 0.08292552083730698, "learning_rate": 1.9584669990821887e-05, - "loss": 0.0242, - "num_input_tokens_seen": 47374064, + "loss": 0.0237, + "num_input_tokens_seen": 47811376, "step": 6290 }, { "epoch": 1.7098132427843802, - "grad_norm": 0.06652335822582245, + "grad_norm": 0.0608774870634079, "learning_rate": 1.9549958879837394e-05, - "loss": 0.0218, - "num_input_tokens_seen": 47406672, + "loss": 0.0207, + "num_input_tokens_seen": 47844208, "step": 6295 }, { "epoch": 1.7111714770797963, - "grad_norm": 0.05564892292022705, + "grad_norm": 0.05462099611759186, "learning_rate": 1.9515258796054525e-05, - "loss": 0.0192, - "num_input_tokens_seen": 47443936, + "loss": 0.0187, + "num_input_tokens_seen": 47881536, "step": 6300 }, { "epoch": 1.7125297113752123, - "grad_norm": 0.09786105155944824, + "grad_norm": 0.05520470440387726, "learning_rate": 1.9480569809682812e-05, - "loss": 0.0196, - "num_input_tokens_seen": 47482128, + "loss": 0.0188, + "num_input_tokens_seen": 47920144, "step": 6305 }, { "epoch": 1.7138879456706282, - "grad_norm": 0.06839790940284729, + "grad_norm": 0.061104074120521545, "learning_rate": 1.9445891990909335e-05, - "loss": 0.0236, - "num_input_tokens_seen": 47520576, + "loss": 0.0228, + "num_input_tokens_seen": 47958976, "step": 6310 }, { "epoch": 1.7152461799660441, - "grad_norm": 0.058803826570510864, + "grad_norm": 0.0640842542052269, "learning_rate": 1.941122540989857e-05, - "loss": 0.0177, - "num_input_tokens_seen": 47559184, + "loss": 0.0176, + "num_input_tokens_seen": 47997968, "step": 6315 }, { "epoch": 1.71660441426146, - "grad_norm": 0.0820888802409172, + "grad_norm": 0.09331648796796799, "learning_rate": 1.937657013679225e-05, - "loss": 0.0241, - "num_input_tokens_seen": 47590992, + "loss": 0.0236, + "num_input_tokens_seen": 48030416, "step": 6320 }, { "epoch": 1.7179626485568762, - "grad_norm": 0.05401637777686119, + "grad_norm": 0.05289526656270027, "learning_rate": 1.934192624170925e-05, - "loss": 0.0219, - "num_input_tokens_seen": 47630048, + "loss": 0.0218, + "num_input_tokens_seen": 48069744, "step": 6325 }, { "epoch": 1.719320882852292, - "grad_norm": 0.06732040643692017, + "grad_norm": 0.07099978625774384, "learning_rate": 1.9307293794745422e-05, - "loss": 0.0258, - "num_input_tokens_seen": 47661328, + "loss": 0.0244, + "num_input_tokens_seen": 48101568, "step": 6330 }, { "epoch": 1.720679117147708, - "grad_norm": 0.08160870522260666, + "grad_norm": 0.072075255215168, "learning_rate": 1.9272672865973413e-05, - "loss": 0.0238, - "num_input_tokens_seen": 47694816, + "loss": 0.0222, + "num_input_tokens_seen": 48135440, "step": 6335 }, { "epoch": 1.722037351443124, - "grad_norm": 0.058661170303821564, + "grad_norm": 0.05409224331378937, "learning_rate": 1.923806352544261e-05, - "loss": 0.025, - "num_input_tokens_seen": 47733296, + "loss": 0.0243, + "num_input_tokens_seen": 48174192, "step": 6340 }, { "epoch": 1.72339558573854, - "grad_norm": 0.059246569871902466, + "grad_norm": 0.05239759385585785, "learning_rate": 1.9203465843178945e-05, - "loss": 0.0238, - "num_input_tokens_seen": 47771824, + "loss": 0.0237, + "num_input_tokens_seen": 48212688, "step": 6345 }, { "epoch": 1.7247538200339558, - "grad_norm": 0.054253801703453064, + "grad_norm": 0.05297086387872696, "learning_rate": 1.916887988918475e-05, "loss": 0.0167, - "num_input_tokens_seen": 47807696, + "num_input_tokens_seen": 48249392, "step": 6350 }, { "epoch": 1.7261120543293718, - "grad_norm": 0.08045604825019836, + "grad_norm": 0.07605672627687454, "learning_rate": 1.913430573343863e-05, - "loss": 0.0245, - "num_input_tokens_seen": 47844560, + "loss": 0.0236, + "num_input_tokens_seen": 48286480, "step": 6355 }, { "epoch": 1.727470288624788, - "grad_norm": 0.06397169083356857, + "grad_norm": 0.06141950562596321, "learning_rate": 1.909974344589533e-05, - "loss": 0.0199, - "num_input_tokens_seen": 47883104, + "loss": 0.0197, + "num_input_tokens_seen": 48325600, "step": 6360 }, { "epoch": 1.7288285229202036, - "grad_norm": 0.05664176866412163, + "grad_norm": 0.05634889379143715, "learning_rate": 1.9065193096485563e-05, - "loss": 0.0232, - "num_input_tokens_seen": 47920496, + "loss": 0.0216, + "num_input_tokens_seen": 48363200, "step": 6365 }, { "epoch": 1.7301867572156198, - "grad_norm": 0.06017405912280083, + "grad_norm": 0.054491329938173294, "learning_rate": 1.90306547551159e-05, - "loss": 0.023, - "num_input_tokens_seen": 47957200, + "loss": 0.022, + "num_input_tokens_seen": 48400160, "step": 6370 }, { "epoch": 1.7315449915110357, - "grad_norm": 0.05232442915439606, + "grad_norm": 0.055698785930871964, "learning_rate": 1.899612849166861e-05, - "loss": 0.0257, - "num_input_tokens_seen": 47990256, + "loss": 0.0247, + "num_input_tokens_seen": 48434064, "step": 6375 }, { "epoch": 1.7329032258064516, - "grad_norm": 0.047369834035634995, + "grad_norm": 0.05116735026240349, "learning_rate": 1.8961614376001537e-05, - "loss": 0.0228, - "num_input_tokens_seen": 48028448, + "loss": 0.0212, + "num_input_tokens_seen": 48471920, "step": 6380 }, { "epoch": 1.7342614601018675, - "grad_norm": 0.05333266034722328, + "grad_norm": 0.05348145216703415, "learning_rate": 1.892711247794793e-05, - "loss": 0.0203, - "num_input_tokens_seen": 48069168, + "loss": 0.0196, + "num_input_tokens_seen": 48513008, "step": 6385 }, { "epoch": 1.7356196943972835, - "grad_norm": 0.07257471978664398, + "grad_norm": 0.07355861365795135, "learning_rate": 1.8892622867316316e-05, - "loss": 0.023, - "num_input_tokens_seen": 48108352, + "loss": 0.0224, + "num_input_tokens_seen": 48552672, "step": 6390 }, { "epoch": 1.7369779286926996, - "grad_norm": 0.07028140872716904, + "grad_norm": 0.07027094811201096, "learning_rate": 1.8858145613890382e-05, - "loss": 0.0215, - "num_input_tokens_seen": 48147408, + "loss": 0.0214, + "num_input_tokens_seen": 48591552, "step": 6395 }, { "epoch": 1.7383361629881153, - "grad_norm": 0.059473246335983276, + "grad_norm": 0.05680019035935402, "learning_rate": 1.8823680787428803e-05, - "loss": 0.0226, - "num_input_tokens_seen": 48180400, + "loss": 0.0215, + "num_input_tokens_seen": 48624720, "step": 6400 }, { "epoch": 1.7396943972835315, - "grad_norm": 0.07126529514789581, + "grad_norm": 0.07357285916805267, "learning_rate": 1.8789228457665088e-05, - "loss": 0.0219, - "num_input_tokens_seen": 48218064, + "loss": 0.0208, + "num_input_tokens_seen": 48662256, "step": 6405 }, { "epoch": 1.7410526315789474, - "grad_norm": 0.052097707986831665, + "grad_norm": 0.05064487084746361, "learning_rate": 1.8754788694307482e-05, - "loss": 0.021, - "num_input_tokens_seen": 48252032, + "loss": 0.0208, + "num_input_tokens_seen": 48696960, "step": 6410 }, { "epoch": 1.7424108658743633, - "grad_norm": 0.060646429657936096, + "grad_norm": 0.10876769572496414, "learning_rate": 1.8720361567038808e-05, - "loss": 0.0222, - "num_input_tokens_seen": 48287200, + "loss": 0.0214, + "num_input_tokens_seen": 48732288, "step": 6415 }, { "epoch": 1.7437691001697793, - "grad_norm": 0.054863572120666504, + "grad_norm": 0.055469512939453125, "learning_rate": 1.868594714551632e-05, - "loss": 0.0237, - "num_input_tokens_seen": 48328032, + "loss": 0.0231, + "num_input_tokens_seen": 48773648, "step": 6420 }, { "epoch": 1.7451273344651952, - "grad_norm": 0.0578652061522007, + "grad_norm": 0.05542069673538208, "learning_rate": 1.865154549937155e-05, - "loss": 0.0221, - "num_input_tokens_seen": 48365296, + "loss": 0.021, + "num_input_tokens_seen": 48810960, "step": 6425 }, { "epoch": 1.7464855687606113, - "grad_norm": 0.06459280103445053, + "grad_norm": 0.06563257426023483, "learning_rate": 1.8617156698210192e-05, - "loss": 0.0217, - "num_input_tokens_seen": 48403424, + "loss": 0.021, + "num_input_tokens_seen": 48849344, "step": 6430 }, { "epoch": 1.747843803056027, - "grad_norm": 0.0934501439332962, + "grad_norm": 0.0805174857378006, "learning_rate": 1.8582780811611954e-05, - "loss": 0.0212, - "num_input_tokens_seen": 48436608, + "loss": 0.0201, + "num_input_tokens_seen": 48883008, "step": 6435 }, { "epoch": 1.7492020373514432, - "grad_norm": 0.1569691151380539, + "grad_norm": 0.05504951253533363, "learning_rate": 1.8548417909130406e-05, - "loss": 0.0238, - "num_input_tokens_seen": 48470096, + "loss": 0.0223, + "num_input_tokens_seen": 48916624, "step": 6440 }, { "epoch": 1.7505602716468591, - "grad_norm": 0.10178361088037491, + "grad_norm": 0.06241302564740181, "learning_rate": 1.8514068060292854e-05, - "loss": 0.0203, - "num_input_tokens_seen": 48506064, + "loss": 0.0198, + "num_input_tokens_seen": 48952880, "step": 6445 }, { "epoch": 1.751918505942275, - "grad_norm": 0.06947403401136398, + "grad_norm": 0.06562935560941696, "learning_rate": 1.847973133460018e-05, - "loss": 0.0252, - "num_input_tokens_seen": 48546896, + "loss": 0.0246, + "num_input_tokens_seen": 48993984, "step": 6450 }, { "epoch": 1.753276740237691, - "grad_norm": 0.05491937696933746, + "grad_norm": 0.05664669722318649, "learning_rate": 1.8445407801526733e-05, - "loss": 0.0209, - "num_input_tokens_seen": 48590592, + "loss": 0.0203, + "num_input_tokens_seen": 49038016, "step": 6455 }, { "epoch": 1.754634974533107, - "grad_norm": 0.07351076602935791, + "grad_norm": 0.054909318685531616, "learning_rate": 1.841109753052015e-05, - "loss": 0.0218, - "num_input_tokens_seen": 48635664, + "loss": 0.0214, + "num_input_tokens_seen": 49083392, "step": 6460 }, { "epoch": 1.755993208828523, - "grad_norm": 0.06625485420227051, + "grad_norm": 0.06568380445241928, "learning_rate": 1.8376800591001254e-05, - "loss": 0.0222, - "num_input_tokens_seen": 48671152, + "loss": 0.0208, + "num_input_tokens_seen": 49118976, "step": 6465 }, { "epoch": 1.7573514431239388, - "grad_norm": 0.05678631737828255, + "grad_norm": 0.056140609085559845, "learning_rate": 1.8342517052363857e-05, - "loss": 0.0226, - "num_input_tokens_seen": 48712192, + "loss": 0.0222, + "num_input_tokens_seen": 49160208, "step": 6470 }, { "epoch": 1.758709677419355, - "grad_norm": 0.06484083831310272, + "grad_norm": 0.063535675406456, "learning_rate": 1.8308246983974703e-05, - "loss": 0.0202, - "num_input_tokens_seen": 48744896, + "loss": 0.0195, + "num_input_tokens_seen": 49193568, "step": 6475 }, { "epoch": 1.7600679117147708, - "grad_norm": 0.06810230761766434, + "grad_norm": 0.06647848337888718, "learning_rate": 1.827399045517325e-05, - "loss": 0.021, - "num_input_tokens_seen": 48784544, + "loss": 0.0207, + "num_input_tokens_seen": 49233872, "step": 6480 }, { "epoch": 1.7614261460101868, - "grad_norm": 0.057892151176929474, + "grad_norm": 0.05483429506421089, "learning_rate": 1.823974753527158e-05, - "loss": 0.0246, - "num_input_tokens_seen": 48824368, + "loss": 0.0221, + "num_input_tokens_seen": 49274576, "step": 6485 }, { "epoch": 1.7627843803056027, - "grad_norm": 0.06382384151220322, + "grad_norm": 0.07061336934566498, "learning_rate": 1.8205518293554226e-05, - "loss": 0.0216, - "num_input_tokens_seen": 48855600, + "loss": 0.0208, + "num_input_tokens_seen": 49306128, "step": 6490 }, { "epoch": 1.7641426146010186, - "grad_norm": 0.05575373023748398, + "grad_norm": 0.10263277590274811, "learning_rate": 1.8171302799278052e-05, - "loss": 0.0213, - "num_input_tokens_seen": 48893328, + "loss": 0.022, + "num_input_tokens_seen": 49344272, "step": 6495 }, { "epoch": 1.7655008488964348, - "grad_norm": 0.07554890960454941, + "grad_norm": 0.06351672112941742, "learning_rate": 1.8137101121672108e-05, - "loss": 0.0236, - "num_input_tokens_seen": 48930288, + "loss": 0.0226, + "num_input_tokens_seen": 49381552, "step": 6500 }, { "epoch": 1.7668590831918505, - "grad_norm": 0.06260082125663757, + "grad_norm": 0.0671292245388031, "learning_rate": 1.8102913329937478e-05, - "loss": 0.0225, - "num_input_tokens_seen": 48967392, + "loss": 0.0217, + "num_input_tokens_seen": 49419200, "step": 6505 }, { "epoch": 1.7682173174872666, - "grad_norm": 0.05568086355924606, + "grad_norm": 0.055766455829143524, "learning_rate": 1.8068739493247165e-05, - "loss": 0.0205, - "num_input_tokens_seen": 49004320, + "loss": 0.0197, + "num_input_tokens_seen": 49456416, "step": 6510 }, { "epoch": 1.7695755517826826, - "grad_norm": 0.06968121975660324, + "grad_norm": 0.06835462152957916, "learning_rate": 1.8034579680745927e-05, - "loss": 0.0208, - "num_input_tokens_seen": 49037104, + "loss": 0.0216, + "num_input_tokens_seen": 49489568, "step": 6515 }, { "epoch": 1.7709337860780985, - "grad_norm": 0.06512192636728287, + "grad_norm": 0.0635710060596466, "learning_rate": 1.800043396155015e-05, - "loss": 0.0199, - "num_input_tokens_seen": 49079440, + "loss": 0.0197, + "num_input_tokens_seen": 49532416, "step": 6520 }, { "epoch": 1.7722920203735144, - "grad_norm": 0.11070317029953003, + "grad_norm": 0.05543075501918793, "learning_rate": 1.7966302404747704e-05, - "loss": 0.0211, - "num_input_tokens_seen": 49114240, + "loss": 0.0202, + "num_input_tokens_seen": 49567792, "step": 6525 }, { "epoch": 1.7736502546689303, - "grad_norm": 0.05814257636666298, + "grad_norm": 0.05593164265155792, "learning_rate": 1.7932185079397802e-05, - "loss": 0.0202, - "num_input_tokens_seen": 49150752, + "loss": 0.0197, + "num_input_tokens_seen": 49605056, "step": 6530 }, { "epoch": 1.7750084889643465, - "grad_norm": 0.06973344832658768, + "grad_norm": 0.0635877251625061, "learning_rate": 1.789808205453087e-05, - "loss": 0.022, - "num_input_tokens_seen": 49193280, + "loss": 0.0213, + "num_input_tokens_seen": 49647680, "step": 6535 }, { "epoch": 1.7763667232597622, - "grad_norm": 0.0639907568693161, + "grad_norm": 0.06712055951356888, "learning_rate": 1.786399339914838e-05, - "loss": 0.0209, - "num_input_tokens_seen": 49231024, + "loss": 0.02, + "num_input_tokens_seen": 49685200, "step": 6540 }, { "epoch": 1.7777249575551783, - "grad_norm": 0.06217236444354057, + "grad_norm": 0.06433555483818054, "learning_rate": 1.7829919182222752e-05, - "loss": 0.0246, - "num_input_tokens_seen": 49268992, + "loss": 0.0238, + "num_input_tokens_seen": 49723824, "step": 6545 }, { "epoch": 1.7790831918505943, - "grad_norm": 0.06269615143537521, + "grad_norm": 0.0525888167321682, "learning_rate": 1.779585947269718e-05, - "loss": 0.0208, - "num_input_tokens_seen": 49303888, + "loss": 0.0196, + "num_input_tokens_seen": 49758976, "step": 6550 }, { "epoch": 1.7804414261460102, - "grad_norm": 0.059671543538570404, + "grad_norm": 0.06161023676395416, "learning_rate": 1.7761814339485504e-05, - "loss": 0.0227, - "num_input_tokens_seen": 49340784, + "loss": 0.0215, + "num_input_tokens_seen": 49796752, "step": 6555 }, { "epoch": 1.7817996604414261, - "grad_norm": 0.05118735134601593, + "grad_norm": 0.049128443002700806, "learning_rate": 1.772778385147209e-05, - "loss": 0.0218, - "num_input_tokens_seen": 49378672, + "loss": 0.0221, + "num_input_tokens_seen": 49834672, "step": 6560 }, { "epoch": 1.783157894736842, - "grad_norm": 0.06125462427735329, + "grad_norm": 0.053119756281375885, "learning_rate": 1.7693768077511645e-05, - "loss": 0.0228, - "num_input_tokens_seen": 49420944, + "loss": 0.0214, + "num_input_tokens_seen": 49877200, "step": 6565 }, { "epoch": 1.7845161290322582, - "grad_norm": 0.11390368640422821, + "grad_norm": 0.0915680080652237, "learning_rate": 1.7659767086429117e-05, - "loss": 0.0245, - "num_input_tokens_seen": 49452096, + "loss": 0.0235, + "num_input_tokens_seen": 49909168, "step": 6570 }, { "epoch": 1.785874363327674, - "grad_norm": 0.08304750919342041, + "grad_norm": 0.06351496279239655, "learning_rate": 1.7625780947019554e-05, - "loss": 0.023, - "num_input_tokens_seen": 49492096, + "loss": 0.0227, + "num_input_tokens_seen": 49949808, "step": 6575 }, { "epoch": 1.78723259762309, - "grad_norm": 0.07931499183177948, + "grad_norm": 0.07798344641923904, "learning_rate": 1.7591809728047933e-05, - "loss": 0.0244, - "num_input_tokens_seen": 49534976, + "loss": 0.0235, + "num_input_tokens_seen": 49992816, "step": 6580 }, { "epoch": 1.788590831918506, - "grad_norm": 0.07373729348182678, + "grad_norm": 0.0681823343038559, "learning_rate": 1.755785349824906e-05, - "loss": 0.0231, - "num_input_tokens_seen": 49568976, + "loss": 0.0229, + "num_input_tokens_seen": 50027328, "step": 6585 }, { "epoch": 1.789949066213922, - "grad_norm": 0.08440768718719482, + "grad_norm": 0.0819878950715065, "learning_rate": 1.75239123263274e-05, - "loss": 0.0237, - "num_input_tokens_seen": 49606784, + "loss": 0.0225, + "num_input_tokens_seen": 50065616, "step": 6590 }, { "epoch": 1.7913073005093378, - "grad_norm": 0.06280064582824707, + "grad_norm": 0.062420692294836044, "learning_rate": 1.7489986280956965e-05, - "loss": 0.0203, - "num_input_tokens_seen": 49648176, + "loss": 0.0205, + "num_input_tokens_seen": 50106896, "step": 6595 }, { "epoch": 1.7926655348047538, - "grad_norm": 0.10720508545637131, + "grad_norm": 0.097614586353302, "learning_rate": 1.7456075430781155e-05, - "loss": 0.0191, - "num_input_tokens_seen": 49685328, + "loss": 0.019, + "num_input_tokens_seen": 50144064, "step": 6600 }, { "epoch": 1.79402376910017, - "grad_norm": 0.05949164927005768, + "grad_norm": 0.05531376227736473, "learning_rate": 1.7422179844412607e-05, - "loss": 0.0182, - "num_input_tokens_seen": 49723136, + "loss": 0.0178, + "num_input_tokens_seen": 50182304, "step": 6605 }, { "epoch": 1.7953820033955856, - "grad_norm": 0.07848730683326721, + "grad_norm": 0.07144606113433838, "learning_rate": 1.7388299590433106e-05, - "loss": 0.0229, - "num_input_tokens_seen": 49758416, + "loss": 0.0224, + "num_input_tokens_seen": 50217808, "step": 6610 }, { "epoch": 1.7967402376910018, - "grad_norm": 0.05873727798461914, + "grad_norm": 0.05558127909898758, "learning_rate": 1.7354434737393393e-05, - "loss": 0.0233, - "num_input_tokens_seen": 49797296, + "loss": 0.0223, + "num_input_tokens_seen": 50256592, "step": 6615 }, { "epoch": 1.7980984719864177, - "grad_norm": 0.14630505442619324, + "grad_norm": 0.07012637704610825, "learning_rate": 1.7320585353813055e-05, - "loss": 0.0251, - "num_input_tokens_seen": 49832848, + "loss": 0.0234, + "num_input_tokens_seen": 50292336, "step": 6620 }, { "epoch": 1.7994567062818336, - "grad_norm": 0.06748270243406296, + "grad_norm": 0.07306186109781265, "learning_rate": 1.7286751508180376e-05, "loss": 0.0197, - "num_input_tokens_seen": 49869520, + "num_input_tokens_seen": 50330000, "step": 6625 }, { "epoch": 1.8008149405772496, - "grad_norm": 0.06950581818819046, + "grad_norm": 0.0681864321231842, "learning_rate": 1.725293326895221e-05, - "loss": 0.0236, - "num_input_tokens_seen": 49906880, + "loss": 0.0234, + "num_input_tokens_seen": 50367696, "step": 6630 }, { "epoch": 1.8021731748726655, - "grad_norm": 0.06153281405568123, + "grad_norm": 0.05640508607029915, "learning_rate": 1.7219130704553828e-05, - "loss": 0.0237, - "num_input_tokens_seen": 49944096, + "loss": 0.0231, + "num_input_tokens_seen": 50405808, "step": 6635 }, { "epoch": 1.8035314091680816, - "grad_norm": 0.06984547525644302, + "grad_norm": 0.06432195752859116, "learning_rate": 1.718534388337878e-05, - "loss": 0.021, - "num_input_tokens_seen": 49982992, + "loss": 0.0207, + "num_input_tokens_seen": 50445024, "step": 6640 }, { "epoch": 1.8048896434634973, - "grad_norm": 0.0692206621170044, + "grad_norm": 0.06272155791521072, "learning_rate": 1.7151572873788774e-05, - "loss": 0.0226, - "num_input_tokens_seen": 50019760, + "loss": 0.0222, + "num_input_tokens_seen": 50483040, "step": 6645 }, { "epoch": 1.8062478777589135, - "grad_norm": 0.05928044393658638, + "grad_norm": 0.05431320518255234, "learning_rate": 1.7117817744113515e-05, - "loss": 0.0228, - "num_input_tokens_seen": 50061088, + "loss": 0.0225, + "num_input_tokens_seen": 50524976, "step": 6650 }, { "epoch": 1.8076061120543294, - "grad_norm": 0.06796359270811081, + "grad_norm": 0.06570173054933548, "learning_rate": 1.708407856265059e-05, - "loss": 0.0256, - "num_input_tokens_seen": 50100528, + "loss": 0.0232, + "num_input_tokens_seen": 50564624, "step": 6655 }, { "epoch": 1.8089643463497453, - "grad_norm": 0.11564414948225021, + "grad_norm": 0.05761904641985893, "learning_rate": 1.7050355397665308e-05, - "loss": 0.0226, - "num_input_tokens_seen": 50136000, + "loss": 0.0205, + "num_input_tokens_seen": 50600576, "step": 6660 }, { "epoch": 1.8103225806451613, - "grad_norm": 0.06873555481433868, + "grad_norm": 0.06520888209342957, "learning_rate": 1.701664831739057e-05, - "loss": 0.02, - "num_input_tokens_seen": 50170480, + "loss": 0.0196, + "num_input_tokens_seen": 50635424, "step": 6665 }, { "epoch": 1.8116808149405772, - "grad_norm": 0.09888514876365662, + "grad_norm": 0.06567588448524475, "learning_rate": 1.6982957390026748e-05, - "loss": 0.0212, - "num_input_tokens_seen": 50204304, + "loss": 0.02, + "num_input_tokens_seen": 50669344, "step": 6670 }, { "epoch": 1.8130390492359933, - "grad_norm": 0.05057258531451225, + "grad_norm": 0.04973582178354263, "learning_rate": 1.6949282683741513e-05, - "loss": 0.0215, - "num_input_tokens_seen": 50243104, + "loss": 0.0201, + "num_input_tokens_seen": 50708064, "step": 6675 }, { "epoch": 1.814397283531409, - "grad_norm": 0.05890742316842079, + "grad_norm": 0.05827637016773224, "learning_rate": 1.6915624266669716e-05, - "loss": 0.0233, - "num_input_tokens_seen": 50281152, + "loss": 0.0225, + "num_input_tokens_seen": 50746688, "step": 6680 }, { "epoch": 1.8157555178268252, - "grad_norm": 0.057754117995500565, + "grad_norm": 0.05299776419997215, "learning_rate": 1.6881982206913265e-05, - "loss": 0.0194, - "num_input_tokens_seen": 50322688, + "loss": 0.0185, + "num_input_tokens_seen": 50788832, "step": 6685 }, { "epoch": 1.8171137521222411, - "grad_norm": 0.060516826808452606, + "grad_norm": 0.07037771493196487, "learning_rate": 1.6848356572540963e-05, - "loss": 0.0223, - "num_input_tokens_seen": 50358032, + "loss": 0.022, + "num_input_tokens_seen": 50824816, "step": 6690 }, { "epoch": 1.818471986417657, - "grad_norm": 0.04983174428343773, + "grad_norm": 0.05225036293268204, "learning_rate": 1.6814747431588378e-05, - "loss": 0.0223, - "num_input_tokens_seen": 50394256, + "loss": 0.0216, + "num_input_tokens_seen": 50860848, "step": 6695 }, { "epoch": 1.819830220713073, - "grad_norm": 0.06562423706054688, + "grad_norm": 0.06369852274656296, "learning_rate": 1.6781154852057705e-05, - "loss": 0.0218, - "num_input_tokens_seen": 50431616, + "loss": 0.0213, + "num_input_tokens_seen": 50898800, "step": 6700 }, { "epoch": 1.821188455008489, - "grad_norm": 0.05733059346675873, + "grad_norm": 0.058175861835479736, "learning_rate": 1.674757890191763e-05, - "loss": 0.0223, - "num_input_tokens_seen": 50469712, + "loss": 0.0219, + "num_input_tokens_seen": 50937312, "step": 6705 }, { "epoch": 1.822546689303905, - "grad_norm": 0.08202795684337616, + "grad_norm": 0.08628622442483902, "learning_rate": 1.6714019649103206e-05, - "loss": 0.0193, - "num_input_tokens_seen": 50508896, + "loss": 0.0188, + "num_input_tokens_seen": 50976800, "step": 6710 }, { "epoch": 1.8239049235993208, - "grad_norm": 0.06936032325029373, + "grad_norm": 0.06458728015422821, "learning_rate": 1.668047716151569e-05, - "loss": 0.0242, - "num_input_tokens_seen": 50541312, + "loss": 0.0227, + "num_input_tokens_seen": 51009152, "step": 6715 }, { "epoch": 1.825263157894737, - "grad_norm": 0.2017645388841629, + "grad_norm": 0.04750293493270874, "learning_rate": 1.6646951507022407e-05, - "loss": 0.0205, - "num_input_tokens_seen": 50578640, + "loss": 0.0199, + "num_input_tokens_seen": 51046768, "step": 6720 }, { "epoch": 1.8266213921901528, - "grad_norm": 0.08113709837198257, + "grad_norm": 0.0717410296201706, "learning_rate": 1.6613442753456638e-05, - "loss": 0.0211, - "num_input_tokens_seen": 50615328, + "loss": 0.0203, + "num_input_tokens_seen": 51083648, "step": 6725 }, { "epoch": 1.8279796264855688, - "grad_norm": 0.056701257824897766, + "grad_norm": 0.054074857383966446, "learning_rate": 1.6579950968617466e-05, - "loss": 0.0207, - "num_input_tokens_seen": 50652240, + "loss": 0.0196, + "num_input_tokens_seen": 51120976, "step": 6730 }, { "epoch": 1.8293378607809847, - "grad_norm": 0.06136125326156616, + "grad_norm": 0.06058571860194206, "learning_rate": 1.6546476220269647e-05, - "loss": 0.0209, - "num_input_tokens_seen": 50692720, + "loss": 0.02, + "num_input_tokens_seen": 51161456, "step": 6735 }, { "epoch": 1.8306960950764006, - "grad_norm": 0.15777438879013062, + "grad_norm": 0.10675826668739319, "learning_rate": 1.6513018576143447e-05, - "loss": 0.0227, - "num_input_tokens_seen": 50734384, + "loss": 0.0215, + "num_input_tokens_seen": 51203344, "step": 6740 }, { "epoch": 1.8320543293718168, - "grad_norm": 0.053264934569597244, + "grad_norm": 0.05028011277318001, "learning_rate": 1.647957810393454e-05, - "loss": 0.0209, - "num_input_tokens_seen": 50773216, + "loss": 0.021, + "num_input_tokens_seen": 51242848, "step": 6745 }, { "epoch": 1.8334125636672325, - "grad_norm": 0.06792760640382767, + "grad_norm": 0.05301205813884735, "learning_rate": 1.6446154871303852e-05, - "loss": 0.0236, - "num_input_tokens_seen": 50812592, + "loss": 0.0235, + "num_input_tokens_seen": 51282336, "step": 6750 }, { "epoch": 1.8347707979626486, - "grad_norm": 0.05595404654741287, + "grad_norm": 0.05482291430234909, "learning_rate": 1.641274894587743e-05, - "loss": 0.022, - "num_input_tokens_seen": 50847216, + "loss": 0.0211, + "num_input_tokens_seen": 51316896, "step": 6755 }, { "epoch": 1.8361290322580646, - "grad_norm": 0.054529160261154175, + "grad_norm": 0.05019862949848175, "learning_rate": 1.63793603952463e-05, - "loss": 0.0216, - "num_input_tokens_seen": 50883904, + "loss": 0.0207, + "num_input_tokens_seen": 51353984, "step": 6760 }, { "epoch": 1.8374872665534805, - "grad_norm": 0.05558116361498833, + "grad_norm": 0.0570211298763752, "learning_rate": 1.634598928696634e-05, - "loss": 0.0214, - "num_input_tokens_seen": 50924112, + "loss": 0.0212, + "num_input_tokens_seen": 51394752, "step": 6765 }, { "epoch": 1.8388455008488964, - "grad_norm": 0.051606666296720505, + "grad_norm": 0.18512773513793945, "learning_rate": 1.6312635688558114e-05, - "loss": 0.0214, - "num_input_tokens_seen": 50958528, + "loss": 0.0203, + "num_input_tokens_seen": 51429632, "step": 6770 }, { "epoch": 1.8402037351443123, - "grad_norm": 0.2778514623641968, + "grad_norm": 0.05840624123811722, "learning_rate": 1.6279299667506793e-05, - "loss": 0.0222, - "num_input_tokens_seen": 50997552, + "loss": 0.0211, + "num_input_tokens_seen": 51468736, "step": 6775 }, { "epoch": 1.8415619694397285, - "grad_norm": 0.06035924702882767, + "grad_norm": 0.06473814696073532, "learning_rate": 1.6245981291261953e-05, - "loss": 0.0253, - "num_input_tokens_seen": 51038752, + "loss": 0.0246, + "num_input_tokens_seen": 51510208, "step": 6780 }, { "epoch": 1.8429202037351442, - "grad_norm": 0.055022913962602615, + "grad_norm": 0.056781597435474396, "learning_rate": 1.6212680627237483e-05, - "loss": 0.0225, - "num_input_tokens_seen": 51073440, + "loss": 0.0216, + "num_input_tokens_seen": 51545536, "step": 6785 }, { "epoch": 1.8442784380305604, - "grad_norm": 0.05965486913919449, + "grad_norm": 0.0576360784471035, "learning_rate": 1.617939774281143e-05, - "loss": 0.0241, - "num_input_tokens_seen": 51113376, + "loss": 0.0239, + "num_input_tokens_seen": 51585744, "step": 6790 }, { "epoch": 1.8456366723259763, - "grad_norm": 0.06445445865392685, + "grad_norm": 0.06504545360803604, "learning_rate": 1.6146132705325872e-05, - "loss": 0.0232, - "num_input_tokens_seen": 51151712, + "loss": 0.0225, + "num_input_tokens_seen": 51624608, "step": 6795 }, { "epoch": 1.8469949066213922, - "grad_norm": 0.06630487740039825, + "grad_norm": 0.06570043414831161, "learning_rate": 1.6112885582086773e-05, - "loss": 0.0234, - "num_input_tokens_seen": 51188512, + "loss": 0.0221, + "num_input_tokens_seen": 51661888, "step": 6800 }, { "epoch": 1.8483531409168081, - "grad_norm": 0.055931299924850464, + "grad_norm": 0.05429806932806969, "learning_rate": 1.6079656440363865e-05, - "loss": 0.0252, - "num_input_tokens_seen": 51224464, + "loss": 0.0249, + "num_input_tokens_seen": 51698784, "step": 6805 }, { "epoch": 1.849711375212224, - "grad_norm": 0.059301793575286865, + "grad_norm": 0.06328245252370834, "learning_rate": 1.6046445347390454e-05, - "loss": 0.0217, - "num_input_tokens_seen": 51261904, + "loss": 0.0213, + "num_input_tokens_seen": 51736368, "step": 6810 }, { "epoch": 1.8510696095076402, - "grad_norm": 0.12036537379026413, + "grad_norm": 0.0558626763522625, "learning_rate": 1.601325237036338e-05, - "loss": 0.0215, - "num_input_tokens_seen": 51297072, + "loss": 0.0205, + "num_input_tokens_seen": 51771520, "step": 6815 }, { "epoch": 1.852427843803056, - "grad_norm": 0.06112348660826683, + "grad_norm": 0.0610220804810524, "learning_rate": 1.5980077576442794e-05, - "loss": 0.0208, - "num_input_tokens_seen": 51335664, + "loss": 0.02, + "num_input_tokens_seen": 51810496, "step": 6820 }, { "epoch": 1.853786078098472, - "grad_norm": 0.11574798077344894, + "grad_norm": 0.08432168513536453, "learning_rate": 1.5946921032752076e-05, - "loss": 0.0208, - "num_input_tokens_seen": 51374704, + "loss": 0.02, + "num_input_tokens_seen": 51849952, "step": 6825 }, { "epoch": 1.855144312393888, - "grad_norm": 0.0724540650844574, + "grad_norm": 0.069209024310112, "learning_rate": 1.5913782806377674e-05, - "loss": 0.0198, - "num_input_tokens_seen": 51415072, + "loss": 0.0191, + "num_input_tokens_seen": 51890576, "step": 6830 }, { "epoch": 1.856502546689304, - "grad_norm": 0.06270474195480347, + "grad_norm": 0.07066139578819275, "learning_rate": 1.588066296436897e-05, - "loss": 0.0207, - "num_input_tokens_seen": 51454976, + "loss": 0.0203, + "num_input_tokens_seen": 51930928, "step": 6835 }, { "epoch": 1.8578607809847199, - "grad_norm": 0.06559880822896957, + "grad_norm": 0.07974623143672943, "learning_rate": 1.5847561573738147e-05, "loss": 0.0225, - "num_input_tokens_seen": 51490400, + "num_input_tokens_seen": 51966496, "step": 6840 }, { "epoch": 1.8592190152801358, - "grad_norm": 0.05387277528643608, + "grad_norm": 0.06943505257368088, "learning_rate": 1.5814478701460065e-05, - "loss": 0.0224, - "num_input_tokens_seen": 51530832, + "loss": 0.0217, + "num_input_tokens_seen": 52007584, "step": 6845 }, { "epoch": 1.860577249575552, - "grad_norm": 0.07749956101179123, + "grad_norm": 0.09194477647542953, "learning_rate": 1.5781414414472106e-05, - "loss": 0.0227, - "num_input_tokens_seen": 51565040, + "loss": 0.0223, + "num_input_tokens_seen": 52041920, "step": 6850 }, { "epoch": 1.8619354838709676, - "grad_norm": 0.07207181304693222, + "grad_norm": 0.06561128795146942, "learning_rate": 1.5748368779674054e-05, - "loss": 0.0216, - "num_input_tokens_seen": 51605856, + "loss": 0.0212, + "num_input_tokens_seen": 52083344, "step": 6855 }, { "epoch": 1.8632937181663838, - "grad_norm": 0.07440678030252457, + "grad_norm": 0.07169898599386215, "learning_rate": 1.5715341863927952e-05, - "loss": 0.028, - "num_input_tokens_seen": 51645024, + "loss": 0.0265, + "num_input_tokens_seen": 52123472, "step": 6860 }, { "epoch": 1.8646519524617997, - "grad_norm": 0.05914171412587166, + "grad_norm": 0.05700104683637619, "learning_rate": 1.568233373405796e-05, - "loss": 0.0241, - "num_input_tokens_seen": 51682160, + "loss": 0.0235, + "num_input_tokens_seen": 52161120, "step": 6865 }, { "epoch": 1.8660101867572156, - "grad_norm": 0.05507180839776993, + "grad_norm": 0.05529894307255745, "learning_rate": 1.5649344456850256e-05, - "loss": 0.0195, - "num_input_tokens_seen": 51723184, + "loss": 0.019, + "num_input_tokens_seen": 52202608, "step": 6870 }, { "epoch": 1.8673684210526316, - "grad_norm": 0.0616757906973362, + "grad_norm": 0.061204228550195694, "learning_rate": 1.5616374099052823e-05, - "loss": 0.0234, - "num_input_tokens_seen": 51754736, + "loss": 0.0228, + "num_input_tokens_seen": 52234480, "step": 6875 }, { "epoch": 1.8687266553480475, - "grad_norm": 0.0618629977107048, + "grad_norm": 0.05946953222155571, "learning_rate": 1.5583422727375406e-05, - "loss": 0.0215, - "num_input_tokens_seen": 51790208, + "loss": 0.0211, + "num_input_tokens_seen": 52270560, "step": 6880 }, { "epoch": 1.8700848896434636, - "grad_norm": 0.11168353259563446, + "grad_norm": 0.06086380034685135, "learning_rate": 1.5550490408489322e-05, - "loss": 0.0237, - "num_input_tokens_seen": 51828416, + "loss": 0.0224, + "num_input_tokens_seen": 52309120, "step": 6885 }, { "epoch": 1.8714431239388793, - "grad_norm": 0.06446199864149094, + "grad_norm": 0.05908305197954178, "learning_rate": 1.551757720902734e-05, - "loss": 0.0237, - "num_input_tokens_seen": 51870016, + "loss": 0.023, + "num_input_tokens_seen": 52351056, "step": 6890 }, { "epoch": 1.8728013582342955, - "grad_norm": 0.06454633176326752, + "grad_norm": 0.060026511549949646, "learning_rate": 1.548468319558354e-05, - "loss": 0.0224, - "num_input_tokens_seen": 51904672, + "loss": 0.022, + "num_input_tokens_seen": 52385632, "step": 6895 }, { "epoch": 1.8741595925297114, - "grad_norm": 0.06271891295909882, + "grad_norm": 0.06655149161815643, "learning_rate": 1.5451808434713187e-05, - "loss": 0.0216, - "num_input_tokens_seen": 51942720, + "loss": 0.0204, + "num_input_tokens_seen": 52424208, "step": 6900 }, { "epoch": 1.8755178268251274, - "grad_norm": 0.06335651874542236, + "grad_norm": 0.07591118663549423, "learning_rate": 1.5418952992932588e-05, - "loss": 0.021, - "num_input_tokens_seen": 51976912, + "loss": 0.0205, + "num_input_tokens_seen": 52458848, "step": 6905 }, { "epoch": 1.8768760611205433, - "grad_norm": 0.08403119444847107, + "grad_norm": 0.053961992263793945, "learning_rate": 1.5386116936718963e-05, - "loss": 0.0219, - "num_input_tokens_seen": 52010688, + "loss": 0.0207, + "num_input_tokens_seen": 52493440, "step": 6910 }, { "epoch": 1.8782342954159592, - "grad_norm": 0.061412930488586426, + "grad_norm": 0.05218011885881424, "learning_rate": 1.5353300332510306e-05, - "loss": 0.0207, - "num_input_tokens_seen": 52048224, + "loss": 0.02, + "num_input_tokens_seen": 52531536, "step": 6915 }, { "epoch": 1.8795925297113754, - "grad_norm": 0.057810891419649124, + "grad_norm": 0.09047458320856094, "learning_rate": 1.532050324670526e-05, - "loss": 0.0236, - "num_input_tokens_seen": 52086640, + "loss": 0.0226, + "num_input_tokens_seen": 52570304, "step": 6920 }, { "epoch": 1.880950764006791, - "grad_norm": 0.06707277148962021, + "grad_norm": 0.06559126824140549, "learning_rate": 1.5287725745662966e-05, - "loss": 0.0237, - "num_input_tokens_seen": 52122512, + "loss": 0.0234, + "num_input_tokens_seen": 52606112, "step": 6925 }, { "epoch": 1.8823089983022072, - "grad_norm": 0.07093371450901031, + "grad_norm": 0.06673850119113922, "learning_rate": 1.5254967895702954e-05, - "loss": 0.0223, - "num_input_tokens_seen": 52163152, + "loss": 0.021, + "num_input_tokens_seen": 52647248, "step": 6930 }, { "epoch": 1.8836672325976231, - "grad_norm": 0.20037858188152313, + "grad_norm": 0.06524866074323654, "learning_rate": 1.5222229763104983e-05, - "loss": 0.0283, - "num_input_tokens_seen": 52200240, + "loss": 0.025, + "num_input_tokens_seen": 52684688, "step": 6935 }, { "epoch": 1.885025466893039, - "grad_norm": 0.06738494336605072, + "grad_norm": 0.0685221254825592, "learning_rate": 1.5189511414108903e-05, - "loss": 0.0219, - "num_input_tokens_seen": 52239744, + "loss": 0.0214, + "num_input_tokens_seen": 52724640, "step": 6940 }, { "epoch": 1.886383701188455, - "grad_norm": 0.07329879701137543, + "grad_norm": 0.05598802492022514, "learning_rate": 1.5156812914914564e-05, - "loss": 0.0234, - "num_input_tokens_seen": 52279824, + "loss": 0.0217, + "num_input_tokens_seen": 52765488, "step": 6945 }, { "epoch": 1.887741935483871, - "grad_norm": 0.06737591326236725, + "grad_norm": 0.06245255470275879, "learning_rate": 1.5124134331681634e-05, - "loss": 0.0236, - "num_input_tokens_seen": 52315600, + "loss": 0.0223, + "num_input_tokens_seen": 52801440, "step": 6950 }, { "epoch": 1.889100169779287, - "grad_norm": 0.15183807909488678, + "grad_norm": 0.05997039005160332, "learning_rate": 1.5091475730529492e-05, - "loss": 0.0208, - "num_input_tokens_seen": 52357584, + "loss": 0.0203, + "num_input_tokens_seen": 52843952, "step": 6955 }, { "epoch": 1.8904584040747028, - "grad_norm": 0.05592288076877594, + "grad_norm": 0.09911610186100006, "learning_rate": 1.5058837177537088e-05, - "loss": 0.0231, - "num_input_tokens_seen": 52394464, + "loss": 0.0226, + "num_input_tokens_seen": 52881440, "step": 6960 }, { "epoch": 1.891816638370119, - "grad_norm": 0.11332418769598007, + "grad_norm": 0.05058996379375458, "learning_rate": 1.5026218738742803e-05, - "loss": 0.0223, - "num_input_tokens_seen": 52431184, + "loss": 0.022, + "num_input_tokens_seen": 52918752, "step": 6965 }, { "epoch": 1.8931748726655349, - "grad_norm": 0.17018726468086243, + "grad_norm": 0.06364253908395767, "learning_rate": 1.4993620480144322e-05, - "loss": 0.025, - "num_input_tokens_seen": 52468736, + "loss": 0.0233, + "num_input_tokens_seen": 52957008, "step": 6970 }, { "epoch": 1.8945331069609508, - "grad_norm": 0.08814197778701782, + "grad_norm": 0.06475406885147095, "learning_rate": 1.4961042467698503e-05, - "loss": 0.0222, - "num_input_tokens_seen": 52507168, + "loss": 0.0211, + "num_input_tokens_seen": 52995728, "step": 6975 }, { "epoch": 1.8958913412563667, - "grad_norm": 0.055439457297325134, + "grad_norm": 0.05265176296234131, "learning_rate": 1.4928484767321232e-05, - "loss": 0.0235, - "num_input_tokens_seen": 52540944, + "loss": 0.0226, + "num_input_tokens_seen": 53030512, "step": 6980 }, { "epoch": 1.8972495755517826, - "grad_norm": 0.07516860961914062, + "grad_norm": 0.06381529569625854, "learning_rate": 1.4895947444887304e-05, - "loss": 0.0256, - "num_input_tokens_seen": 52578128, + "loss": 0.0228, + "num_input_tokens_seen": 53067952, "step": 6985 }, { "epoch": 1.8986078098471988, - "grad_norm": 0.06383699923753738, + "grad_norm": 0.054354868829250336, "learning_rate": 1.486343056623028e-05, - "loss": 0.0229, - "num_input_tokens_seen": 52612128, + "loss": 0.0215, + "num_input_tokens_seen": 53102544, "step": 6990 }, { "epoch": 1.8999660441426145, - "grad_norm": 0.06671349704265594, + "grad_norm": 0.06441856920719147, "learning_rate": 1.4830934197142357e-05, - "loss": 0.0241, - "num_input_tokens_seen": 52652640, + "loss": 0.023, + "num_input_tokens_seen": 53143552, "step": 6995 }, { "epoch": 1.9013242784380306, - "grad_norm": 0.0735764428973198, + "grad_norm": 0.07909012585878372, "learning_rate": 1.4798458403374233e-05, - "loss": 0.0214, - "num_input_tokens_seen": 52692048, + "loss": 0.0206, + "num_input_tokens_seen": 53183232, "step": 7000 } ], "logging_steps": 5, "max_steps": 11043, - "num_input_tokens_seen": 52692048, + "num_input_tokens_seen": 53183232, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { @@ -11227,7 +11227,7 @@ "attributes": {} } }, - "total_flos": 2.3992198824438006e+18, + "total_flos": 2.421584896966263e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null