| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 7.934336525307797, | |
| "eval_steps": 500, | |
| "global_step": 17400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.022799817601459188, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 4.9225, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.045599635202918376, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 2.1478, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.06839945280437756, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6361, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09119927040583675, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 0.467, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11399908800729594, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 0.0001666666666666667, | |
| "loss": 0.4381, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.13679890560875513, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.3365, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.15959872321021432, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 0.00019988560970029743, | |
| "loss": 0.3255, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1823985408116735, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 0.00019977121940059484, | |
| "loss": 0.2946, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2051983584131327, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 0.00019965682910089226, | |
| "loss": 0.3023, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.22799817601459188, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 0.00019954243880118967, | |
| "loss": 0.2708, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2507979936160511, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 0.0001994280485014871, | |
| "loss": 0.3645, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.27359781121751026, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 0.00019931365820178448, | |
| "loss": 0.3072, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.29639762881896947, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 0.00019919926790208192, | |
| "loss": 0.2891, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.31919744642042863, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 0.00019908487760237934, | |
| "loss": 0.2763, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.34199726402188785, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 0.00019897048730267673, | |
| "loss": 0.2824, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.364797081623347, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 0.00019885609700297417, | |
| "loss": 0.3007, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3875968992248062, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 0.00019874170670327156, | |
| "loss": 0.2955, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4103967168262654, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 0.000198627316403569, | |
| "loss": 0.2385, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.4331965344277246, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 0.0001985129261038664, | |
| "loss": 0.2461, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.45599635202918376, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 0.0001983985358041638, | |
| "loss": 0.2567, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.478796169630643, | |
| "grad_norm": 1.25, | |
| "learning_rate": 0.00019828414550446125, | |
| "loss": 0.2189, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5015959872321022, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 0.00019816975520475864, | |
| "loss": 0.272, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.5243958048335613, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 0.00019805536490505606, | |
| "loss": 0.2615, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.5471956224350205, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 0.00019794097460535347, | |
| "loss": 0.2476, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5699954400364797, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 0.0001978265843056509, | |
| "loss": 0.2327, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5927952576379389, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 0.0001977121940059483, | |
| "loss": 0.2664, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.615595075239398, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 0.00019759780370624572, | |
| "loss": 0.2266, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6383948928408573, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 0.00019748341340654314, | |
| "loss": 0.2234, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6611947104423165, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 0.00019736902310684055, | |
| "loss": 0.2463, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.6839945280437757, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 0.00019725463280713797, | |
| "loss": 0.2311, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.7067943456452348, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 0.00019714024250743536, | |
| "loss": 0.2251, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.729594163246694, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 0.0001970258522077328, | |
| "loss": 0.2492, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7523939808481532, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 0.00019691146190803022, | |
| "loss": 0.2554, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7751937984496124, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 0.00019679707160832763, | |
| "loss": 0.2384, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7979936160510716, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 0.00019668268130862505, | |
| "loss": 0.2583, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.8207934336525308, | |
| "grad_norm": 0.625, | |
| "learning_rate": 0.00019656829100892244, | |
| "loss": 0.2343, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.84359325125399, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 0.00019645390070921988, | |
| "loss": 0.2193, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8663930688554492, | |
| "grad_norm": 0.875, | |
| "learning_rate": 0.00019633951040951727, | |
| "loss": 0.2212, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8891928864569083, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 0.00019622512010981468, | |
| "loss": 0.2178, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.9119927040583675, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 0.00019611072981011213, | |
| "loss": 0.2135, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.9347925216598267, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 0.00019599633951040952, | |
| "loss": 0.2522, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.957592339261286, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 0.00019588194921070696, | |
| "loss": 0.2138, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9803921568627451, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 0.00019576755891100435, | |
| "loss": 0.2043, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.0031919744642044, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 0.00019565316861130177, | |
| "loss": 0.1855, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.0259917920656634, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 0.00019553877831159918, | |
| "loss": 0.2099, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.0487916096671226, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 0.0001954243880118966, | |
| "loss": 0.2032, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.0715914272685818, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 0.000195309997712194, | |
| "loss": 0.1753, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.094391244870041, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 0.00019519560741249143, | |
| "loss": 0.1997, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.1171910624715002, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 0.00019508121711278885, | |
| "loss": 0.2101, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.1399908800729595, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 0.00019496682681308626, | |
| "loss": 0.1734, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.1627906976744187, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 0.00019485243651338368, | |
| "loss": 0.1716, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.1855905152758779, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 0.0001947380462136811, | |
| "loss": 0.1588, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.2083903328773369, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 0.0001946236559139785, | |
| "loss": 0.166, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.231190150478796, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 0.00019450926561427593, | |
| "loss": 0.1514, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.2539899680802553, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 0.00019439487531457334, | |
| "loss": 0.2103, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.2767897856817145, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 0.00019428048501487076, | |
| "loss": 0.185, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.2995896032831737, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 0.00019416609471516815, | |
| "loss": 0.1498, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.322389420884633, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 0.0001940517044154656, | |
| "loss": 0.1706, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.3451892384860922, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 0.00019393731411576298, | |
| "loss": 0.1579, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.3679890560875512, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 0.0001938229238160604, | |
| "loss": 0.1777, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.3907888736890106, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.00019370853351635784, | |
| "loss": 0.168, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.4135886912904696, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 0.00019359414321665523, | |
| "loss": 0.1325, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.4363885088919288, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 0.00019347975291695267, | |
| "loss": 0.1483, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.459188326493388, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 0.00019336536261725006, | |
| "loss": 0.1424, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.4819881440948472, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 0.00019325097231754748, | |
| "loss": 0.1352, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.5047879616963065, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 0.0001931365820178449, | |
| "loss": 0.1583, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.5275877792977655, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 0.0001930221917181423, | |
| "loss": 0.1496, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.550387596899225, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 0.00019290780141843972, | |
| "loss": 0.1551, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.573187414500684, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 0.00019279341111873714, | |
| "loss": 0.1275, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.5959872321021433, | |
| "grad_norm": 1.25, | |
| "learning_rate": 0.00019267902081903456, | |
| "loss": 0.1685, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.6187870497036023, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 0.00019256463051933197, | |
| "loss": 0.1271, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.6415868673050615, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 0.0001924502402196294, | |
| "loss": 0.1344, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.6643866849065208, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0001923358499199268, | |
| "loss": 0.1516, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.68718650250798, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.00019222145962022422, | |
| "loss": 0.1379, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.7099863201094392, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 0.00019210706932052164, | |
| "loss": 0.1396, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.7327861377108982, | |
| "grad_norm": 1.25, | |
| "learning_rate": 0.00019199267902081902, | |
| "loss": 0.1544, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.7555859553123576, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 0.00019187828872111647, | |
| "loss": 0.1639, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.7783857729138166, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 0.00019176389842141386, | |
| "loss": 0.1473, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.8011855905152758, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 0.0001916495081217113, | |
| "loss": 0.1606, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.823985408116735, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00019153511782200872, | |
| "loss": 0.155, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.8467852257181943, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 0.0001914207275223061, | |
| "loss": 0.1222, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.8695850433196535, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 0.00019130633722260355, | |
| "loss": 0.1476, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.8923848609211125, | |
| "grad_norm": 0.67578125, | |
| "learning_rate": 0.00019119194692290094, | |
| "loss": 0.1358, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.915184678522572, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 0.00019107755662319838, | |
| "loss": 0.1439, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.937984496124031, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 0.00019096316632349577, | |
| "loss": 0.1699, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.9607843137254903, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 0.00019084877602379319, | |
| "loss": 0.1364, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.9835841313269493, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 0.0001907343857240906, | |
| "loss": 0.1242, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.0063839489284088, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 0.00019061999542438802, | |
| "loss": 0.1146, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.0291837665298678, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 0.00019050560512468543, | |
| "loss": 0.1327, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.0519835841313268, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 0.00019039121482498285, | |
| "loss": 0.1278, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.074783401732786, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 0.00019027682452528027, | |
| "loss": 0.1069, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.097583219334245, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 0.00019016243422557768, | |
| "loss": 0.1351, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.1203830369357046, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 0.0001900480439258751, | |
| "loss": 0.1372, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.1431828545371636, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 0.0001899336536261725, | |
| "loss": 0.1093, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.165982672138623, | |
| "grad_norm": 0.8046875, | |
| "learning_rate": 0.00018981926332646993, | |
| "loss": 0.1, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.188782489740082, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 0.00018970487302676735, | |
| "loss": 0.1016, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.2115823073415415, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.00018959048272706473, | |
| "loss": 0.0976, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.2343821249430005, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 0.00018947609242736218, | |
| "loss": 0.1132, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.2571819425444595, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 0.00018936170212765957, | |
| "loss": 0.1174, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.279981760145919, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 0.000189247311827957, | |
| "loss": 0.1191, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.302781577747378, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.00018913292152825443, | |
| "loss": 0.0933, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.3255813953488373, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 0.00018901853122855181, | |
| "loss": 0.1065, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.3483812129502963, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 0.00018890414092884926, | |
| "loss": 0.0993, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.3711810305517558, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 0.00018878975062914665, | |
| "loss": 0.1179, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.3939808481532148, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 0.00018867536032944406, | |
| "loss": 0.0982, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.4167806657546738, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 0.00018856097002974148, | |
| "loss": 0.0831, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.439580483356133, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 0.0001884465797300389, | |
| "loss": 0.0943, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.462380300957592, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 0.00018833218943033634, | |
| "loss": 0.088, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.4851801185590516, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 0.00018821779913063373, | |
| "loss": 0.0952, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.5079799361605106, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 0.00018810340883093114, | |
| "loss": 0.0952, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.53077975376197, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.00018798901853122856, | |
| "loss": 0.0901, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.553579571363429, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.00018787462823152598, | |
| "loss": 0.0989, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.576379388964888, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 0.0001877602379318234, | |
| "loss": 0.0794, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.5991792065663475, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 0.0001876458476321208, | |
| "loss": 0.1076, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.621979024167807, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.00018753145733241822, | |
| "loss": 0.0745, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.644778841769266, | |
| "grad_norm": 0.6875, | |
| "learning_rate": 0.00018741706703271564, | |
| "loss": 0.0938, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.667578659370725, | |
| "grad_norm": 0.478515625, | |
| "learning_rate": 0.00018730267673301306, | |
| "loss": 0.0888, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.6903784769721844, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 0.00018718828643331044, | |
| "loss": 0.0836, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.7131782945736433, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.0001870738961336079, | |
| "loss": 0.0821, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.7359781121751023, | |
| "grad_norm": 0.875, | |
| "learning_rate": 0.0001869595058339053, | |
| "loss": 0.1043, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.7587779297765618, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.00018684511553420272, | |
| "loss": 0.1005, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.781577747378021, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 0.00018673072523450014, | |
| "loss": 0.1028, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.80437756497948, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 0.00018661633493479752, | |
| "loss": 0.1046, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.827177382580939, | |
| "grad_norm": 0.875, | |
| "learning_rate": 0.00018650194463509497, | |
| "loss": 0.0964, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.8499772001823986, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 0.00018638755433539236, | |
| "loss": 0.0775, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.8727770177838576, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.00018627316403568977, | |
| "loss": 0.0963, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.895576835385317, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.0001861587737359872, | |
| "loss": 0.0903, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.918376652986776, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 0.0001860443834362846, | |
| "loss": 0.0914, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.9411764705882355, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 0.00018592999313658205, | |
| "loss": 0.108, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.9639762881896945, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 0.00018581560283687944, | |
| "loss": 0.0876, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.9867761057911535, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.00018570121253717685, | |
| "loss": 0.0834, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.009575923392613, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 0.00018558682223747427, | |
| "loss": 0.0786, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.032375740994072, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 0.00018547243193777169, | |
| "loss": 0.0803, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.0551755585955314, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 0.0001853580416380691, | |
| "loss": 0.0846, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.0779753761969904, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.00018524365133836652, | |
| "loss": 0.0562, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.10077519379845, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 0.00018512926103866393, | |
| "loss": 0.0934, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.123575011399909, | |
| "grad_norm": 0.7578125, | |
| "learning_rate": 0.00018501487073896135, | |
| "loss": 0.0813, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.146374829001368, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 0.00018490048043925877, | |
| "loss": 0.0661, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.169174646602827, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.00018478609013955615, | |
| "loss": 0.0589, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.191974464204286, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 0.0001846716998398536, | |
| "loss": 0.0679, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.2147742818057456, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 0.000184557309540151, | |
| "loss": 0.0564, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 3.2375740994072046, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 0.0001844429192404484, | |
| "loss": 0.0763, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 3.260373917008664, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 0.00018432852894074585, | |
| "loss": 0.0768, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 3.283173734610123, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.00018421413864104323, | |
| "loss": 0.0686, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.305973552211582, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 0.00018409974834134068, | |
| "loss": 0.0638, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 3.3287733698130415, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 0.00018398535804163807, | |
| "loss": 0.0643, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.3515731874145005, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 0.00018387096774193548, | |
| "loss": 0.067, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 3.37437300501596, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 0.00018375657744223293, | |
| "loss": 0.0756, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.397172822617419, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 0.00018364218714253031, | |
| "loss": 0.0579, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 3.4199726402188784, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.00018352779684282773, | |
| "loss": 0.0533, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.4427724578203374, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.00018341340654312515, | |
| "loss": 0.0569, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 3.465572275421797, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 0.00018329901624342256, | |
| "loss": 0.0593, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 3.488372093023256, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 0.00018318462594371998, | |
| "loss": 0.0664, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 3.5111719106247152, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 0.0001830702356440174, | |
| "loss": 0.0593, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 3.5339717282261742, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0001829558453443148, | |
| "loss": 0.063, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 3.556771545827633, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 0.00018284145504461223, | |
| "loss": 0.0553, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.5795713634290927, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00018272706474490964, | |
| "loss": 0.0513, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 3.6023711810305517, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 0.00018261267444520706, | |
| "loss": 0.0678, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.625170998632011, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 0.00018249828414550448, | |
| "loss": 0.0446, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 3.64797081623347, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.0001823838938458019, | |
| "loss": 0.0588, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.6707706338349295, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 0.0001822695035460993, | |
| "loss": 0.0564, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 3.6935704514363885, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 0.00018215511324639672, | |
| "loss": 0.0526, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.7163702690378475, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 0.0001820407229466941, | |
| "loss": 0.0609, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 3.739170086639307, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 0.00018192633264699156, | |
| "loss": 0.0679, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.761969904240766, | |
| "grad_norm": 0.48828125, | |
| "learning_rate": 0.00018181194234728894, | |
| "loss": 0.0638, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 3.7847697218422254, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 0.0001816975520475864, | |
| "loss": 0.0647, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.8075695394436844, | |
| "grad_norm": 0.84375, | |
| "learning_rate": 0.00018158316174788378, | |
| "loss": 0.0747, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 3.830369357045144, | |
| "grad_norm": 0.53515625, | |
| "learning_rate": 0.0001814687714481812, | |
| "loss": 0.0586, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.853169174646603, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 0.00018135438114847864, | |
| "loss": 0.0544, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 3.875968992248062, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 0.00018123999084877602, | |
| "loss": 0.0644, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.8987688098495212, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 0.00018112560054907344, | |
| "loss": 0.0559, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 3.9215686274509802, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.00018101121024937086, | |
| "loss": 0.0608, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 3.9443684450524397, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 0.00018089681994966827, | |
| "loss": 0.0688, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 3.9671682626538987, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 0.0001807824296499657, | |
| "loss": 0.0614, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 3.989968080255358, | |
| "grad_norm": 0.79296875, | |
| "learning_rate": 0.0001806680393502631, | |
| "loss": 0.0515, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 4.0127678978568175, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 0.00018055364905056052, | |
| "loss": 0.0527, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.035567715458276, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 0.00018043925875085794, | |
| "loss": 0.0567, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 4.0583675330597355, | |
| "grad_norm": 0.92578125, | |
| "learning_rate": 0.00018032486845115535, | |
| "loss": 0.0501, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 4.081167350661195, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 0.00018021047815145274, | |
| "loss": 0.0329, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 4.1039671682626535, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 0.00018009608785175019, | |
| "loss": 0.0615, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.126766985864113, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.0001799816975520476, | |
| "loss": 0.0471, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 4.149566803465572, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.00017986730725234502, | |
| "loss": 0.0467, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 4.172366621067032, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.00017975291695264243, | |
| "loss": 0.0381, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 4.19516643866849, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 0.00017963852665293982, | |
| "loss": 0.0414, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 4.21796625626995, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.00017952413635323727, | |
| "loss": 0.0412, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 4.240766073871409, | |
| "grad_norm": 0.96875, | |
| "learning_rate": 0.00017940974605353465, | |
| "loss": 0.0492, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 4.263565891472869, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.00017929535575383207, | |
| "loss": 0.0526, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 4.286365709074327, | |
| "grad_norm": 0.51953125, | |
| "learning_rate": 0.0001791809654541295, | |
| "loss": 0.0403, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 4.309165526675787, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 0.0001790665751544269, | |
| "loss": 0.0415, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 4.331965344277246, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.00017895218485472435, | |
| "loss": 0.04, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 4.354765161878705, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 0.00017883779455502173, | |
| "loss": 0.0461, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 4.377564979480164, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 0.00017872340425531915, | |
| "loss": 0.0499, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 4.4003647970816235, | |
| "grad_norm": 0.416015625, | |
| "learning_rate": 0.00017860901395561657, | |
| "loss": 0.038, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 4.423164614683083, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 0.00017849462365591398, | |
| "loss": 0.0383, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 4.4459644322845415, | |
| "grad_norm": 0.9765625, | |
| "learning_rate": 0.00017838023335621143, | |
| "loss": 0.0385, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 4.468764249886001, | |
| "grad_norm": 0.482421875, | |
| "learning_rate": 0.00017826584305650881, | |
| "loss": 0.0361, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 4.49156406748746, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 0.00017815145275680623, | |
| "loss": 0.0485, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 4.514363885088919, | |
| "grad_norm": 2.125, | |
| "learning_rate": 0.00017803706245710365, | |
| "loss": 0.0372, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 4.537163702690378, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 0.00017792267215740106, | |
| "loss": 0.0433, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 4.559963520291838, | |
| "grad_norm": 0.63671875, | |
| "learning_rate": 0.00017780828185769848, | |
| "loss": 0.0337, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.582763337893297, | |
| "grad_norm": 1.375, | |
| "learning_rate": 0.0001776938915579959, | |
| "loss": 0.0392, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 4.605563155494756, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 0.0001775795012582933, | |
| "loss": 0.0404, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 4.628362973096215, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.00017746511095859073, | |
| "loss": 0.0312, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 4.651162790697675, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.00017735072065888814, | |
| "loss": 0.0398, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 4.673962608299133, | |
| "grad_norm": 0.45703125, | |
| "learning_rate": 0.00017723633035918553, | |
| "loss": 0.0386, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 4.696762425900593, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 0.00017712194005948298, | |
| "loss": 0.0343, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 4.719562243502052, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 0.0001770075497597804, | |
| "loss": 0.0472, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 4.7423620611035116, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 0.00017689315946007778, | |
| "loss": 0.042, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 4.76516187870497, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 0.00017677876916037522, | |
| "loss": 0.0412, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 4.7879616963064295, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.0001766643788606726, | |
| "loss": 0.0432, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 4.810761513907889, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 0.00017654998856097006, | |
| "loss": 0.0525, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 4.8335613315093475, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 0.00017643559826126744, | |
| "loss": 0.0395, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 4.856361149110807, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 0.00017632120796156486, | |
| "loss": 0.0387, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 4.879160966712266, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 0.00017620681766186228, | |
| "loss": 0.041, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 4.901960784313726, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 0.0001760924273621597, | |
| "loss": 0.038, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 4.924760601915184, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 0.0001759780370624571, | |
| "loss": 0.0448, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 4.947560419516644, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 0.00017586364676275452, | |
| "loss": 0.0417, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 4.970360237118103, | |
| "grad_norm": 0.423828125, | |
| "learning_rate": 0.00017574925646305194, | |
| "loss": 0.0393, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 4.993160054719562, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 0.00017563486616334936, | |
| "loss": 0.0353, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 5.015959872321021, | |
| "grad_norm": 0.421875, | |
| "learning_rate": 0.00017552047586364677, | |
| "loss": 0.0376, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 5.038759689922481, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 0.0001754060855639442, | |
| "loss": 0.0388, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 5.06155950752394, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.0001752916952642416, | |
| "loss": 0.0318, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 5.084359325125399, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 0.00017517730496453902, | |
| "loss": 0.0251, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 5.107159142726858, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 0.0001750629146648364, | |
| "loss": 0.0443, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 5.1299589603283176, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.00017494852436513385, | |
| "loss": 0.0285, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 5.152758777929777, | |
| "grad_norm": 0.75, | |
| "learning_rate": 0.00017483413406543124, | |
| "loss": 0.0319, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 5.1755585955312355, | |
| "grad_norm": 0.375, | |
| "learning_rate": 0.00017471974376572869, | |
| "loss": 0.0292, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 5.198358413132695, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.0001746053534660261, | |
| "loss": 0.0281, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 5.221158230734154, | |
| "grad_norm": 0.361328125, | |
| "learning_rate": 0.0001744909631663235, | |
| "loss": 0.029, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 5.243958048335613, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 0.00017437657286662093, | |
| "loss": 0.0382, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 5.266757865937072, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 0.00017426218256691832, | |
| "loss": 0.0328, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 5.289557683538532, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 0.00017414779226721577, | |
| "loss": 0.03, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 5.312357501139991, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 0.00017403340196751315, | |
| "loss": 0.0251, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 5.33515731874145, | |
| "grad_norm": 0.70703125, | |
| "learning_rate": 0.00017391901166781057, | |
| "loss": 0.0267, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 5.357957136342909, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 0.000173804621368108, | |
| "loss": 0.0357, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 5.380756953944369, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 0.0001736902310684054, | |
| "loss": 0.035, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 5.403556771545827, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.00017357584076870282, | |
| "loss": 0.0235, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 5.426356589147287, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.00017346145046900023, | |
| "loss": 0.0277, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 5.449156406748746, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 0.00017334706016929765, | |
| "loss": 0.0294, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 5.471956224350206, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 0.00017323266986959507, | |
| "loss": 0.0245, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 5.494756041951664, | |
| "grad_norm": 0.447265625, | |
| "learning_rate": 0.00017311827956989248, | |
| "loss": 0.0343, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 5.5175558595531236, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 0.0001730038892701899, | |
| "loss": 0.0279, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 5.540355677154583, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 0.00017288949897048731, | |
| "loss": 0.0322, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 5.563155494756042, | |
| "grad_norm": 0.474609375, | |
| "learning_rate": 0.00017277510867078473, | |
| "loss": 0.0233, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 5.585955312357501, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00017266071837108212, | |
| "loss": 0.0303, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 5.60875512995896, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.00017254632807137956, | |
| "loss": 0.026, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 5.63155494756042, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 0.00017243193777167698, | |
| "loss": 0.022, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 5.654354765161878, | |
| "grad_norm": 0.5546875, | |
| "learning_rate": 0.0001723175474719744, | |
| "loss": 0.0304, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 5.677154582763338, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 0.0001722031571722718, | |
| "loss": 0.0294, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 5.699954400364797, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 0.0001720887668725692, | |
| "loss": 0.025, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 5.722754217966257, | |
| "grad_norm": 0.75390625, | |
| "learning_rate": 0.00017197437657286664, | |
| "loss": 0.0307, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 5.745554035567715, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 0.00017185998627316403, | |
| "loss": 0.0306, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 5.768353853169175, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.00017174559597346145, | |
| "loss": 0.0329, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 5.791153670770634, | |
| "grad_norm": 0.396484375, | |
| "learning_rate": 0.00017163120567375886, | |
| "loss": 0.0273, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 5.813953488372093, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.00017151681537405628, | |
| "loss": 0.0399, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 5.836753305973552, | |
| "grad_norm": 0.37890625, | |
| "learning_rate": 0.00017140242507435372, | |
| "loss": 0.0268, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 5.859553123575012, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.0001712880347746511, | |
| "loss": 0.0308, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 5.882352941176471, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 0.00017117364447494853, | |
| "loss": 0.0286, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 5.9051527587779296, | |
| "grad_norm": 0.9140625, | |
| "learning_rate": 0.00017105925417524594, | |
| "loss": 0.0288, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 5.927952576379389, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 0.00017094486387554336, | |
| "loss": 0.036, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 5.950752393980848, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 0.00017083047357584078, | |
| "loss": 0.0276, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 5.973552211582307, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0001707160832761382, | |
| "loss": 0.0259, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 5.996352029183766, | |
| "grad_norm": 0.890625, | |
| "learning_rate": 0.0001706016929764356, | |
| "loss": 0.0262, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 6.019151846785226, | |
| "grad_norm": 0.7734375, | |
| "learning_rate": 0.00017048730267673302, | |
| "loss": 0.0287, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 6.041951664386685, | |
| "grad_norm": 0.41796875, | |
| "learning_rate": 0.00017037291237703044, | |
| "loss": 0.0318, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 6.064751481988144, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.00017025852207732783, | |
| "loss": 0.019, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 6.087551299589603, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 0.00017014413177762527, | |
| "loss": 0.0233, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 6.110351117191063, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.0001700297414779227, | |
| "loss": 0.0283, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 6.133150934792521, | |
| "grad_norm": 0.42578125, | |
| "learning_rate": 0.0001699153511782201, | |
| "loss": 0.0206, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 6.155950752393981, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 0.00016980096087851752, | |
| "loss": 0.0228, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 6.17875056999544, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 0.0001696865705788149, | |
| "loss": 0.0238, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 6.2015503875969, | |
| "grad_norm": 0.90234375, | |
| "learning_rate": 0.00016957218027911235, | |
| "loss": 0.0209, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 6.224350205198358, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 0.00016945778997940974, | |
| "loss": 0.0216, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 6.247150022799818, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 0.00016934339967970716, | |
| "loss": 0.0306, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 6.269949840401277, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 0.0001692290093800046, | |
| "loss": 0.0229, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 6.292749658002736, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.000169114619080302, | |
| "loss": 0.0214, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 6.315549475604195, | |
| "grad_norm": 0.4921875, | |
| "learning_rate": 0.00016900022878059943, | |
| "loss": 0.0199, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 6.338349293205654, | |
| "grad_norm": 0.376953125, | |
| "learning_rate": 0.00016888583848089682, | |
| "loss": 0.0212, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 6.361149110807114, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00016877144818119424, | |
| "loss": 0.0264, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 6.383948928408572, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 0.00016865705788149165, | |
| "loss": 0.0236, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 6.406748746010032, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.00016854266758178907, | |
| "loss": 0.0189, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 6.429548563611491, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 0.0001684282772820865, | |
| "loss": 0.0223, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 6.45234838121295, | |
| "grad_norm": 0.06884765625, | |
| "learning_rate": 0.0001683138869823839, | |
| "loss": 0.0196, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 6.475148198814409, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00016819949668268132, | |
| "loss": 0.0193, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 6.497948016415869, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 0.00016808510638297873, | |
| "loss": 0.0266, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 6.520747834017328, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.00016797071608327615, | |
| "loss": 0.0222, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 6.543547651618787, | |
| "grad_norm": 0.498046875, | |
| "learning_rate": 0.00016785632578357357, | |
| "loss": 0.0238, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 6.566347469220246, | |
| "grad_norm": 0.451171875, | |
| "learning_rate": 0.00016774193548387098, | |
| "loss": 0.0171, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 6.589147286821706, | |
| "grad_norm": 0.447265625, | |
| "learning_rate": 0.0001676275451841684, | |
| "loss": 0.026, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 6.611947104423164, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.0001675131548844658, | |
| "loss": 0.0182, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 6.634746922024624, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.00016739876458476323, | |
| "loss": 0.02, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 6.657546739626083, | |
| "grad_norm": 0.76171875, | |
| "learning_rate": 0.00016728437428506062, | |
| "loss": 0.0227, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 6.6803465572275424, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.00016716998398535806, | |
| "loss": 0.0216, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 6.703146374829001, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 0.00016705559368565545, | |
| "loss": 0.0196, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 6.72594619243046, | |
| "grad_norm": 0.44140625, | |
| "learning_rate": 0.00016694120338595287, | |
| "loss": 0.0227, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 6.74874601003192, | |
| "grad_norm": 0.4296875, | |
| "learning_rate": 0.0001668268130862503, | |
| "loss": 0.024, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 6.771545827633379, | |
| "grad_norm": 0.40625, | |
| "learning_rate": 0.0001667124227865477, | |
| "loss": 0.0244, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 6.794345645234838, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 0.00016659803248684512, | |
| "loss": 0.025, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 6.817145462836297, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.00016648364218714253, | |
| "loss": 0.0276, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 6.839945280437757, | |
| "grad_norm": 0.4453125, | |
| "learning_rate": 0.00016636925188743995, | |
| "loss": 0.019, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 6.862745098039216, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 0.00016625486158773736, | |
| "loss": 0.023, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 6.885544915640675, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.00016614047128803478, | |
| "loss": 0.0216, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 6.908344733242134, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.0001660260809883322, | |
| "loss": 0.0235, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 6.931144550843594, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 0.0001659116906886296, | |
| "loss": 0.0281, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 6.953944368445052, | |
| "grad_norm": 0.5, | |
| "learning_rate": 0.00016579730038892703, | |
| "loss": 0.0213, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 6.976744186046512, | |
| "grad_norm": 0.431640625, | |
| "learning_rate": 0.00016568291008922444, | |
| "loss": 0.0207, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 6.999544003647971, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 0.00016556851978952186, | |
| "loss": 0.0197, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 7.02234382124943, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 0.00016545412948981928, | |
| "loss": 0.0218, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 7.045143638850889, | |
| "grad_norm": 0.40234375, | |
| "learning_rate": 0.0001653397391901167, | |
| "loss": 0.0253, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 7.0679434564523484, | |
| "grad_norm": 0.7109375, | |
| "learning_rate": 0.0001652253488904141, | |
| "loss": 0.0148, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 7.090743274053808, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 0.0001651109585907115, | |
| "loss": 0.0185, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 7.113543091655266, | |
| "grad_norm": 0.4609375, | |
| "learning_rate": 0.00016499656829100894, | |
| "loss": 0.0203, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 7.136342909256726, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 0.00016488217799130633, | |
| "loss": 0.0178, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 7.159142726858185, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 0.00016476778769160377, | |
| "loss": 0.0209, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 7.181942544459645, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 0.0001646533973919012, | |
| "loss": 0.0165, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 7.204742362061103, | |
| "grad_norm": 0.1787109375, | |
| "learning_rate": 0.00016453900709219858, | |
| "loss": 0.017, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 7.227542179662563, | |
| "grad_norm": 0.64453125, | |
| "learning_rate": 0.00016442461679249602, | |
| "loss": 0.0156, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 7.250341997264022, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 0.0001643102264927934, | |
| "loss": 0.0258, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 7.273141814865481, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 0.00016419583619309083, | |
| "loss": 0.0191, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 7.29594163246694, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 0.00016408144589338824, | |
| "loss": 0.0158, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 7.3187414500684, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.00016396705559368566, | |
| "loss": 0.0166, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 7.341541267669859, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 0.00016385266529398307, | |
| "loss": 0.0178, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 7.364341085271318, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.0001637382749942805, | |
| "loss": 0.0216, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 7.387140902872777, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.0001636238846945779, | |
| "loss": 0.0199, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 7.4099407204742365, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 0.00016350949439487532, | |
| "loss": 0.0146, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 7.432740538075695, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 0.00016339510409517274, | |
| "loss": 0.0193, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 7.4555403556771545, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 0.00016328071379547015, | |
| "loss": 0.0172, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 7.478340173278614, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 0.00016316632349576757, | |
| "loss": 0.0154, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 7.501139990880073, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 0.000163051933196065, | |
| "loss": 0.0201, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 7.523939808481532, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 0.0001629375428963624, | |
| "loss": 0.0187, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 7.546739626082991, | |
| "grad_norm": 0.462890625, | |
| "learning_rate": 0.00016282315259665982, | |
| "loss": 0.02, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 7.569539443684451, | |
| "grad_norm": 0.46875, | |
| "learning_rate": 0.0001627087622969572, | |
| "loss": 0.0152, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 7.592339261285909, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 0.00016259437199725465, | |
| "loss": 0.02, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 7.615139078887369, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.00016247998169755204, | |
| "loss": 0.0158, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 7.637938896488828, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 0.00016236559139784946, | |
| "loss": 0.0182, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 7.660738714090288, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 0.0001622512010981469, | |
| "loss": 0.0176, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 7.683538531691746, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 0.0001621368107984443, | |
| "loss": 0.0161, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 7.706338349293206, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.00016202242049874173, | |
| "loss": 0.0172, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 7.729138166894665, | |
| "grad_norm": 0.435546875, | |
| "learning_rate": 0.00016190803019903912, | |
| "loss": 0.0189, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 7.751937984496124, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 0.00016179363989933654, | |
| "loss": 0.0191, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 7.774737802097583, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00016167924959963395, | |
| "loss": 0.0193, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 7.7975376196990425, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 0.00016156485929993137, | |
| "loss": 0.0213, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 7.820337437300502, | |
| "grad_norm": 0.419921875, | |
| "learning_rate": 0.0001614504690002288, | |
| "loss": 0.0204, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 7.8431372549019605, | |
| "grad_norm": 0.494140625, | |
| "learning_rate": 0.0001613360787005262, | |
| "loss": 0.0197, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 7.86593707250342, | |
| "grad_norm": 0.83203125, | |
| "learning_rate": 0.00016122168840082362, | |
| "loss": 0.0164, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 7.888736890104879, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 0.00016110729810112103, | |
| "loss": 0.0157, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 7.911536707706338, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 0.00016099290780141845, | |
| "loss": 0.0201, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 7.934336525307797, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00016087851750171586, | |
| "loss": 0.0239, | |
| "step": 17400 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 87720, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 40, | |
| "save_steps": 200, | |
| "total_flos": 0.0, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |