{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6997188411365105, "eval_steps": 1431, "global_step": 10017, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009779438730070028, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 5.7482, "step": 14 }, { "epoch": 0.0019558877460140057, "grad_norm": 0.1259765625, "learning_rate": 0.001, "loss": 3.6003, "step": 28 }, { "epoch": 0.0029338316190210085, "grad_norm": 0.12109375, "learning_rate": 0.001, "loss": 3.4079, "step": 42 }, { "epoch": 0.003911775492028011, "grad_norm": 0.22265625, "learning_rate": 0.001, "loss": 3.201, "step": 56 }, { "epoch": 0.004889719365035014, "grad_norm": 0.21484375, "learning_rate": 0.001, "loss": 2.955, "step": 70 }, { "epoch": 0.005867663238042017, "grad_norm": 0.2119140625, "learning_rate": 0.001, "loss": 2.8744, "step": 84 }, { "epoch": 0.006845607111049019, "grad_norm": 0.15625, "learning_rate": 0.001, "loss": 2.8267, "step": 98 }, { "epoch": 0.007823550984056023, "grad_norm": 0.1884765625, "learning_rate": 0.001, "loss": 2.7249, "step": 112 }, { "epoch": 0.008801494857063025, "grad_norm": 2.625, "learning_rate": 0.001, "loss": 2.6441, "step": 126 }, { "epoch": 0.009779438730070028, "grad_norm": 0.173828125, "learning_rate": 0.001, "loss": 2.6126, "step": 140 }, { "epoch": 0.01075738260307703, "grad_norm": 0.10693359375, "learning_rate": 0.001, "loss": 2.5241, "step": 154 }, { "epoch": 0.011735326476084034, "grad_norm": 0.158203125, "learning_rate": 0.001, "loss": 2.4803, "step": 168 }, { "epoch": 0.012713270349091036, "grad_norm": 0.119140625, "learning_rate": 0.001, "loss": 2.3886, "step": 182 }, { "epoch": 0.013691214222098038, "grad_norm": 0.1923828125, "learning_rate": 0.001, "loss": 2.3931, "step": 196 }, { "epoch": 0.014669158095105042, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 2.3493, "step": 210 }, { "epoch": 0.015647101968112045, "grad_norm": 0.234375, "learning_rate": 0.001, "loss": 2.3317, "step": 224 }, { "epoch": 0.016625045841119047, "grad_norm": 0.2138671875, "learning_rate": 0.001, "loss": 2.3284, "step": 238 }, { "epoch": 0.01760298971412605, "grad_norm": 0.16796875, "learning_rate": 0.001, "loss": 2.2322, "step": 252 }, { "epoch": 0.01858093358713305, "grad_norm": 0.1650390625, "learning_rate": 0.001, "loss": 2.2145, "step": 266 }, { "epoch": 0.019558877460140057, "grad_norm": 0.2890625, "learning_rate": 0.001, "loss": 2.202, "step": 280 }, { "epoch": 0.02053682133314706, "grad_norm": 0.201171875, "learning_rate": 0.001, "loss": 2.2156, "step": 294 }, { "epoch": 0.02151476520615406, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 2.1436, "step": 308 }, { "epoch": 0.022492709079161063, "grad_norm": 0.390625, "learning_rate": 0.001, "loss": 2.1198, "step": 322 }, { "epoch": 0.023470652952168068, "grad_norm": 0.234375, "learning_rate": 0.001, "loss": 2.1031, "step": 336 }, { "epoch": 0.02444859682517507, "grad_norm": 0.1298828125, "learning_rate": 0.001, "loss": 2.0943, "step": 350 }, { "epoch": 0.025426540698182072, "grad_norm": 0.10693359375, "learning_rate": 0.001, "loss": 2.0352, "step": 364 }, { "epoch": 0.026404484571189074, "grad_norm": 0.158203125, "learning_rate": 0.001, "loss": 2.0439, "step": 378 }, { "epoch": 0.027382428444196076, "grad_norm": 0.16796875, "learning_rate": 0.001, "loss": 2.0176, "step": 392 }, { "epoch": 0.02836037231720308, "grad_norm": 0.474609375, "learning_rate": 0.001, "loss": 2.0212, "step": 406 }, { "epoch": 0.029338316190210083, "grad_norm": 0.240234375, "learning_rate": 0.001, "loss": 1.9963, "step": 420 }, { "epoch": 0.030316260063217085, "grad_norm": 0.1689453125, "learning_rate": 0.001, "loss": 1.9738, "step": 434 }, { "epoch": 0.03129420393622409, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.9362, "step": 448 }, { "epoch": 0.03227214780923109, "grad_norm": 0.1982421875, "learning_rate": 0.001, "loss": 1.9393, "step": 462 }, { "epoch": 0.033250091682238095, "grad_norm": 0.400390625, "learning_rate": 0.001, "loss": 1.9876, "step": 476 }, { "epoch": 0.0342280355552451, "grad_norm": 0.23046875, "learning_rate": 0.001, "loss": 1.9551, "step": 490 }, { "epoch": 0.0352059794282521, "grad_norm": 0.1591796875, "learning_rate": 0.001, "loss": 1.9103, "step": 504 }, { "epoch": 0.0361839233012591, "grad_norm": 0.1767578125, "learning_rate": 0.001, "loss": 1.9243, "step": 518 }, { "epoch": 0.0371618671742661, "grad_norm": 0.478515625, "learning_rate": 0.001, "loss": 1.9207, "step": 532 }, { "epoch": 0.03813981104727311, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.9029, "step": 546 }, { "epoch": 0.039117754920280114, "grad_norm": 0.2294921875, "learning_rate": 0.001, "loss": 1.8988, "step": 560 }, { "epoch": 0.040095698793287116, "grad_norm": 0.189453125, "learning_rate": 0.001, "loss": 1.8653, "step": 574 }, { "epoch": 0.04107364266629412, "grad_norm": 0.2158203125, "learning_rate": 0.001, "loss": 1.8383, "step": 588 }, { "epoch": 0.04205158653930112, "grad_norm": 0.236328125, "learning_rate": 0.001, "loss": 1.8191, "step": 602 }, { "epoch": 0.04302953041230812, "grad_norm": 0.55859375, "learning_rate": 0.001, "loss": 1.8837, "step": 616 }, { "epoch": 0.044007474285315124, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.8838, "step": 630 }, { "epoch": 0.044985418158322125, "grad_norm": 0.1513671875, "learning_rate": 0.001, "loss": 1.8423, "step": 644 }, { "epoch": 0.04596336203132913, "grad_norm": 0.134765625, "learning_rate": 0.001, "loss": 1.785, "step": 658 }, { "epoch": 0.046941305904336136, "grad_norm": 0.2265625, "learning_rate": 0.001, "loss": 1.8072, "step": 672 }, { "epoch": 0.04791924977734314, "grad_norm": 0.1669921875, "learning_rate": 0.001, "loss": 1.7771, "step": 686 }, { "epoch": 0.04889719365035014, "grad_norm": 0.33203125, "learning_rate": 0.001, "loss": 1.8465, "step": 700 }, { "epoch": 0.04987513752335714, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.7964, "step": 714 }, { "epoch": 0.050853081396364144, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.81, "step": 728 }, { "epoch": 0.051831025269371146, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.7867, "step": 742 }, { "epoch": 0.05280896914237815, "grad_norm": 0.19140625, "learning_rate": 0.001, "loss": 1.7581, "step": 756 }, { "epoch": 0.05378691301538515, "grad_norm": 0.2158203125, "learning_rate": 0.001, "loss": 1.7116, "step": 770 }, { "epoch": 0.05476485688839215, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.7431, "step": 784 }, { "epoch": 0.05574280076139916, "grad_norm": 0.2255859375, "learning_rate": 0.001, "loss": 1.7647, "step": 798 }, { "epoch": 0.05672074463440616, "grad_norm": 0.177734375, "learning_rate": 0.001, "loss": 1.709, "step": 812 }, { "epoch": 0.057698688507413165, "grad_norm": 0.384765625, "learning_rate": 0.001, "loss": 1.7235, "step": 826 }, { "epoch": 0.05867663238042017, "grad_norm": 0.26171875, "learning_rate": 0.001, "loss": 1.7161, "step": 840 }, { "epoch": 0.05965457625342717, "grad_norm": 0.455078125, "learning_rate": 0.001, "loss": 1.812, "step": 854 }, { "epoch": 0.06063252012643417, "grad_norm": 0.2099609375, "learning_rate": 0.001, "loss": 1.7164, "step": 868 }, { "epoch": 0.06161046399944117, "grad_norm": 0.154296875, "learning_rate": 0.001, "loss": 1.7677, "step": 882 }, { "epoch": 0.06258840787244818, "grad_norm": 0.1796875, "learning_rate": 0.001, "loss": 1.7001, "step": 896 }, { "epoch": 0.06356635174545518, "grad_norm": 0.173828125, "learning_rate": 0.001, "loss": 1.7015, "step": 910 }, { "epoch": 0.06454429561846219, "grad_norm": 0.203125, "learning_rate": 0.001, "loss": 1.6904, "step": 924 }, { "epoch": 0.06552223949146918, "grad_norm": 0.2236328125, "learning_rate": 0.001, "loss": 1.6692, "step": 938 }, { "epoch": 0.06650018336447619, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.6802, "step": 952 }, { "epoch": 0.06747812723748318, "grad_norm": 0.228515625, "learning_rate": 0.001, "loss": 1.6566, "step": 966 }, { "epoch": 0.0684560711104902, "grad_norm": 0.275390625, "learning_rate": 0.001, "loss": 1.7042, "step": 980 }, { "epoch": 0.0694340149834972, "grad_norm": 0.2158203125, "learning_rate": 0.001, "loss": 1.6653, "step": 994 }, { "epoch": 0.0704119588565042, "grad_norm": 0.357421875, "learning_rate": 0.001, "loss": 1.6729, "step": 1008 }, { "epoch": 0.0713899027295112, "grad_norm": 0.28125, "learning_rate": 0.001, "loss": 1.6864, "step": 1022 }, { "epoch": 0.0723678466025182, "grad_norm": 0.1513671875, "learning_rate": 0.001, "loss": 1.6541, "step": 1036 }, { "epoch": 0.07334579047552521, "grad_norm": 1.0390625, "learning_rate": 0.001, "loss": 1.7122, "step": 1050 }, { "epoch": 0.0743237343485322, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.7231, "step": 1064 }, { "epoch": 0.07530167822153921, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.6781, "step": 1078 }, { "epoch": 0.07627962209454622, "grad_norm": 0.23828125, "learning_rate": 0.001, "loss": 1.667, "step": 1092 }, { "epoch": 0.07725756596755322, "grad_norm": 0.2333984375, "learning_rate": 0.001, "loss": 1.6667, "step": 1106 }, { "epoch": 0.07823550984056023, "grad_norm": 0.2119140625, "learning_rate": 0.001, "loss": 1.622, "step": 1120 }, { "epoch": 0.07921345371356722, "grad_norm": 0.1533203125, "learning_rate": 0.001, "loss": 1.636, "step": 1134 }, { "epoch": 0.08019139758657423, "grad_norm": 0.1943359375, "learning_rate": 0.001, "loss": 1.6423, "step": 1148 }, { "epoch": 0.08116934145958123, "grad_norm": 0.1806640625, "learning_rate": 0.001, "loss": 1.6136, "step": 1162 }, { "epoch": 0.08214728533258824, "grad_norm": 0.224609375, "learning_rate": 0.001, "loss": 1.5988, "step": 1176 }, { "epoch": 0.08312522920559523, "grad_norm": 0.2890625, "learning_rate": 0.001, "loss": 1.6216, "step": 1190 }, { "epoch": 0.08410317307860224, "grad_norm": 0.2060546875, "learning_rate": 0.001, "loss": 1.6296, "step": 1204 }, { "epoch": 0.08508111695160925, "grad_norm": 0.2451171875, "learning_rate": 0.001, "loss": 1.5944, "step": 1218 }, { "epoch": 0.08605906082461624, "grad_norm": 0.2392578125, "learning_rate": 0.001, "loss": 1.6144, "step": 1232 }, { "epoch": 0.08703700469762325, "grad_norm": 0.357421875, "learning_rate": 0.001, "loss": 1.585, "step": 1246 }, { "epoch": 0.08801494857063025, "grad_norm": 0.2470703125, "learning_rate": 0.001, "loss": 1.6033, "step": 1260 }, { "epoch": 0.08899289244363726, "grad_norm": 0.201171875, "learning_rate": 0.001, "loss": 1.6122, "step": 1274 }, { "epoch": 0.08997083631664425, "grad_norm": 0.396484375, "learning_rate": 0.001, "loss": 1.5978, "step": 1288 }, { "epoch": 0.09094878018965126, "grad_norm": 0.4453125, "learning_rate": 0.001, "loss": 1.6392, "step": 1302 }, { "epoch": 0.09192672406265825, "grad_norm": 0.2470703125, "learning_rate": 0.001, "loss": 1.6247, "step": 1316 }, { "epoch": 0.09290466793566526, "grad_norm": 0.193359375, "learning_rate": 0.001, "loss": 1.5888, "step": 1330 }, { "epoch": 0.09388261180867227, "grad_norm": 0.2392578125, "learning_rate": 0.001, "loss": 1.572, "step": 1344 }, { "epoch": 0.09486055568167927, "grad_norm": 0.17578125, "learning_rate": 0.001, "loss": 1.5725, "step": 1358 }, { "epoch": 0.09583849955468628, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.6131, "step": 1372 }, { "epoch": 0.09681644342769327, "grad_norm": 0.22265625, "learning_rate": 0.001, "loss": 1.5467, "step": 1386 }, { "epoch": 0.09779438730070028, "grad_norm": 0.296875, "learning_rate": 0.001, "loss": 1.5843, "step": 1400 }, { "epoch": 0.09877233117370728, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.5519, "step": 1414 }, { "epoch": 0.09975027504671428, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.5531, "step": 1428 }, { "epoch": 0.09995983444807292, "eval_loss": 2.1689391136169434, "eval_runtime": 9.1273, "eval_samples_per_second": 109.561, "eval_steps_per_second": 1.424, "step": 1431 }, { "epoch": 0.10072821891972128, "grad_norm": 0.2333984375, "learning_rate": 0.001, "loss": 1.561, "step": 1442 }, { "epoch": 0.10170616279272829, "grad_norm": 0.302734375, "learning_rate": 0.001, "loss": 1.5818, "step": 1456 }, { "epoch": 0.1026841066657353, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 1.5653, "step": 1470 }, { "epoch": 0.10366205053874229, "grad_norm": 0.296875, "learning_rate": 0.001, "loss": 1.585, "step": 1484 }, { "epoch": 0.1046399944117493, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.5502, "step": 1498 }, { "epoch": 0.1056179382847563, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 1.5873, "step": 1512 }, { "epoch": 0.1065958821577633, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.5825, "step": 1526 }, { "epoch": 0.1075738260307703, "grad_norm": 0.2578125, "learning_rate": 0.001, "loss": 1.5712, "step": 1540 }, { "epoch": 0.10855176990377731, "grad_norm": 0.2392578125, "learning_rate": 0.001, "loss": 1.5443, "step": 1554 }, { "epoch": 0.1095297137767843, "grad_norm": 0.95703125, "learning_rate": 0.001, "loss": 1.5989, "step": 1568 }, { "epoch": 0.11050765764979131, "grad_norm": 0.318359375, "learning_rate": 0.001, "loss": 1.6003, "step": 1582 }, { "epoch": 0.11148560152279832, "grad_norm": 0.2578125, "learning_rate": 0.001, "loss": 1.5907, "step": 1596 }, { "epoch": 0.11246354539580532, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.5373, "step": 1610 }, { "epoch": 0.11344148926881233, "grad_norm": 0.2392578125, "learning_rate": 0.001, "loss": 1.5531, "step": 1624 }, { "epoch": 0.11441943314181932, "grad_norm": 0.232421875, "learning_rate": 0.001, "loss": 1.5317, "step": 1638 }, { "epoch": 0.11539737701482633, "grad_norm": 0.3671875, "learning_rate": 0.001, "loss": 1.5157, "step": 1652 }, { "epoch": 0.11637532088783333, "grad_norm": 0.2001953125, "learning_rate": 0.001, "loss": 1.5462, "step": 1666 }, { "epoch": 0.11735326476084033, "grad_norm": 0.2236328125, "learning_rate": 0.001, "loss": 1.5598, "step": 1680 }, { "epoch": 0.11833120863384734, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.5345, "step": 1694 }, { "epoch": 0.11930915250685434, "grad_norm": 0.470703125, "learning_rate": 0.001, "loss": 1.5324, "step": 1708 }, { "epoch": 0.12028709637986135, "grad_norm": 0.2158203125, "learning_rate": 0.001, "loss": 1.498, "step": 1722 }, { "epoch": 0.12126504025286834, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.5072, "step": 1736 }, { "epoch": 0.12224298412587535, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.5364, "step": 1750 }, { "epoch": 0.12322092799888235, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.5, "step": 1764 }, { "epoch": 0.12419887187188935, "grad_norm": 0.31640625, "learning_rate": 0.001, "loss": 1.5113, "step": 1778 }, { "epoch": 0.12517681574489636, "grad_norm": 0.400390625, "learning_rate": 0.001, "loss": 1.5099, "step": 1792 }, { "epoch": 0.12615475961790334, "grad_norm": 0.2294921875, "learning_rate": 0.001, "loss": 1.5076, "step": 1806 }, { "epoch": 0.12713270349091035, "grad_norm": 0.205078125, "learning_rate": 0.001, "loss": 1.4994, "step": 1820 }, { "epoch": 0.12811064736391736, "grad_norm": 0.349609375, "learning_rate": 0.001, "loss": 1.5135, "step": 1834 }, { "epoch": 0.12908859123692437, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.5214, "step": 1848 }, { "epoch": 0.13006653510993138, "grad_norm": 0.326171875, "learning_rate": 0.001, "loss": 1.4525, "step": 1862 }, { "epoch": 0.13104447898293836, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.51, "step": 1876 }, { "epoch": 0.13202242285594537, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 1.4965, "step": 1890 }, { "epoch": 0.13300036672895238, "grad_norm": 0.1943359375, "learning_rate": 0.001, "loss": 1.4854, "step": 1904 }, { "epoch": 0.1339783106019594, "grad_norm": 0.1943359375, "learning_rate": 0.001, "loss": 1.4926, "step": 1918 }, { "epoch": 0.13495625447496637, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.4897, "step": 1932 }, { "epoch": 0.13593419834797338, "grad_norm": 0.365234375, "learning_rate": 0.001, "loss": 1.498, "step": 1946 }, { "epoch": 0.1369121422209804, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.5029, "step": 1960 }, { "epoch": 0.1378900860939874, "grad_norm": 0.419921875, "learning_rate": 0.001, "loss": 1.482, "step": 1974 }, { "epoch": 0.1388680299669944, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.5212, "step": 1988 }, { "epoch": 0.1398459738400014, "grad_norm": 0.80859375, "learning_rate": 0.001, "loss": 1.549, "step": 2002 }, { "epoch": 0.1408239177130084, "grad_norm": 0.36328125, "learning_rate": 0.001, "loss": 1.5148, "step": 2016 }, { "epoch": 0.1418018615860154, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.4993, "step": 2030 }, { "epoch": 0.1427798054590224, "grad_norm": 0.220703125, "learning_rate": 0.001, "loss": 1.4985, "step": 2044 }, { "epoch": 0.14375774933202942, "grad_norm": 0.25, "learning_rate": 0.001, "loss": 1.4938, "step": 2058 }, { "epoch": 0.1447356932050364, "grad_norm": 0.177734375, "learning_rate": 0.001, "loss": 1.4777, "step": 2072 }, { "epoch": 0.1457136370780434, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.4865, "step": 2086 }, { "epoch": 0.14669158095105042, "grad_norm": 0.19140625, "learning_rate": 0.001, "loss": 1.4567, "step": 2100 }, { "epoch": 0.14766952482405743, "grad_norm": 0.2099609375, "learning_rate": 0.001, "loss": 1.4706, "step": 2114 }, { "epoch": 0.1486474686970644, "grad_norm": 0.216796875, "learning_rate": 0.001, "loss": 1.4959, "step": 2128 }, { "epoch": 0.14962541257007142, "grad_norm": 0.224609375, "learning_rate": 0.001, "loss": 1.4631, "step": 2142 }, { "epoch": 0.15060335644307843, "grad_norm": 0.2490234375, "learning_rate": 0.001, "loss": 1.4669, "step": 2156 }, { "epoch": 0.15158130031608544, "grad_norm": 0.2099609375, "learning_rate": 0.001, "loss": 1.4479, "step": 2170 }, { "epoch": 0.15255924418909245, "grad_norm": 0.25, "learning_rate": 0.001, "loss": 1.4584, "step": 2184 }, { "epoch": 0.15353718806209943, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.4609, "step": 2198 }, { "epoch": 0.15451513193510644, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.4499, "step": 2212 }, { "epoch": 0.15549307580811345, "grad_norm": 0.2373046875, "learning_rate": 0.001, "loss": 1.4562, "step": 2226 }, { "epoch": 0.15647101968112045, "grad_norm": 0.302734375, "learning_rate": 0.001, "loss": 1.4745, "step": 2240 }, { "epoch": 0.15744896355412744, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.4773, "step": 2254 }, { "epoch": 0.15842690742713444, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 1.4501, "step": 2268 }, { "epoch": 0.15940485130014145, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.4378, "step": 2282 }, { "epoch": 0.16038279517314846, "grad_norm": 0.19921875, "learning_rate": 0.001, "loss": 1.4452, "step": 2296 }, { "epoch": 0.16136073904615547, "grad_norm": 0.322265625, "learning_rate": 0.001, "loss": 1.4536, "step": 2310 }, { "epoch": 0.16233868291916245, "grad_norm": 0.302734375, "learning_rate": 0.001, "loss": 1.462, "step": 2324 }, { "epoch": 0.16331662679216946, "grad_norm": 0.2353515625, "learning_rate": 0.001, "loss": 1.451, "step": 2338 }, { "epoch": 0.16429457066517647, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.4266, "step": 2352 }, { "epoch": 0.16527251453818348, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.4503, "step": 2366 }, { "epoch": 0.16625045841119046, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.4502, "step": 2380 }, { "epoch": 0.16722840228419747, "grad_norm": 0.279296875, "learning_rate": 0.001, "loss": 1.4587, "step": 2394 }, { "epoch": 0.16820634615720448, "grad_norm": 0.279296875, "learning_rate": 0.001, "loss": 1.4588, "step": 2408 }, { "epoch": 0.1691842900302115, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.4679, "step": 2422 }, { "epoch": 0.1701622339032185, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.4329, "step": 2436 }, { "epoch": 0.17114017777622548, "grad_norm": 1.109375, "learning_rate": 0.001, "loss": 1.4566, "step": 2450 }, { "epoch": 0.1721181216492325, "grad_norm": 0.357421875, "learning_rate": 0.001, "loss": 1.4728, "step": 2464 }, { "epoch": 0.1730960655222395, "grad_norm": 0.2734375, "learning_rate": 0.001, "loss": 1.4518, "step": 2478 }, { "epoch": 0.1740740093952465, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.4507, "step": 2492 }, { "epoch": 0.17505195326825349, "grad_norm": 0.2412109375, "learning_rate": 0.001, "loss": 1.4242, "step": 2506 }, { "epoch": 0.1760298971412605, "grad_norm": 0.197265625, "learning_rate": 0.001, "loss": 1.4116, "step": 2520 }, { "epoch": 0.1770078410142675, "grad_norm": 0.3671875, "learning_rate": 0.001, "loss": 1.4294, "step": 2534 }, { "epoch": 0.1779857848872745, "grad_norm": 0.451171875, "learning_rate": 0.001, "loss": 1.4448, "step": 2548 }, { "epoch": 0.17896372876028152, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.4468, "step": 2562 }, { "epoch": 0.1799416726332885, "grad_norm": 0.419921875, "learning_rate": 0.001, "loss": 1.4319, "step": 2576 }, { "epoch": 0.1809196165062955, "grad_norm": 0.2275390625, "learning_rate": 0.001, "loss": 1.4309, "step": 2590 }, { "epoch": 0.18189756037930252, "grad_norm": 0.236328125, "learning_rate": 0.001, "loss": 1.4324, "step": 2604 }, { "epoch": 0.18287550425230953, "grad_norm": 0.1875, "learning_rate": 0.001, "loss": 1.4143, "step": 2618 }, { "epoch": 0.1838534481253165, "grad_norm": 0.197265625, "learning_rate": 0.001, "loss": 1.417, "step": 2632 }, { "epoch": 0.18483139199832352, "grad_norm": 0.2099609375, "learning_rate": 0.001, "loss": 1.4318, "step": 2646 }, { "epoch": 0.18580933587133053, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.4151, "step": 2660 }, { "epoch": 0.18678727974433754, "grad_norm": 0.275390625, "learning_rate": 0.001, "loss": 1.4169, "step": 2674 }, { "epoch": 0.18776522361734455, "grad_norm": 0.302734375, "learning_rate": 0.001, "loss": 1.4352, "step": 2688 }, { "epoch": 0.18874316749035153, "grad_norm": 0.4296875, "learning_rate": 0.001, "loss": 1.4346, "step": 2702 }, { "epoch": 0.18972111136335854, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.4253, "step": 2716 }, { "epoch": 0.19069905523636554, "grad_norm": 0.31640625, "learning_rate": 0.001, "loss": 1.4272, "step": 2730 }, { "epoch": 0.19167699910937255, "grad_norm": 0.8984375, "learning_rate": 0.001, "loss": 1.4618, "step": 2744 }, { "epoch": 0.19265494298237953, "grad_norm": 0.322265625, "learning_rate": 0.001, "loss": 1.4616, "step": 2758 }, { "epoch": 0.19363288685538654, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.4231, "step": 2772 }, { "epoch": 0.19461083072839355, "grad_norm": 0.20703125, "learning_rate": 0.001, "loss": 1.4185, "step": 2786 }, { "epoch": 0.19558877460140056, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.4578, "step": 2800 }, { "epoch": 0.19656671847440757, "grad_norm": 0.26953125, "learning_rate": 0.001, "loss": 1.4134, "step": 2814 }, { "epoch": 0.19754466234741455, "grad_norm": 0.2099609375, "learning_rate": 0.001, "loss": 1.4253, "step": 2828 }, { "epoch": 0.19852260622042156, "grad_norm": 0.208984375, "learning_rate": 0.001, "loss": 1.414, "step": 2842 }, { "epoch": 0.19950055009342857, "grad_norm": 0.337890625, "learning_rate": 0.001, "loss": 1.4069, "step": 2856 }, { "epoch": 0.19991966889614585, "eval_loss": 1.8960140943527222, "eval_runtime": 9.2129, "eval_samples_per_second": 108.543, "eval_steps_per_second": 1.411, "step": 2862 }, { "epoch": 0.20047849396643558, "grad_norm": 0.31640625, "learning_rate": 0.001, "loss": 1.4267, "step": 2870 }, { "epoch": 0.20145643783944256, "grad_norm": 0.2373046875, "learning_rate": 0.001, "loss": 1.4186, "step": 2884 }, { "epoch": 0.20243438171244957, "grad_norm": 0.2236328125, "learning_rate": 0.001, "loss": 1.4105, "step": 2898 }, { "epoch": 0.20341232558545658, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.4012, "step": 2912 }, { "epoch": 0.20439026945846359, "grad_norm": 0.234375, "learning_rate": 0.001, "loss": 1.3917, "step": 2926 }, { "epoch": 0.2053682133314706, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3952, "step": 2940 }, { "epoch": 0.20634615720447758, "grad_norm": 0.1923828125, "learning_rate": 0.001, "loss": 1.396, "step": 2954 }, { "epoch": 0.20732410107748458, "grad_norm": 0.453125, "learning_rate": 0.001, "loss": 1.4556, "step": 2968 }, { "epoch": 0.2083020449504916, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.4215, "step": 2982 }, { "epoch": 0.2092799888234986, "grad_norm": 0.2158203125, "learning_rate": 0.001, "loss": 1.4166, "step": 2996 }, { "epoch": 0.21025793269650558, "grad_norm": 0.25, "learning_rate": 0.001, "loss": 1.3975, "step": 3010 }, { "epoch": 0.2112358765695126, "grad_norm": 0.2392578125, "learning_rate": 0.001, "loss": 1.4019, "step": 3024 }, { "epoch": 0.2122138204425196, "grad_norm": 0.2578125, "learning_rate": 0.001, "loss": 1.4016, "step": 3038 }, { "epoch": 0.2131917643155266, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.4069, "step": 3052 }, { "epoch": 0.21416970818853362, "grad_norm": 0.275390625, "learning_rate": 0.001, "loss": 1.4214, "step": 3066 }, { "epoch": 0.2151476520615406, "grad_norm": 0.23046875, "learning_rate": 0.001, "loss": 1.4123, "step": 3080 }, { "epoch": 0.2161255959345476, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.3981, "step": 3094 }, { "epoch": 0.21710353980755462, "grad_norm": 0.2451171875, "learning_rate": 0.001, "loss": 1.4067, "step": 3108 }, { "epoch": 0.21808148368056163, "grad_norm": 0.2373046875, "learning_rate": 0.001, "loss": 1.3893, "step": 3122 }, { "epoch": 0.2190594275535686, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.4263, "step": 3136 }, { "epoch": 0.22003737142657562, "grad_norm": 0.318359375, "learning_rate": 0.001, "loss": 1.3931, "step": 3150 }, { "epoch": 0.22101531529958263, "grad_norm": 0.24609375, "learning_rate": 0.001, "loss": 1.3901, "step": 3164 }, { "epoch": 0.22199325917258964, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3805, "step": 3178 }, { "epoch": 0.22297120304559664, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.4044, "step": 3192 }, { "epoch": 0.22394914691860363, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.4046, "step": 3206 }, { "epoch": 0.22492709079161063, "grad_norm": 0.369140625, "learning_rate": 0.001, "loss": 1.4084, "step": 3220 }, { "epoch": 0.22590503466461764, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.3929, "step": 3234 }, { "epoch": 0.22688297853762465, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.3462, "step": 3248 }, { "epoch": 0.22786092241063163, "grad_norm": 0.29296875, "learning_rate": 0.001, "loss": 1.3721, "step": 3262 }, { "epoch": 0.22883886628363864, "grad_norm": 0.3359375, "learning_rate": 0.001, "loss": 1.4027, "step": 3276 }, { "epoch": 0.22981681015664565, "grad_norm": 0.2890625, "learning_rate": 0.001, "loss": 1.4081, "step": 3290 }, { "epoch": 0.23079475402965266, "grad_norm": 0.34765625, "learning_rate": 0.001, "loss": 1.3706, "step": 3304 }, { "epoch": 0.23177269790265967, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.3652, "step": 3318 }, { "epoch": 0.23275064177566665, "grad_norm": 0.4296875, "learning_rate": 0.001, "loss": 1.3623, "step": 3332 }, { "epoch": 0.23372858564867366, "grad_norm": 0.236328125, "learning_rate": 0.001, "loss": 1.37, "step": 3346 }, { "epoch": 0.23470652952168067, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.3627, "step": 3360 }, { "epoch": 0.23568447339468768, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.3979, "step": 3374 }, { "epoch": 0.23666241726769469, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.3967, "step": 3388 }, { "epoch": 0.23764036114070167, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.3696, "step": 3402 }, { "epoch": 0.23861830501370868, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.3867, "step": 3416 }, { "epoch": 0.23959624888671568, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.4115, "step": 3430 }, { "epoch": 0.2405741927597227, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.3501, "step": 3444 }, { "epoch": 0.24155213663272967, "grad_norm": 0.337890625, "learning_rate": 0.001, "loss": 1.377, "step": 3458 }, { "epoch": 0.24253008050573668, "grad_norm": 0.205078125, "learning_rate": 0.001, "loss": 1.3585, "step": 3472 }, { "epoch": 0.2435080243787437, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.36, "step": 3486 }, { "epoch": 0.2444859682517507, "grad_norm": 0.296875, "learning_rate": 0.001, "loss": 1.3759, "step": 3500 }, { "epoch": 0.2454639121247577, "grad_norm": 0.24609375, "learning_rate": 0.001, "loss": 1.3782, "step": 3514 }, { "epoch": 0.2464418559977647, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 1.3489, "step": 3528 }, { "epoch": 0.2474197998707717, "grad_norm": 1.203125, "learning_rate": 0.001, "loss": 1.4092, "step": 3542 }, { "epoch": 0.2483977437437787, "grad_norm": 0.38671875, "learning_rate": 0.001, "loss": 1.3806, "step": 3556 }, { "epoch": 0.24937568761678572, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.3973, "step": 3570 }, { "epoch": 0.2503536314897927, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.3725, "step": 3584 }, { "epoch": 0.2513315753627997, "grad_norm": 0.28125, "learning_rate": 0.001, "loss": 1.3523, "step": 3598 }, { "epoch": 0.2523095192358067, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 1.3929, "step": 3612 }, { "epoch": 0.2532874631088137, "grad_norm": 0.359375, "learning_rate": 0.001, "loss": 1.3541, "step": 3626 }, { "epoch": 0.2542654069818207, "grad_norm": 0.2197265625, "learning_rate": 0.001, "loss": 1.3471, "step": 3640 }, { "epoch": 0.25524335085482774, "grad_norm": 0.248046875, "learning_rate": 0.001, "loss": 1.3887, "step": 3654 }, { "epoch": 0.2562212947278347, "grad_norm": 0.38671875, "learning_rate": 0.001, "loss": 1.3675, "step": 3668 }, { "epoch": 0.2571992386008417, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3591, "step": 3682 }, { "epoch": 0.25817718247384874, "grad_norm": 0.462890625, "learning_rate": 0.001, "loss": 1.3813, "step": 3696 }, { "epoch": 0.2591551263468557, "grad_norm": 0.2099609375, "learning_rate": 0.001, "loss": 1.3555, "step": 3710 }, { "epoch": 0.26013307021986276, "grad_norm": 0.95703125, "learning_rate": 0.001, "loss": 1.3931, "step": 3724 }, { "epoch": 0.26111101409286974, "grad_norm": 0.60546875, "learning_rate": 0.001, "loss": 1.4361, "step": 3738 }, { "epoch": 0.2620889579658767, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3841, "step": 3752 }, { "epoch": 0.26306690183888376, "grad_norm": 0.2490234375, "learning_rate": 0.001, "loss": 1.3808, "step": 3766 }, { "epoch": 0.26404484571189074, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.3792, "step": 3780 }, { "epoch": 0.2650227895848978, "grad_norm": 0.2109375, "learning_rate": 0.001, "loss": 1.3488, "step": 3794 }, { "epoch": 0.26600073345790476, "grad_norm": 0.3359375, "learning_rate": 0.001, "loss": 1.3644, "step": 3808 }, { "epoch": 0.26697867733091174, "grad_norm": 0.359375, "learning_rate": 0.001, "loss": 1.3707, "step": 3822 }, { "epoch": 0.2679566212039188, "grad_norm": 0.2236328125, "learning_rate": 0.001, "loss": 1.3584, "step": 3836 }, { "epoch": 0.26893456507692576, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3605, "step": 3850 }, { "epoch": 0.26991250894993274, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.3703, "step": 3864 }, { "epoch": 0.2708904528229398, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.3507, "step": 3878 }, { "epoch": 0.27186839669594676, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.3601, "step": 3892 }, { "epoch": 0.2728463405689538, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3663, "step": 3906 }, { "epoch": 0.2738242844419608, "grad_norm": 0.2490234375, "learning_rate": 0.001, "loss": 1.3625, "step": 3920 }, { "epoch": 0.27480222831496776, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.3722, "step": 3934 }, { "epoch": 0.2757801721879748, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.342, "step": 3948 }, { "epoch": 0.2767581160609818, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.3682, "step": 3962 }, { "epoch": 0.2777360599339888, "grad_norm": 0.2373046875, "learning_rate": 0.001, "loss": 1.3662, "step": 3976 }, { "epoch": 0.2787140038069958, "grad_norm": 0.24609375, "learning_rate": 0.001, "loss": 1.3528, "step": 3990 }, { "epoch": 0.2796919476800028, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.3576, "step": 4004 }, { "epoch": 0.2806698915530098, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.3744, "step": 4018 }, { "epoch": 0.2816478354260168, "grad_norm": 0.2333984375, "learning_rate": 0.001, "loss": 1.3554, "step": 4032 }, { "epoch": 0.2826257792990238, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.3518, "step": 4046 }, { "epoch": 0.2836037231720308, "grad_norm": 0.25, "learning_rate": 0.001, "loss": 1.3595, "step": 4060 }, { "epoch": 0.2845816670450378, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.3361, "step": 4074 }, { "epoch": 0.2855596109180448, "grad_norm": 0.275390625, "learning_rate": 0.001, "loss": 1.3664, "step": 4088 }, { "epoch": 0.2865375547910518, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.3633, "step": 4102 }, { "epoch": 0.28751549866405884, "grad_norm": 0.2275390625, "learning_rate": 0.001, "loss": 1.3437, "step": 4116 }, { "epoch": 0.2884934425370658, "grad_norm": 0.1943359375, "learning_rate": 0.001, "loss": 1.3401, "step": 4130 }, { "epoch": 0.2894713864100728, "grad_norm": 0.2392578125, "learning_rate": 0.001, "loss": 1.3465, "step": 4144 }, { "epoch": 0.29044933028307984, "grad_norm": 0.5, "learning_rate": 0.001, "loss": 1.3517, "step": 4158 }, { "epoch": 0.2914272741560868, "grad_norm": 0.36328125, "learning_rate": 0.001, "loss": 1.3272, "step": 4172 }, { "epoch": 0.2924052180290938, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.3676, "step": 4186 }, { "epoch": 0.29338316190210084, "grad_norm": 0.216796875, "learning_rate": 0.001, "loss": 1.3501, "step": 4200 }, { "epoch": 0.2943611057751078, "grad_norm": 0.24609375, "learning_rate": 0.001, "loss": 1.3423, "step": 4214 }, { "epoch": 0.29533904964811486, "grad_norm": 0.3671875, "learning_rate": 0.001, "loss": 1.3396, "step": 4228 }, { "epoch": 0.29631699352112184, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.3416, "step": 4242 }, { "epoch": 0.2972949373941288, "grad_norm": 0.326171875, "learning_rate": 0.001, "loss": 1.3419, "step": 4256 }, { "epoch": 0.29827288126713586, "grad_norm": 0.3671875, "learning_rate": 0.001, "loss": 1.3128, "step": 4270 }, { "epoch": 0.29925082514014284, "grad_norm": 0.28125, "learning_rate": 0.001, "loss": 1.3528, "step": 4284 }, { "epoch": 0.29987950334421876, "eval_loss": 1.7873083353042603, "eval_runtime": 9.1065, "eval_samples_per_second": 109.812, "eval_steps_per_second": 1.428, "step": 4293 }, { "epoch": 0.3002287690131499, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.3496, "step": 4298 }, { "epoch": 0.30120671288615686, "grad_norm": 0.2451171875, "learning_rate": 0.001, "loss": 1.3369, "step": 4312 }, { "epoch": 0.30218465675916384, "grad_norm": 0.34765625, "learning_rate": 0.001, "loss": 1.3803, "step": 4326 }, { "epoch": 0.3031626006321709, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 1.324, "step": 4340 }, { "epoch": 0.30414054450517786, "grad_norm": 0.42578125, "learning_rate": 0.001, "loss": 1.3559, "step": 4354 }, { "epoch": 0.3051184883781849, "grad_norm": 0.373046875, "learning_rate": 0.001, "loss": 1.3166, "step": 4368 }, { "epoch": 0.3060964322511919, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.3376, "step": 4382 }, { "epoch": 0.30707437612419886, "grad_norm": 0.2197265625, "learning_rate": 0.001, "loss": 1.3155, "step": 4396 }, { "epoch": 0.3080523199972059, "grad_norm": 0.337890625, "learning_rate": 0.001, "loss": 1.3278, "step": 4410 }, { "epoch": 0.3090302638702129, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.34, "step": 4424 }, { "epoch": 0.31000820774321985, "grad_norm": 0.2431640625, "learning_rate": 0.001, "loss": 1.313, "step": 4438 }, { "epoch": 0.3109861516162269, "grad_norm": 0.359375, "learning_rate": 0.001, "loss": 1.3421, "step": 4452 }, { "epoch": 0.3119640954892339, "grad_norm": 0.423828125, "learning_rate": 0.001, "loss": 1.3327, "step": 4466 }, { "epoch": 0.3129420393622409, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.3345, "step": 4480 }, { "epoch": 0.3139199832352479, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.3157, "step": 4494 }, { "epoch": 0.31489792710825487, "grad_norm": 0.19921875, "learning_rate": 0.001, "loss": 1.334, "step": 4508 }, { "epoch": 0.3158758709812619, "grad_norm": 0.26171875, "learning_rate": 0.001, "loss": 1.3526, "step": 4522 }, { "epoch": 0.3168538148542689, "grad_norm": 0.35546875, "learning_rate": 0.001, "loss": 1.3565, "step": 4536 }, { "epoch": 0.3178317587272759, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.3343, "step": 4550 }, { "epoch": 0.3188097026002829, "grad_norm": 0.28125, "learning_rate": 0.001, "loss": 1.3372, "step": 4564 }, { "epoch": 0.3197876464732899, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.3105, "step": 4578 }, { "epoch": 0.3207655903462969, "grad_norm": 0.34765625, "learning_rate": 0.001, "loss": 1.3297, "step": 4592 }, { "epoch": 0.3217435342193039, "grad_norm": 0.341796875, "learning_rate": 0.001, "loss": 1.3549, "step": 4606 }, { "epoch": 0.32272147809231094, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 1.3499, "step": 4620 }, { "epoch": 0.3236994219653179, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 1.3642, "step": 4634 }, { "epoch": 0.3246773658383249, "grad_norm": 0.58984375, "learning_rate": 0.001, "loss": 1.3611, "step": 4648 }, { "epoch": 0.32565530971133194, "grad_norm": 0.3359375, "learning_rate": 0.001, "loss": 1.3255, "step": 4662 }, { "epoch": 0.3266332535843389, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.3255, "step": 4676 }, { "epoch": 0.3276111974573459, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.3431, "step": 4690 }, { "epoch": 0.32858914133035294, "grad_norm": 0.326171875, "learning_rate": 0.001, "loss": 1.322, "step": 4704 }, { "epoch": 0.3295670852033599, "grad_norm": 0.26171875, "learning_rate": 0.001, "loss": 1.3271, "step": 4718 }, { "epoch": 0.33054502907636696, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.3364, "step": 4732 }, { "epoch": 0.33152297294937394, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.3311, "step": 4746 }, { "epoch": 0.3325009168223809, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.3555, "step": 4760 }, { "epoch": 0.33347886069538796, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.329, "step": 4774 }, { "epoch": 0.33445680456839494, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.3264, "step": 4788 }, { "epoch": 0.335434748441402, "grad_norm": 0.279296875, "learning_rate": 0.001, "loss": 1.3142, "step": 4802 }, { "epoch": 0.33641269231440896, "grad_norm": 0.205078125, "learning_rate": 0.001, "loss": 1.3202, "step": 4816 }, { "epoch": 0.33739063618741594, "grad_norm": 0.234375, "learning_rate": 0.001, "loss": 1.3516, "step": 4830 }, { "epoch": 0.338368580060423, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3365, "step": 4844 }, { "epoch": 0.33934652393342996, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3334, "step": 4858 }, { "epoch": 0.340324467806437, "grad_norm": 0.421875, "learning_rate": 0.001, "loss": 1.3485, "step": 4872 }, { "epoch": 0.341302411679444, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.3343, "step": 4886 }, { "epoch": 0.34228035555245095, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.3395, "step": 4900 }, { "epoch": 0.343258299425458, "grad_norm": 0.359375, "learning_rate": 0.001, "loss": 1.3193, "step": 4914 }, { "epoch": 0.344236243298465, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.327, "step": 4928 }, { "epoch": 0.34521418717147195, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.2993, "step": 4942 }, { "epoch": 0.346192131044479, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 1.3182, "step": 4956 }, { "epoch": 0.34717007491748597, "grad_norm": 0.2333984375, "learning_rate": 0.001, "loss": 1.3409, "step": 4970 }, { "epoch": 0.348148018790493, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.3354, "step": 4984 }, { "epoch": 0.3491259626635, "grad_norm": 0.28125, "learning_rate": 0.001, "loss": 1.305, "step": 4998 }, { "epoch": 0.35010390653650697, "grad_norm": 0.2216796875, "learning_rate": 0.001, "loss": 1.314, "step": 5012 }, { "epoch": 0.351081850409514, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.3267, "step": 5026 }, { "epoch": 0.352059794282521, "grad_norm": 0.42578125, "learning_rate": 0.001, "loss": 1.3327, "step": 5040 }, { "epoch": 0.353037738155528, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.3113, "step": 5054 }, { "epoch": 0.354015682028535, "grad_norm": 0.3984375, "learning_rate": 0.001, "loss": 1.3293, "step": 5068 }, { "epoch": 0.354993625901542, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 1.3094, "step": 5082 }, { "epoch": 0.355971569774549, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.3053, "step": 5096 }, { "epoch": 0.356949513647556, "grad_norm": 0.2109375, "learning_rate": 0.001, "loss": 1.325, "step": 5110 }, { "epoch": 0.35792745752056304, "grad_norm": 0.337890625, "learning_rate": 0.001, "loss": 1.3392, "step": 5124 }, { "epoch": 0.35890540139357, "grad_norm": 0.412109375, "learning_rate": 0.001, "loss": 1.3363, "step": 5138 }, { "epoch": 0.359883345266577, "grad_norm": 0.59375, "learning_rate": 0.001, "loss": 1.3693, "step": 5152 }, { "epoch": 0.36086128913958404, "grad_norm": 0.390625, "learning_rate": 0.001, "loss": 1.3341, "step": 5166 }, { "epoch": 0.361839233012591, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.3104, "step": 5180 }, { "epoch": 0.362817176885598, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.3277, "step": 5194 }, { "epoch": 0.36379512075860504, "grad_norm": 0.2275390625, "learning_rate": 0.001, "loss": 1.3274, "step": 5208 }, { "epoch": 0.364773064631612, "grad_norm": 0.26953125, "learning_rate": 0.001, "loss": 1.3295, "step": 5222 }, { "epoch": 0.36575100850461906, "grad_norm": 0.2490234375, "learning_rate": 0.001, "loss": 1.334, "step": 5236 }, { "epoch": 0.36672895237762604, "grad_norm": 0.318359375, "learning_rate": 0.001, "loss": 1.3081, "step": 5250 }, { "epoch": 0.367706896250633, "grad_norm": 0.232421875, "learning_rate": 0.001, "loss": 1.2691, "step": 5264 }, { "epoch": 0.36868484012364006, "grad_norm": 0.47265625, "learning_rate": 0.001, "loss": 1.3241, "step": 5278 }, { "epoch": 0.36966278399664704, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.3015, "step": 5292 }, { "epoch": 0.3706407278696541, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.3269, "step": 5306 }, { "epoch": 0.37161867174266106, "grad_norm": 0.23046875, "learning_rate": 0.001, "loss": 1.3249, "step": 5320 }, { "epoch": 0.37259661561566804, "grad_norm": 0.384765625, "learning_rate": 0.001, "loss": 1.3091, "step": 5334 }, { "epoch": 0.3735745594886751, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.3251, "step": 5348 }, { "epoch": 0.37455250336168205, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2866, "step": 5362 }, { "epoch": 0.3755304472346891, "grad_norm": 0.29296875, "learning_rate": 0.001, "loss": 1.2987, "step": 5376 }, { "epoch": 0.37650839110769607, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.3095, "step": 5390 }, { "epoch": 0.37748633498070305, "grad_norm": 0.236328125, "learning_rate": 0.001, "loss": 1.304, "step": 5404 }, { "epoch": 0.3784642788537101, "grad_norm": 0.21484375, "learning_rate": 0.001, "loss": 1.3158, "step": 5418 }, { "epoch": 0.37944222272671707, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.3164, "step": 5432 }, { "epoch": 0.3804201665997241, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.3078, "step": 5446 }, { "epoch": 0.3813981104727311, "grad_norm": 0.365234375, "learning_rate": 0.001, "loss": 1.3603, "step": 5460 }, { "epoch": 0.38237605434573807, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.3174, "step": 5474 }, { "epoch": 0.3833539982187451, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.3204, "step": 5488 }, { "epoch": 0.3843319420917521, "grad_norm": 0.33203125, "learning_rate": 0.001, "loss": 1.3005, "step": 5502 }, { "epoch": 0.38530988596475907, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 1.3396, "step": 5516 }, { "epoch": 0.3862878298377661, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.3231, "step": 5530 }, { "epoch": 0.3872657737107731, "grad_norm": 0.375, "learning_rate": 0.001, "loss": 1.3353, "step": 5544 }, { "epoch": 0.3882437175837801, "grad_norm": 0.33203125, "learning_rate": 0.001, "loss": 1.3314, "step": 5558 }, { "epoch": 0.3892216614567871, "grad_norm": 0.35546875, "learning_rate": 0.001, "loss": 1.3228, "step": 5572 }, { "epoch": 0.3901996053297941, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 1.3034, "step": 5586 }, { "epoch": 0.3911775492028011, "grad_norm": 0.349609375, "learning_rate": 0.001, "loss": 1.3307, "step": 5600 }, { "epoch": 0.3921554930758081, "grad_norm": 0.26171875, "learning_rate": 0.001, "loss": 1.2888, "step": 5614 }, { "epoch": 0.39313343694881514, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.3107, "step": 5628 }, { "epoch": 0.3941113808218221, "grad_norm": 0.3984375, "learning_rate": 0.001, "loss": 1.32, "step": 5642 }, { "epoch": 0.3950893246948291, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.3383, "step": 5656 }, { "epoch": 0.39606726856783614, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.2903, "step": 5670 }, { "epoch": 0.3970452124408431, "grad_norm": 0.330078125, "learning_rate": 0.001, "loss": 1.3338, "step": 5684 }, { "epoch": 0.39802315631385016, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.3371, "step": 5698 }, { "epoch": 0.39900110018685714, "grad_norm": 0.2265625, "learning_rate": 0.001, "loss": 1.3169, "step": 5712 }, { "epoch": 0.3998393377922917, "eval_loss": 1.7229478359222412, "eval_runtime": 9.1252, "eval_samples_per_second": 109.587, "eval_steps_per_second": 1.425, "step": 5724 }, { "epoch": 0.3999790440598641, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.3098, "step": 5726 }, { "epoch": 0.40095698793287116, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.3131, "step": 5740 }, { "epoch": 0.40193493180587814, "grad_norm": 0.408203125, "learning_rate": 0.001, "loss": 1.3199, "step": 5754 }, { "epoch": 0.4029128756788851, "grad_norm": 0.46875, "learning_rate": 0.001, "loss": 1.3238, "step": 5768 }, { "epoch": 0.40389081955189216, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 1.3192, "step": 5782 }, { "epoch": 0.40486876342489914, "grad_norm": 0.31640625, "learning_rate": 0.001, "loss": 1.3274, "step": 5796 }, { "epoch": 0.4058467072979062, "grad_norm": 0.318359375, "learning_rate": 0.001, "loss": 1.2836, "step": 5810 }, { "epoch": 0.40682465117091315, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.3021, "step": 5824 }, { "epoch": 0.40780259504392014, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.2903, "step": 5838 }, { "epoch": 0.40878053891692717, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.2965, "step": 5852 }, { "epoch": 0.40975848278993415, "grad_norm": 0.2421875, "learning_rate": 0.001, "loss": 1.2816, "step": 5866 }, { "epoch": 0.4107364266629412, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.3082, "step": 5880 }, { "epoch": 0.41171437053594817, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.3229, "step": 5894 }, { "epoch": 0.41269231440895515, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.3056, "step": 5908 }, { "epoch": 0.4136702582819622, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.2618, "step": 5922 }, { "epoch": 0.41464820215496917, "grad_norm": 0.24609375, "learning_rate": 0.001, "loss": 1.29, "step": 5936 }, { "epoch": 0.4156261460279762, "grad_norm": 0.36328125, "learning_rate": 0.001, "loss": 1.3132, "step": 5950 }, { "epoch": 0.4166040899009832, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.3083, "step": 5964 }, { "epoch": 0.41758203377399017, "grad_norm": 0.326171875, "learning_rate": 0.001, "loss": 1.3118, "step": 5978 }, { "epoch": 0.4185599776469972, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.3105, "step": 5992 }, { "epoch": 0.4195379215200042, "grad_norm": 0.25, "learning_rate": 0.001, "loss": 1.3032, "step": 6006 }, { "epoch": 0.42051586539301117, "grad_norm": 0.232421875, "learning_rate": 0.001, "loss": 1.2815, "step": 6020 }, { "epoch": 0.4214938092660182, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2913, "step": 6034 }, { "epoch": 0.4224717531390252, "grad_norm": 0.2353515625, "learning_rate": 0.001, "loss": 1.3015, "step": 6048 }, { "epoch": 0.4234496970120322, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.3072, "step": 6062 }, { "epoch": 0.4244276408850392, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.3412, "step": 6076 }, { "epoch": 0.4254055847580462, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.3024, "step": 6090 }, { "epoch": 0.4263835286310532, "grad_norm": 0.2255859375, "learning_rate": 0.001, "loss": 1.3124, "step": 6104 }, { "epoch": 0.4273614725040602, "grad_norm": 0.55078125, "learning_rate": 0.001, "loss": 1.3066, "step": 6118 }, { "epoch": 0.42833941637706724, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.3194, "step": 6132 }, { "epoch": 0.4293173602500742, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.2943, "step": 6146 }, { "epoch": 0.4302953041230812, "grad_norm": 0.279296875, "learning_rate": 0.001, "loss": 1.2726, "step": 6160 }, { "epoch": 0.43127324799608824, "grad_norm": 0.2158203125, "learning_rate": 0.001, "loss": 1.2693, "step": 6174 }, { "epoch": 0.4322511918690952, "grad_norm": 0.240234375, "learning_rate": 0.001, "loss": 1.2966, "step": 6188 }, { "epoch": 0.43322913574210226, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.3144, "step": 6202 }, { "epoch": 0.43420707961510924, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.2774, "step": 6216 }, { "epoch": 0.4351850234881162, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.3109, "step": 6230 }, { "epoch": 0.43616296736112325, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.2897, "step": 6244 }, { "epoch": 0.43714091123413024, "grad_norm": 0.216796875, "learning_rate": 0.001, "loss": 1.2796, "step": 6258 }, { "epoch": 0.4381188551071372, "grad_norm": 0.3125, "learning_rate": 0.001, "loss": 1.3235, "step": 6272 }, { "epoch": 0.43909679898014425, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.2996, "step": 6286 }, { "epoch": 0.44007474285315124, "grad_norm": 0.32421875, "learning_rate": 0.001, "loss": 1.3074, "step": 6300 }, { "epoch": 0.44105268672615827, "grad_norm": 0.34375, "learning_rate": 0.001, "loss": 1.2851, "step": 6314 }, { "epoch": 0.44203063059916525, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.3315, "step": 6328 }, { "epoch": 0.44300857447217223, "grad_norm": 0.2421875, "learning_rate": 0.001, "loss": 1.3049, "step": 6342 }, { "epoch": 0.44398651834517927, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.3186, "step": 6356 }, { "epoch": 0.44496446221818625, "grad_norm": 0.392578125, "learning_rate": 0.001, "loss": 1.276, "step": 6370 }, { "epoch": 0.4459424060911933, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 1.2827, "step": 6384 }, { "epoch": 0.44692034996420027, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.2925, "step": 6398 }, { "epoch": 0.44789829383720725, "grad_norm": 0.2294921875, "learning_rate": 0.001, "loss": 1.2788, "step": 6412 }, { "epoch": 0.4488762377102143, "grad_norm": 0.32421875, "learning_rate": 0.001, "loss": 1.2982, "step": 6426 }, { "epoch": 0.44985418158322127, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.2977, "step": 6440 }, { "epoch": 0.4508321254562283, "grad_norm": 0.349609375, "learning_rate": 0.001, "loss": 1.3041, "step": 6454 }, { "epoch": 0.4518100693292353, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.2828, "step": 6468 }, { "epoch": 0.45278801320224227, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.2942, "step": 6482 }, { "epoch": 0.4537659570752493, "grad_norm": 0.26953125, "learning_rate": 0.001, "loss": 1.2626, "step": 6496 }, { "epoch": 0.4547439009482563, "grad_norm": 0.32421875, "learning_rate": 0.001, "loss": 1.2788, "step": 6510 }, { "epoch": 0.45572184482126327, "grad_norm": 0.349609375, "learning_rate": 0.001, "loss": 1.2733, "step": 6524 }, { "epoch": 0.4566997886942703, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.3146, "step": 6538 }, { "epoch": 0.4576777325672773, "grad_norm": 0.302734375, "learning_rate": 0.001, "loss": 1.3027, "step": 6552 }, { "epoch": 0.4586556764402843, "grad_norm": 0.28125, "learning_rate": 0.001, "loss": 1.2753, "step": 6566 }, { "epoch": 0.4596336203132913, "grad_norm": 0.439453125, "learning_rate": 0.001, "loss": 1.2955, "step": 6580 }, { "epoch": 0.4606115641862983, "grad_norm": 0.353515625, "learning_rate": 0.001, "loss": 1.3105, "step": 6594 }, { "epoch": 0.4615895080593053, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.2798, "step": 6608 }, { "epoch": 0.4625674519323123, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.2983, "step": 6622 }, { "epoch": 0.46354539580531934, "grad_norm": 0.52734375, "learning_rate": 0.001, "loss": 1.3166, "step": 6636 }, { "epoch": 0.4645233396783263, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.287, "step": 6650 }, { "epoch": 0.4655012835513333, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.28, "step": 6664 }, { "epoch": 0.46647922742434034, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.2896, "step": 6678 }, { "epoch": 0.4674571712973473, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.2933, "step": 6692 }, { "epoch": 0.46843511517035435, "grad_norm": 0.34375, "learning_rate": 0.001, "loss": 1.2602, "step": 6706 }, { "epoch": 0.46941305904336134, "grad_norm": 0.25, "learning_rate": 0.001, "loss": 1.267, "step": 6720 }, { "epoch": 0.4703910029163683, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.2745, "step": 6734 }, { "epoch": 0.47136894678937535, "grad_norm": 0.453125, "learning_rate": 0.001, "loss": 1.3102, "step": 6748 }, { "epoch": 0.47234689066238233, "grad_norm": 0.36328125, "learning_rate": 0.001, "loss": 1.2844, "step": 6762 }, { "epoch": 0.47332483453538937, "grad_norm": 0.404296875, "learning_rate": 0.001, "loss": 1.2787, "step": 6776 }, { "epoch": 0.47430277840839635, "grad_norm": 0.41796875, "learning_rate": 0.001, "loss": 1.2921, "step": 6790 }, { "epoch": 0.47528072228140333, "grad_norm": 0.447265625, "learning_rate": 0.001, "loss": 1.3157, "step": 6804 }, { "epoch": 0.47625866615441037, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.2932, "step": 6818 }, { "epoch": 0.47723661002741735, "grad_norm": 0.296875, "learning_rate": 0.001, "loss": 1.3011, "step": 6832 }, { "epoch": 0.47821455390042433, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.2883, "step": 6846 }, { "epoch": 0.47919249777343137, "grad_norm": 0.2431640625, "learning_rate": 0.001, "loss": 1.3087, "step": 6860 }, { "epoch": 0.48017044164643835, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.2855, "step": 6874 }, { "epoch": 0.4811483855194454, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.3106, "step": 6888 }, { "epoch": 0.48212632939245237, "grad_norm": 0.29296875, "learning_rate": 0.001, "loss": 1.2869, "step": 6902 }, { "epoch": 0.48310427326545935, "grad_norm": 0.53125, "learning_rate": 0.001, "loss": 1.2852, "step": 6916 }, { "epoch": 0.4840822171384664, "grad_norm": 0.46484375, "learning_rate": 0.001, "loss": 1.2995, "step": 6930 }, { "epoch": 0.48506016101147337, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.3098, "step": 6944 }, { "epoch": 0.4860381048844804, "grad_norm": 0.322265625, "learning_rate": 0.001, "loss": 1.2687, "step": 6958 }, { "epoch": 0.4870160487574874, "grad_norm": 0.400390625, "learning_rate": 0.001, "loss": 1.2885, "step": 6972 }, { "epoch": 0.48799399263049437, "grad_norm": 0.302734375, "learning_rate": 0.001, "loss": 1.3135, "step": 6986 }, { "epoch": 0.4889719365035014, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.2776, "step": 7000 }, { "epoch": 0.4899498803765084, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2761, "step": 7014 }, { "epoch": 0.4909278242495154, "grad_norm": 0.326171875, "learning_rate": 0.001, "loss": 1.2805, "step": 7028 }, { "epoch": 0.4919057681225224, "grad_norm": 0.310546875, "learning_rate": 0.001, "loss": 1.2836, "step": 7042 }, { "epoch": 0.4928837119955294, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.3029, "step": 7056 }, { "epoch": 0.4938616558685364, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.2929, "step": 7070 }, { "epoch": 0.4948395997415434, "grad_norm": 0.80859375, "learning_rate": 0.001, "loss": 1.2995, "step": 7084 }, { "epoch": 0.4958175436145504, "grad_norm": 0.4609375, "learning_rate": 0.001, "loss": 1.2788, "step": 7098 }, { "epoch": 0.4967954874875574, "grad_norm": 0.279296875, "learning_rate": 0.001, "loss": 1.276, "step": 7112 }, { "epoch": 0.4977734313605644, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.2883, "step": 7126 }, { "epoch": 0.49875137523357144, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.2803, "step": 7140 }, { "epoch": 0.4997293191065784, "grad_norm": 0.22265625, "learning_rate": 0.001, "loss": 1.2576, "step": 7154 }, { "epoch": 0.49979917224036463, "eval_loss": 1.6829583644866943, "eval_runtime": 9.1256, "eval_samples_per_second": 109.582, "eval_steps_per_second": 1.425, "step": 7155 }, { "epoch": 0.5007072629795855, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.2987, "step": 7168 }, { "epoch": 0.5016852068525924, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.2759, "step": 7182 }, { "epoch": 0.5026631507255994, "grad_norm": 0.376953125, "learning_rate": 0.001, "loss": 1.2814, "step": 7196 }, { "epoch": 0.5036410945986064, "grad_norm": 0.359375, "learning_rate": 0.001, "loss": 1.2701, "step": 7210 }, { "epoch": 0.5046190384716134, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.2857, "step": 7224 }, { "epoch": 0.5055969823446205, "grad_norm": 0.234375, "learning_rate": 0.001, "loss": 1.2707, "step": 7238 }, { "epoch": 0.5065749262176275, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.2851, "step": 7252 }, { "epoch": 0.5075528700906344, "grad_norm": 0.337890625, "learning_rate": 0.001, "loss": 1.2722, "step": 7266 }, { "epoch": 0.5085308139636414, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.277, "step": 7280 }, { "epoch": 0.5095087578366484, "grad_norm": 0.33203125, "learning_rate": 0.001, "loss": 1.3021, "step": 7294 }, { "epoch": 0.5104867017096555, "grad_norm": 0.2314453125, "learning_rate": 0.001, "loss": 1.2856, "step": 7308 }, { "epoch": 0.5114646455826625, "grad_norm": 0.25, "learning_rate": 0.001, "loss": 1.2704, "step": 7322 }, { "epoch": 0.5124425894556695, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.2837, "step": 7336 }, { "epoch": 0.5134205333286764, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.277, "step": 7350 }, { "epoch": 0.5143984772016834, "grad_norm": 0.2734375, "learning_rate": 0.001, "loss": 1.2838, "step": 7364 }, { "epoch": 0.5153764210746905, "grad_norm": 0.296875, "learning_rate": 0.001, "loss": 1.2762, "step": 7378 }, { "epoch": 0.5163543649476975, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.2749, "step": 7392 }, { "epoch": 0.5173323088207045, "grad_norm": 0.28125, "learning_rate": 0.001, "loss": 1.2791, "step": 7406 }, { "epoch": 0.5183102526937114, "grad_norm": 0.400390625, "learning_rate": 0.001, "loss": 1.2708, "step": 7420 }, { "epoch": 0.5192881965667184, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.2773, "step": 7434 }, { "epoch": 0.5202661404397255, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.2783, "step": 7448 }, { "epoch": 0.5212440843127325, "grad_norm": 0.341796875, "learning_rate": 0.001, "loss": 1.2944, "step": 7462 }, { "epoch": 0.5222220281857395, "grad_norm": 0.369140625, "learning_rate": 0.001, "loss": 1.2714, "step": 7476 }, { "epoch": 0.5231999720587465, "grad_norm": 0.2421875, "learning_rate": 0.001, "loss": 1.2711, "step": 7490 }, { "epoch": 0.5241779159317534, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 1.2808, "step": 7504 }, { "epoch": 0.5251558598047605, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2765, "step": 7518 }, { "epoch": 0.5261338036777675, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.2702, "step": 7532 }, { "epoch": 0.5271117475507745, "grad_norm": 0.26953125, "learning_rate": 0.001, "loss": 1.2802, "step": 7546 }, { "epoch": 0.5280896914237815, "grad_norm": 0.57421875, "learning_rate": 0.001, "loss": 1.2733, "step": 7560 }, { "epoch": 0.5290676352967885, "grad_norm": 0.494140625, "learning_rate": 0.001, "loss": 1.2575, "step": 7574 }, { "epoch": 0.5300455791697956, "grad_norm": 0.376953125, "learning_rate": 0.001, "loss": 1.2863, "step": 7588 }, { "epoch": 0.5310235230428025, "grad_norm": 0.369140625, "learning_rate": 0.001, "loss": 1.2815, "step": 7602 }, { "epoch": 0.5320014669158095, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.2745, "step": 7616 }, { "epoch": 0.5329794107888165, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.294, "step": 7630 }, { "epoch": 0.5339573546618235, "grad_norm": 0.2265625, "learning_rate": 0.001, "loss": 1.2797, "step": 7644 }, { "epoch": 0.5349352985348306, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.2665, "step": 7658 }, { "epoch": 0.5359132424078376, "grad_norm": 0.458984375, "learning_rate": 0.001, "loss": 1.28, "step": 7672 }, { "epoch": 0.5368911862808445, "grad_norm": 0.359375, "learning_rate": 0.001, "loss": 1.3057, "step": 7686 }, { "epoch": 0.5378691301538515, "grad_norm": 0.37109375, "learning_rate": 0.001, "loss": 1.258, "step": 7700 }, { "epoch": 0.5388470740268585, "grad_norm": 0.33203125, "learning_rate": 0.001, "loss": 1.2742, "step": 7714 }, { "epoch": 0.5398250178998655, "grad_norm": 0.365234375, "learning_rate": 0.001, "loss": 1.277, "step": 7728 }, { "epoch": 0.5408029617728726, "grad_norm": 0.412109375, "learning_rate": 0.001, "loss": 1.2819, "step": 7742 }, { "epoch": 0.5417809056458796, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.3018, "step": 7756 }, { "epoch": 0.5427588495188865, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2619, "step": 7770 }, { "epoch": 0.5437367933918935, "grad_norm": 0.296875, "learning_rate": 0.001, "loss": 1.2513, "step": 7784 }, { "epoch": 0.5447147372649005, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2456, "step": 7798 }, { "epoch": 0.5456926811379076, "grad_norm": 0.62109375, "learning_rate": 0.001, "loss": 1.2768, "step": 7812 }, { "epoch": 0.5466706250109146, "grad_norm": 0.44140625, "learning_rate": 0.001, "loss": 1.265, "step": 7826 }, { "epoch": 0.5476485688839215, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.2683, "step": 7840 }, { "epoch": 0.5486265127569285, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.2809, "step": 7854 }, { "epoch": 0.5496044566299355, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2498, "step": 7868 }, { "epoch": 0.5505824005029426, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.2632, "step": 7882 }, { "epoch": 0.5515603443759496, "grad_norm": 0.29296875, "learning_rate": 0.001, "loss": 1.2711, "step": 7896 }, { "epoch": 0.5525382882489566, "grad_norm": 0.373046875, "learning_rate": 0.001, "loss": 1.2813, "step": 7910 }, { "epoch": 0.5535162321219635, "grad_norm": 0.427734375, "learning_rate": 0.001, "loss": 1.2993, "step": 7924 }, { "epoch": 0.5544941759949705, "grad_norm": 0.373046875, "learning_rate": 0.001, "loss": 1.3001, "step": 7938 }, { "epoch": 0.5554721198679776, "grad_norm": 0.416015625, "learning_rate": 0.001, "loss": 1.2786, "step": 7952 }, { "epoch": 0.5564500637409846, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.2976, "step": 7966 }, { "epoch": 0.5574280076139916, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.286, "step": 7980 }, { "epoch": 0.5584059514869986, "grad_norm": 0.59765625, "learning_rate": 0.001, "loss": 1.282, "step": 7994 }, { "epoch": 0.5593838953600055, "grad_norm": 0.244140625, "learning_rate": 0.001, "loss": 1.2853, "step": 8008 }, { "epoch": 0.5603618392330126, "grad_norm": 0.2265625, "learning_rate": 0.001, "loss": 1.2572, "step": 8022 }, { "epoch": 0.5613397831060196, "grad_norm": 0.306640625, "learning_rate": 0.001, "loss": 1.2572, "step": 8036 }, { "epoch": 0.5623177269790266, "grad_norm": 0.63671875, "learning_rate": 0.001, "loss": 1.315, "step": 8050 }, { "epoch": 0.5632956708520336, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.3007, "step": 8064 }, { "epoch": 0.5642736147250406, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2737, "step": 8078 }, { "epoch": 0.5652515585980477, "grad_norm": 0.24609375, "learning_rate": 0.001, "loss": 1.2766, "step": 8092 }, { "epoch": 0.5662295024710546, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.3102, "step": 8106 }, { "epoch": 0.5672074463440616, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.3044, "step": 8120 }, { "epoch": 0.5681853902170686, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.2612, "step": 8134 }, { "epoch": 0.5691633340900756, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.2701, "step": 8148 }, { "epoch": 0.5701412779630827, "grad_norm": 0.21875, "learning_rate": 0.001, "loss": 1.2649, "step": 8162 }, { "epoch": 0.5711192218360897, "grad_norm": 0.2236328125, "learning_rate": 0.001, "loss": 1.2761, "step": 8176 }, { "epoch": 0.5720971657090966, "grad_norm": 0.29296875, "learning_rate": 0.001, "loss": 1.2668, "step": 8190 }, { "epoch": 0.5730751095821036, "grad_norm": 0.31640625, "learning_rate": 0.001, "loss": 1.2847, "step": 8204 }, { "epoch": 0.5740530534551106, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.2722, "step": 8218 }, { "epoch": 0.5750309973281177, "grad_norm": 0.224609375, "learning_rate": 0.001, "loss": 1.253, "step": 8232 }, { "epoch": 0.5760089412011247, "grad_norm": 0.2890625, "learning_rate": 0.001, "loss": 1.2454, "step": 8246 }, { "epoch": 0.5769868850741317, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.2558, "step": 8260 }, { "epoch": 0.5779648289471386, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.2765, "step": 8274 }, { "epoch": 0.5789427728201456, "grad_norm": 0.314453125, "learning_rate": 0.001, "loss": 1.289, "step": 8288 }, { "epoch": 0.5799207166931526, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.2724, "step": 8302 }, { "epoch": 0.5808986605661597, "grad_norm": 0.44140625, "learning_rate": 0.001, "loss": 1.2753, "step": 8316 }, { "epoch": 0.5818766044391667, "grad_norm": 0.326171875, "learning_rate": 0.001, "loss": 1.2558, "step": 8330 }, { "epoch": 0.5828545483121736, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2697, "step": 8344 }, { "epoch": 0.5838324921851806, "grad_norm": 0.34375, "learning_rate": 0.001, "loss": 1.2685, "step": 8358 }, { "epoch": 0.5848104360581876, "grad_norm": 0.275390625, "learning_rate": 0.001, "loss": 1.2724, "step": 8372 }, { "epoch": 0.5857883799311947, "grad_norm": 0.2255859375, "learning_rate": 0.001, "loss": 1.2287, "step": 8386 }, { "epoch": 0.5867663238042017, "grad_norm": 0.212890625, "learning_rate": 0.001, "loss": 1.2363, "step": 8400 }, { "epoch": 0.5877442676772087, "grad_norm": 0.279296875, "learning_rate": 0.001, "loss": 1.2648, "step": 8414 }, { "epoch": 0.5887222115502156, "grad_norm": 0.427734375, "learning_rate": 0.001, "loss": 1.2949, "step": 8428 }, { "epoch": 0.5897001554232226, "grad_norm": 0.23828125, "learning_rate": 0.001, "loss": 1.2571, "step": 8442 }, { "epoch": 0.5906780992962297, "grad_norm": 0.349609375, "learning_rate": 0.001, "loss": 1.2831, "step": 8456 }, { "epoch": 0.5916560431692367, "grad_norm": 0.34765625, "learning_rate": 0.001, "loss": 1.2965, "step": 8470 }, { "epoch": 0.5926339870422437, "grad_norm": 0.412109375, "learning_rate": 0.001, "loss": 1.2685, "step": 8484 }, { "epoch": 0.5936119309152507, "grad_norm": 0.439453125, "learning_rate": 0.001, "loss": 1.2637, "step": 8498 }, { "epoch": 0.5945898747882576, "grad_norm": 0.3671875, "learning_rate": 0.001, "loss": 1.28, "step": 8512 }, { "epoch": 0.5955678186612647, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 1.2636, "step": 8526 }, { "epoch": 0.5965457625342717, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.251, "step": 8540 }, { "epoch": 0.5975237064072787, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.262, "step": 8554 }, { "epoch": 0.5985016502802857, "grad_norm": 0.365234375, "learning_rate": 0.001, "loss": 1.2696, "step": 8568 }, { "epoch": 0.5994795941532927, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.2872, "step": 8582 }, { "epoch": 0.5997590066884375, "eval_loss": 1.661841869354248, "eval_runtime": 9.1193, "eval_samples_per_second": 109.657, "eval_steps_per_second": 1.426, "step": 8586 }, { "epoch": 0.6004575380262998, "grad_norm": 0.2451171875, "learning_rate": 0.001, "loss": 1.2767, "step": 8596 }, { "epoch": 0.6014354818993067, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.2623, "step": 8610 }, { "epoch": 0.6024134257723137, "grad_norm": 0.26953125, "learning_rate": 0.001, "loss": 1.2617, "step": 8624 }, { "epoch": 0.6033913696453207, "grad_norm": 0.25390625, "learning_rate": 0.001, "loss": 1.2514, "step": 8638 }, { "epoch": 0.6043693135183277, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.2664, "step": 8652 }, { "epoch": 0.6053472573913348, "grad_norm": 0.357421875, "learning_rate": 0.001, "loss": 1.2421, "step": 8666 }, { "epoch": 0.6063252012643418, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2386, "step": 8680 }, { "epoch": 0.6073031451373487, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.2601, "step": 8694 }, { "epoch": 0.6082810890103557, "grad_norm": 0.94921875, "learning_rate": 0.001, "loss": 1.2715, "step": 8708 }, { "epoch": 0.6092590328833627, "grad_norm": 0.43359375, "learning_rate": 0.001, "loss": 1.2848, "step": 8722 }, { "epoch": 0.6102369767563698, "grad_norm": 0.34375, "learning_rate": 0.001, "loss": 1.2632, "step": 8736 }, { "epoch": 0.6112149206293768, "grad_norm": 0.283203125, "learning_rate": 0.001, "loss": 1.2912, "step": 8750 }, { "epoch": 0.6121928645023837, "grad_norm": 0.388671875, "learning_rate": 0.001, "loss": 1.2613, "step": 8764 }, { "epoch": 0.6131708083753907, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.2357, "step": 8778 }, { "epoch": 0.6141487522483977, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.2541, "step": 8792 }, { "epoch": 0.6151266961214047, "grad_norm": 0.32421875, "learning_rate": 0.001, "loss": 1.2746, "step": 8806 }, { "epoch": 0.6161046399944118, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2445, "step": 8820 }, { "epoch": 0.6170825838674188, "grad_norm": 0.255859375, "learning_rate": 0.001, "loss": 1.2854, "step": 8834 }, { "epoch": 0.6180605277404257, "grad_norm": 0.54296875, "learning_rate": 0.001, "loss": 1.2746, "step": 8848 }, { "epoch": 0.6190384716134327, "grad_norm": 0.35546875, "learning_rate": 0.001, "loss": 1.2837, "step": 8862 }, { "epoch": 0.6200164154864397, "grad_norm": 0.341796875, "learning_rate": 0.001, "loss": 1.2682, "step": 8876 }, { "epoch": 0.6209943593594468, "grad_norm": 0.51953125, "learning_rate": 0.001, "loss": 1.2751, "step": 8890 }, { "epoch": 0.6219723032324538, "grad_norm": 0.41015625, "learning_rate": 0.001, "loss": 1.2666, "step": 8904 }, { "epoch": 0.6229502471054608, "grad_norm": 0.4140625, "learning_rate": 0.001, "loss": 1.2618, "step": 8918 }, { "epoch": 0.6239281909784677, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.2721, "step": 8932 }, { "epoch": 0.6249061348514747, "grad_norm": 0.3515625, "learning_rate": 0.001, "loss": 1.2528, "step": 8946 }, { "epoch": 0.6258840787244818, "grad_norm": 0.34375, "learning_rate": 0.001, "loss": 1.2771, "step": 8960 }, { "epoch": 0.6268620225974888, "grad_norm": 0.275390625, "learning_rate": 0.001, "loss": 1.2751, "step": 8974 }, { "epoch": 0.6278399664704958, "grad_norm": 0.28515625, "learning_rate": 0.001, "loss": 1.2749, "step": 8988 }, { "epoch": 0.6288179103435028, "grad_norm": 0.27734375, "learning_rate": 0.001, "loss": 1.2851, "step": 9002 }, { "epoch": 0.6297958542165097, "grad_norm": 0.23828125, "learning_rate": 0.001, "loss": 1.2529, "step": 9016 }, { "epoch": 0.6307737980895168, "grad_norm": 0.2890625, "learning_rate": 0.001, "loss": 1.2673, "step": 9030 }, { "epoch": 0.6317517419625238, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.2746, "step": 9044 }, { "epoch": 0.6327296858355308, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.2649, "step": 9058 }, { "epoch": 0.6337076297085378, "grad_norm": 0.318359375, "learning_rate": 0.001, "loss": 1.2849, "step": 9072 }, { "epoch": 0.6346855735815448, "grad_norm": 0.30078125, "learning_rate": 0.001, "loss": 1.243, "step": 9086 }, { "epoch": 0.6356635174545519, "grad_norm": 0.375, "learning_rate": 0.001, "loss": 1.2641, "step": 9100 }, { "epoch": 0.6366414613275588, "grad_norm": 0.361328125, "learning_rate": 0.001, "loss": 1.2554, "step": 9114 }, { "epoch": 0.6376194052005658, "grad_norm": 0.396484375, "learning_rate": 0.001, "loss": 1.2396, "step": 9128 }, { "epoch": 0.6385973490735728, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2508, "step": 9142 }, { "epoch": 0.6395752929465798, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.2772, "step": 9156 }, { "epoch": 0.6405532368195869, "grad_norm": 0.53515625, "learning_rate": 0.001, "loss": 1.2453, "step": 9170 }, { "epoch": 0.6415311806925938, "grad_norm": 0.2099609375, "learning_rate": 0.001, "loss": 1.2764, "step": 9184 }, { "epoch": 0.6425091245656008, "grad_norm": 0.2333984375, "learning_rate": 0.001, "loss": 1.251, "step": 9198 }, { "epoch": 0.6434870684386078, "grad_norm": 0.35546875, "learning_rate": 0.001, "loss": 1.2855, "step": 9212 }, { "epoch": 0.6444650123116148, "grad_norm": 1.1953125, "learning_rate": 0.001, "loss": 1.3198, "step": 9226 }, { "epoch": 0.6454429561846219, "grad_norm": 0.427734375, "learning_rate": 0.001, "loss": 1.2773, "step": 9240 }, { "epoch": 0.6464209000576289, "grad_norm": 0.5703125, "learning_rate": 0.001, "loss": 1.2786, "step": 9254 }, { "epoch": 0.6473988439306358, "grad_norm": 0.3125, "learning_rate": 0.001, "loss": 1.2389, "step": 9268 }, { "epoch": 0.6483767878036428, "grad_norm": 0.36328125, "learning_rate": 0.001, "loss": 1.2587, "step": 9282 }, { "epoch": 0.6493547316766498, "grad_norm": 0.380859375, "learning_rate": 0.001, "loss": 1.2806, "step": 9296 }, { "epoch": 0.6503326755496568, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.2307, "step": 9310 }, { "epoch": 0.6513106194226639, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2657, "step": 9324 }, { "epoch": 0.6522885632956709, "grad_norm": 0.263671875, "learning_rate": 0.001, "loss": 1.2605, "step": 9338 }, { "epoch": 0.6532665071686778, "grad_norm": 0.2412109375, "learning_rate": 0.001, "loss": 1.2538, "step": 9352 }, { "epoch": 0.6542444510416848, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.2633, "step": 9366 }, { "epoch": 0.6552223949146918, "grad_norm": 0.31640625, "learning_rate": 0.001, "loss": 1.2582, "step": 9380 }, { "epoch": 0.6562003387876989, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.2515, "step": 9394 }, { "epoch": 0.6571782826607059, "grad_norm": 0.33984375, "learning_rate": 0.001, "loss": 1.2679, "step": 9408 }, { "epoch": 0.6581562265337129, "grad_norm": 0.3828125, "learning_rate": 0.001, "loss": 1.2539, "step": 9422 }, { "epoch": 0.6591341704067198, "grad_norm": 0.26171875, "learning_rate": 0.001, "loss": 1.2632, "step": 9436 }, { "epoch": 0.6601121142797268, "grad_norm": 0.3203125, "learning_rate": 0.001, "loss": 1.2946, "step": 9450 }, { "epoch": 0.6610900581527339, "grad_norm": 0.38671875, "learning_rate": 0.001, "loss": 1.2691, "step": 9464 }, { "epoch": 0.6620680020257409, "grad_norm": 0.2890625, "learning_rate": 0.001, "loss": 1.246, "step": 9478 }, { "epoch": 0.6630459458987479, "grad_norm": 0.431640625, "learning_rate": 0.001, "loss": 1.2606, "step": 9492 }, { "epoch": 0.6640238897717549, "grad_norm": 0.8671875, "learning_rate": 0.001, "loss": 1.2782, "step": 9506 }, { "epoch": 0.6650018336447618, "grad_norm": 0.4375, "learning_rate": 0.001, "loss": 1.2687, "step": 9520 }, { "epoch": 0.6659797775177689, "grad_norm": 0.37109375, "learning_rate": 0.001, "loss": 1.2778, "step": 9534 }, { "epoch": 0.6669577213907759, "grad_norm": 0.2490234375, "learning_rate": 0.001, "loss": 1.2544, "step": 9548 }, { "epoch": 0.6679356652637829, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.246, "step": 9562 }, { "epoch": 0.6689136091367899, "grad_norm": 0.287109375, "learning_rate": 0.001, "loss": 1.2577, "step": 9576 }, { "epoch": 0.6698915530097969, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.2639, "step": 9590 }, { "epoch": 0.670869496882804, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2493, "step": 9604 }, { "epoch": 0.6718474407558109, "grad_norm": 0.2119140625, "learning_rate": 0.001, "loss": 1.2586, "step": 9618 }, { "epoch": 0.6728253846288179, "grad_norm": 0.240234375, "learning_rate": 0.001, "loss": 1.2719, "step": 9632 }, { "epoch": 0.6738033285018249, "grad_norm": 0.294921875, "learning_rate": 0.001, "loss": 1.2272, "step": 9646 }, { "epoch": 0.6747812723748319, "grad_norm": 0.423828125, "learning_rate": 0.001, "loss": 1.2667, "step": 9660 }, { "epoch": 0.675759216247839, "grad_norm": 0.30859375, "learning_rate": 0.001, "loss": 1.2567, "step": 9674 }, { "epoch": 0.676737160120846, "grad_norm": 0.21875, "learning_rate": 0.001, "loss": 1.2403, "step": 9688 }, { "epoch": 0.6777151039938529, "grad_norm": 0.2490234375, "learning_rate": 0.001, "loss": 1.2642, "step": 9702 }, { "epoch": 0.6786930478668599, "grad_norm": 0.248046875, "learning_rate": 0.001, "loss": 1.2123, "step": 9716 }, { "epoch": 0.6796709917398669, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2413, "step": 9730 }, { "epoch": 0.680648935612874, "grad_norm": 0.259765625, "learning_rate": 0.001, "loss": 1.2442, "step": 9744 }, { "epoch": 0.681626879485881, "grad_norm": 0.271484375, "learning_rate": 0.001, "loss": 1.2298, "step": 9758 }, { "epoch": 0.682604823358888, "grad_norm": 0.232421875, "learning_rate": 0.001, "loss": 1.2421, "step": 9772 }, { "epoch": 0.6835827672318949, "grad_norm": 0.302734375, "learning_rate": 0.001, "loss": 1.2725, "step": 9786 }, { "epoch": 0.6845607111049019, "grad_norm": 0.267578125, "learning_rate": 0.001, "loss": 1.2417, "step": 9800 }, { "epoch": 0.685538654977909, "grad_norm": 0.232421875, "learning_rate": 0.001, "loss": 1.2526, "step": 9814 }, { "epoch": 0.686516598850916, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.2352, "step": 9828 }, { "epoch": 0.687494542723923, "grad_norm": 0.361328125, "learning_rate": 0.001, "loss": 1.2653, "step": 9842 }, { "epoch": 0.68847248659693, "grad_norm": 0.328125, "learning_rate": 0.001, "loss": 1.2569, "step": 9856 }, { "epoch": 0.6894504304699369, "grad_norm": 0.26171875, "learning_rate": 0.001, "loss": 1.248, "step": 9870 }, { "epoch": 0.6904283743429439, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.2864, "step": 9884 }, { "epoch": 0.691406318215951, "grad_norm": 0.345703125, "learning_rate": 0.001, "loss": 1.2663, "step": 9898 }, { "epoch": 0.692384262088958, "grad_norm": 0.36328125, "learning_rate": 0.001, "loss": 1.2534, "step": 9912 }, { "epoch": 0.693362205961965, "grad_norm": 0.333984375, "learning_rate": 0.001, "loss": 1.2439, "step": 9926 }, { "epoch": 0.6943401498349719, "grad_norm": 0.298828125, "learning_rate": 0.001, "loss": 1.2592, "step": 9940 }, { "epoch": 0.6953180937079789, "grad_norm": 0.3046875, "learning_rate": 0.001, "loss": 1.2655, "step": 9954 }, { "epoch": 0.696296037580986, "grad_norm": 0.265625, "learning_rate": 0.001, "loss": 1.2569, "step": 9968 }, { "epoch": 0.697273981453993, "grad_norm": 0.376953125, "learning_rate": 0.001, "loss": 1.2663, "step": 9982 }, { "epoch": 0.698251925327, "grad_norm": 0.291015625, "learning_rate": 0.001, "loss": 1.2419, "step": 9996 }, { "epoch": 0.699229869200007, "grad_norm": 0.251953125, "learning_rate": 0.001, "loss": 1.2406, "step": 10010 }, { "epoch": 0.6997188411365105, "eval_loss": 1.6404287815093994, "eval_runtime": 9.1224, "eval_samples_per_second": 109.621, "eval_steps_per_second": 1.425, "step": 10017 } ], "logging_steps": 14, "max_steps": 14315, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1431, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.66909141699448e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }