| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.6997188411365105, |
| "eval_steps": 1431, |
| "global_step": 10017, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0009779438730070028, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.001, |
| "loss": 5.7482, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0019558877460140057, |
| "grad_norm": 0.1259765625, |
| "learning_rate": 0.001, |
| "loss": 3.6003, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.0029338316190210085, |
| "grad_norm": 0.12109375, |
| "learning_rate": 0.001, |
| "loss": 3.4079, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.003911775492028011, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.001, |
| "loss": 3.201, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.004889719365035014, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.001, |
| "loss": 2.955, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.005867663238042017, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.001, |
| "loss": 2.8744, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.006845607111049019, |
| "grad_norm": 0.15625, |
| "learning_rate": 0.001, |
| "loss": 2.8267, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.007823550984056023, |
| "grad_norm": 0.1884765625, |
| "learning_rate": 0.001, |
| "loss": 2.7249, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.008801494857063025, |
| "grad_norm": 2.625, |
| "learning_rate": 0.001, |
| "loss": 2.6441, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.009779438730070028, |
| "grad_norm": 0.173828125, |
| "learning_rate": 0.001, |
| "loss": 2.6126, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.01075738260307703, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.001, |
| "loss": 2.5241, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.011735326476084034, |
| "grad_norm": 0.158203125, |
| "learning_rate": 0.001, |
| "loss": 2.4803, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.012713270349091036, |
| "grad_norm": 0.119140625, |
| "learning_rate": 0.001, |
| "loss": 2.3886, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.013691214222098038, |
| "grad_norm": 0.1923828125, |
| "learning_rate": 0.001, |
| "loss": 2.3931, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.014669158095105042, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 2.3493, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.015647101968112045, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.001, |
| "loss": 2.3317, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.016625045841119047, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.001, |
| "loss": 2.3284, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.01760298971412605, |
| "grad_norm": 0.16796875, |
| "learning_rate": 0.001, |
| "loss": 2.2322, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.01858093358713305, |
| "grad_norm": 0.1650390625, |
| "learning_rate": 0.001, |
| "loss": 2.2145, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.019558877460140057, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.001, |
| "loss": 2.202, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.02053682133314706, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.001, |
| "loss": 2.2156, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.02151476520615406, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 2.1436, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.022492709079161063, |
| "grad_norm": 0.390625, |
| "learning_rate": 0.001, |
| "loss": 2.1198, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.023470652952168068, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.001, |
| "loss": 2.1031, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.02444859682517507, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 0.001, |
| "loss": 2.0943, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.025426540698182072, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.001, |
| "loss": 2.0352, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.026404484571189074, |
| "grad_norm": 0.158203125, |
| "learning_rate": 0.001, |
| "loss": 2.0439, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.027382428444196076, |
| "grad_norm": 0.16796875, |
| "learning_rate": 0.001, |
| "loss": 2.0176, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.02836037231720308, |
| "grad_norm": 0.474609375, |
| "learning_rate": 0.001, |
| "loss": 2.0212, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.029338316190210083, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.001, |
| "loss": 1.9963, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.030316260063217085, |
| "grad_norm": 0.1689453125, |
| "learning_rate": 0.001, |
| "loss": 1.9738, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.03129420393622409, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.9362, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.03227214780923109, |
| "grad_norm": 0.1982421875, |
| "learning_rate": 0.001, |
| "loss": 1.9393, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.033250091682238095, |
| "grad_norm": 0.400390625, |
| "learning_rate": 0.001, |
| "loss": 1.9876, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.0342280355552451, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.001, |
| "loss": 1.9551, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.0352059794282521, |
| "grad_norm": 0.1591796875, |
| "learning_rate": 0.001, |
| "loss": 1.9103, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.0361839233012591, |
| "grad_norm": 0.1767578125, |
| "learning_rate": 0.001, |
| "loss": 1.9243, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.0371618671742661, |
| "grad_norm": 0.478515625, |
| "learning_rate": 0.001, |
| "loss": 1.9207, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.03813981104727311, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.9029, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.039117754920280114, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.001, |
| "loss": 1.8988, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.040095698793287116, |
| "grad_norm": 0.189453125, |
| "learning_rate": 0.001, |
| "loss": 1.8653, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.04107364266629412, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.001, |
| "loss": 1.8383, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.04205158653930112, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.001, |
| "loss": 1.8191, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.04302953041230812, |
| "grad_norm": 0.55859375, |
| "learning_rate": 0.001, |
| "loss": 1.8837, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.044007474285315124, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.8838, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.044985418158322125, |
| "grad_norm": 0.1513671875, |
| "learning_rate": 0.001, |
| "loss": 1.8423, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.04596336203132913, |
| "grad_norm": 0.134765625, |
| "learning_rate": 0.001, |
| "loss": 1.785, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.046941305904336136, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.001, |
| "loss": 1.8072, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.04791924977734314, |
| "grad_norm": 0.1669921875, |
| "learning_rate": 0.001, |
| "loss": 1.7771, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.04889719365035014, |
| "grad_norm": 0.33203125, |
| "learning_rate": 0.001, |
| "loss": 1.8465, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.04987513752335714, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.7964, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.050853081396364144, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.81, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.051831025269371146, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.001, |
| "loss": 1.7867, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.05280896914237815, |
| "grad_norm": 0.19140625, |
| "learning_rate": 0.001, |
| "loss": 1.7581, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.05378691301538515, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.001, |
| "loss": 1.7116, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.05476485688839215, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.7431, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.05574280076139916, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.001, |
| "loss": 1.7647, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.05672074463440616, |
| "grad_norm": 0.177734375, |
| "learning_rate": 0.001, |
| "loss": 1.709, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.057698688507413165, |
| "grad_norm": 0.384765625, |
| "learning_rate": 0.001, |
| "loss": 1.7235, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.05867663238042017, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.001, |
| "loss": 1.7161, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.05965457625342717, |
| "grad_norm": 0.455078125, |
| "learning_rate": 0.001, |
| "loss": 1.812, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.06063252012643417, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.001, |
| "loss": 1.7164, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.06161046399944117, |
| "grad_norm": 0.154296875, |
| "learning_rate": 0.001, |
| "loss": 1.7677, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.06258840787244818, |
| "grad_norm": 0.1796875, |
| "learning_rate": 0.001, |
| "loss": 1.7001, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.06356635174545518, |
| "grad_norm": 0.173828125, |
| "learning_rate": 0.001, |
| "loss": 1.7015, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.06454429561846219, |
| "grad_norm": 0.203125, |
| "learning_rate": 0.001, |
| "loss": 1.6904, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.06552223949146918, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.001, |
| "loss": 1.6692, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.06650018336447619, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.6802, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.06747812723748318, |
| "grad_norm": 0.228515625, |
| "learning_rate": 0.001, |
| "loss": 1.6566, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.0684560711104902, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.001, |
| "loss": 1.7042, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.0694340149834972, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.001, |
| "loss": 1.6653, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.0704119588565042, |
| "grad_norm": 0.357421875, |
| "learning_rate": 0.001, |
| "loss": 1.6729, |
| "step": 1008 |
| }, |
| { |
| "epoch": 0.0713899027295112, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.001, |
| "loss": 1.6864, |
| "step": 1022 |
| }, |
| { |
| "epoch": 0.0723678466025182, |
| "grad_norm": 0.1513671875, |
| "learning_rate": 0.001, |
| "loss": 1.6541, |
| "step": 1036 |
| }, |
| { |
| "epoch": 0.07334579047552521, |
| "grad_norm": 1.0390625, |
| "learning_rate": 0.001, |
| "loss": 1.7122, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.0743237343485322, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.7231, |
| "step": 1064 |
| }, |
| { |
| "epoch": 0.07530167822153921, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.6781, |
| "step": 1078 |
| }, |
| { |
| "epoch": 0.07627962209454622, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.001, |
| "loss": 1.667, |
| "step": 1092 |
| }, |
| { |
| "epoch": 0.07725756596755322, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.001, |
| "loss": 1.6667, |
| "step": 1106 |
| }, |
| { |
| "epoch": 0.07823550984056023, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.001, |
| "loss": 1.622, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.07921345371356722, |
| "grad_norm": 0.1533203125, |
| "learning_rate": 0.001, |
| "loss": 1.636, |
| "step": 1134 |
| }, |
| { |
| "epoch": 0.08019139758657423, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.001, |
| "loss": 1.6423, |
| "step": 1148 |
| }, |
| { |
| "epoch": 0.08116934145958123, |
| "grad_norm": 0.1806640625, |
| "learning_rate": 0.001, |
| "loss": 1.6136, |
| "step": 1162 |
| }, |
| { |
| "epoch": 0.08214728533258824, |
| "grad_norm": 0.224609375, |
| "learning_rate": 0.001, |
| "loss": 1.5988, |
| "step": 1176 |
| }, |
| { |
| "epoch": 0.08312522920559523, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.001, |
| "loss": 1.6216, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.08410317307860224, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.001, |
| "loss": 1.6296, |
| "step": 1204 |
| }, |
| { |
| "epoch": 0.08508111695160925, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.001, |
| "loss": 1.5944, |
| "step": 1218 |
| }, |
| { |
| "epoch": 0.08605906082461624, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.001, |
| "loss": 1.6144, |
| "step": 1232 |
| }, |
| { |
| "epoch": 0.08703700469762325, |
| "grad_norm": 0.357421875, |
| "learning_rate": 0.001, |
| "loss": 1.585, |
| "step": 1246 |
| }, |
| { |
| "epoch": 0.08801494857063025, |
| "grad_norm": 0.2470703125, |
| "learning_rate": 0.001, |
| "loss": 1.6033, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.08899289244363726, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.001, |
| "loss": 1.6122, |
| "step": 1274 |
| }, |
| { |
| "epoch": 0.08997083631664425, |
| "grad_norm": 0.396484375, |
| "learning_rate": 0.001, |
| "loss": 1.5978, |
| "step": 1288 |
| }, |
| { |
| "epoch": 0.09094878018965126, |
| "grad_norm": 0.4453125, |
| "learning_rate": 0.001, |
| "loss": 1.6392, |
| "step": 1302 |
| }, |
| { |
| "epoch": 0.09192672406265825, |
| "grad_norm": 0.2470703125, |
| "learning_rate": 0.001, |
| "loss": 1.6247, |
| "step": 1316 |
| }, |
| { |
| "epoch": 0.09290466793566526, |
| "grad_norm": 0.193359375, |
| "learning_rate": 0.001, |
| "loss": 1.5888, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.09388261180867227, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.001, |
| "loss": 1.572, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.09486055568167927, |
| "grad_norm": 0.17578125, |
| "learning_rate": 0.001, |
| "loss": 1.5725, |
| "step": 1358 |
| }, |
| { |
| "epoch": 0.09583849955468628, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.001, |
| "loss": 1.6131, |
| "step": 1372 |
| }, |
| { |
| "epoch": 0.09681644342769327, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.001, |
| "loss": 1.5467, |
| "step": 1386 |
| }, |
| { |
| "epoch": 0.09779438730070028, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.001, |
| "loss": 1.5843, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.09877233117370728, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.5519, |
| "step": 1414 |
| }, |
| { |
| "epoch": 0.09975027504671428, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.5531, |
| "step": 1428 |
| }, |
| { |
| "epoch": 0.09995983444807292, |
| "eval_loss": 2.1689391136169434, |
| "eval_runtime": 9.1273, |
| "eval_samples_per_second": 109.561, |
| "eval_steps_per_second": 1.424, |
| "step": 1431 |
| }, |
| { |
| "epoch": 0.10072821891972128, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.001, |
| "loss": 1.561, |
| "step": 1442 |
| }, |
| { |
| "epoch": 0.10170616279272829, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.001, |
| "loss": 1.5818, |
| "step": 1456 |
| }, |
| { |
| "epoch": 0.1026841066657353, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.001, |
| "loss": 1.5653, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.10366205053874229, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.001, |
| "loss": 1.585, |
| "step": 1484 |
| }, |
| { |
| "epoch": 0.1046399944117493, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.5502, |
| "step": 1498 |
| }, |
| { |
| "epoch": 0.1056179382847563, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.001, |
| "loss": 1.5873, |
| "step": 1512 |
| }, |
| { |
| "epoch": 0.1065958821577633, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.5825, |
| "step": 1526 |
| }, |
| { |
| "epoch": 0.1075738260307703, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.001, |
| "loss": 1.5712, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.10855176990377731, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.001, |
| "loss": 1.5443, |
| "step": 1554 |
| }, |
| { |
| "epoch": 0.1095297137767843, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.001, |
| "loss": 1.5989, |
| "step": 1568 |
| }, |
| { |
| "epoch": 0.11050765764979131, |
| "grad_norm": 0.318359375, |
| "learning_rate": 0.001, |
| "loss": 1.6003, |
| "step": 1582 |
| }, |
| { |
| "epoch": 0.11148560152279832, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.001, |
| "loss": 1.5907, |
| "step": 1596 |
| }, |
| { |
| "epoch": 0.11246354539580532, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.001, |
| "loss": 1.5373, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.11344148926881233, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.001, |
| "loss": 1.5531, |
| "step": 1624 |
| }, |
| { |
| "epoch": 0.11441943314181932, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.001, |
| "loss": 1.5317, |
| "step": 1638 |
| }, |
| { |
| "epoch": 0.11539737701482633, |
| "grad_norm": 0.3671875, |
| "learning_rate": 0.001, |
| "loss": 1.5157, |
| "step": 1652 |
| }, |
| { |
| "epoch": 0.11637532088783333, |
| "grad_norm": 0.2001953125, |
| "learning_rate": 0.001, |
| "loss": 1.5462, |
| "step": 1666 |
| }, |
| { |
| "epoch": 0.11735326476084033, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.001, |
| "loss": 1.5598, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.11833120863384734, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.5345, |
| "step": 1694 |
| }, |
| { |
| "epoch": 0.11930915250685434, |
| "grad_norm": 0.470703125, |
| "learning_rate": 0.001, |
| "loss": 1.5324, |
| "step": 1708 |
| }, |
| { |
| "epoch": 0.12028709637986135, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.001, |
| "loss": 1.498, |
| "step": 1722 |
| }, |
| { |
| "epoch": 0.12126504025286834, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.5072, |
| "step": 1736 |
| }, |
| { |
| "epoch": 0.12224298412587535, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.5364, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.12322092799888235, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.5, |
| "step": 1764 |
| }, |
| { |
| "epoch": 0.12419887187188935, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.001, |
| "loss": 1.5113, |
| "step": 1778 |
| }, |
| { |
| "epoch": 0.12517681574489636, |
| "grad_norm": 0.400390625, |
| "learning_rate": 0.001, |
| "loss": 1.5099, |
| "step": 1792 |
| }, |
| { |
| "epoch": 0.12615475961790334, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.001, |
| "loss": 1.5076, |
| "step": 1806 |
| }, |
| { |
| "epoch": 0.12713270349091035, |
| "grad_norm": 0.205078125, |
| "learning_rate": 0.001, |
| "loss": 1.4994, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.12811064736391736, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.001, |
| "loss": 1.5135, |
| "step": 1834 |
| }, |
| { |
| "epoch": 0.12908859123692437, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.5214, |
| "step": 1848 |
| }, |
| { |
| "epoch": 0.13006653510993138, |
| "grad_norm": 0.326171875, |
| "learning_rate": 0.001, |
| "loss": 1.4525, |
| "step": 1862 |
| }, |
| { |
| "epoch": 0.13104447898293836, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.51, |
| "step": 1876 |
| }, |
| { |
| "epoch": 0.13202242285594537, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.001, |
| "loss": 1.4965, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.13300036672895238, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.001, |
| "loss": 1.4854, |
| "step": 1904 |
| }, |
| { |
| "epoch": 0.1339783106019594, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.001, |
| "loss": 1.4926, |
| "step": 1918 |
| }, |
| { |
| "epoch": 0.13495625447496637, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.4897, |
| "step": 1932 |
| }, |
| { |
| "epoch": 0.13593419834797338, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.001, |
| "loss": 1.498, |
| "step": 1946 |
| }, |
| { |
| "epoch": 0.1369121422209804, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.5029, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.1378900860939874, |
| "grad_norm": 0.419921875, |
| "learning_rate": 0.001, |
| "loss": 1.482, |
| "step": 1974 |
| }, |
| { |
| "epoch": 0.1388680299669944, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.5212, |
| "step": 1988 |
| }, |
| { |
| "epoch": 0.1398459738400014, |
| "grad_norm": 0.80859375, |
| "learning_rate": 0.001, |
| "loss": 1.549, |
| "step": 2002 |
| }, |
| { |
| "epoch": 0.1408239177130084, |
| "grad_norm": 0.36328125, |
| "learning_rate": 0.001, |
| "loss": 1.5148, |
| "step": 2016 |
| }, |
| { |
| "epoch": 0.1418018615860154, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.4993, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.1427798054590224, |
| "grad_norm": 0.220703125, |
| "learning_rate": 0.001, |
| "loss": 1.4985, |
| "step": 2044 |
| }, |
| { |
| "epoch": 0.14375774933202942, |
| "grad_norm": 0.25, |
| "learning_rate": 0.001, |
| "loss": 1.4938, |
| "step": 2058 |
| }, |
| { |
| "epoch": 0.1447356932050364, |
| "grad_norm": 0.177734375, |
| "learning_rate": 0.001, |
| "loss": 1.4777, |
| "step": 2072 |
| }, |
| { |
| "epoch": 0.1457136370780434, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.4865, |
| "step": 2086 |
| }, |
| { |
| "epoch": 0.14669158095105042, |
| "grad_norm": 0.19140625, |
| "learning_rate": 0.001, |
| "loss": 1.4567, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.14766952482405743, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.001, |
| "loss": 1.4706, |
| "step": 2114 |
| }, |
| { |
| "epoch": 0.1486474686970644, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.001, |
| "loss": 1.4959, |
| "step": 2128 |
| }, |
| { |
| "epoch": 0.14962541257007142, |
| "grad_norm": 0.224609375, |
| "learning_rate": 0.001, |
| "loss": 1.4631, |
| "step": 2142 |
| }, |
| { |
| "epoch": 0.15060335644307843, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.001, |
| "loss": 1.4669, |
| "step": 2156 |
| }, |
| { |
| "epoch": 0.15158130031608544, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.001, |
| "loss": 1.4479, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.15255924418909245, |
| "grad_norm": 0.25, |
| "learning_rate": 0.001, |
| "loss": 1.4584, |
| "step": 2184 |
| }, |
| { |
| "epoch": 0.15353718806209943, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.001, |
| "loss": 1.4609, |
| "step": 2198 |
| }, |
| { |
| "epoch": 0.15451513193510644, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.001, |
| "loss": 1.4499, |
| "step": 2212 |
| }, |
| { |
| "epoch": 0.15549307580811345, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.001, |
| "loss": 1.4562, |
| "step": 2226 |
| }, |
| { |
| "epoch": 0.15647101968112045, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.001, |
| "loss": 1.4745, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.15744896355412744, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.001, |
| "loss": 1.4773, |
| "step": 2254 |
| }, |
| { |
| "epoch": 0.15842690742713444, |
| "grad_norm": 0.4609375, |
| "learning_rate": 0.001, |
| "loss": 1.4501, |
| "step": 2268 |
| }, |
| { |
| "epoch": 0.15940485130014145, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.4378, |
| "step": 2282 |
| }, |
| { |
| "epoch": 0.16038279517314846, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.001, |
| "loss": 1.4452, |
| "step": 2296 |
| }, |
| { |
| "epoch": 0.16136073904615547, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.001, |
| "loss": 1.4536, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.16233868291916245, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.001, |
| "loss": 1.462, |
| "step": 2324 |
| }, |
| { |
| "epoch": 0.16331662679216946, |
| "grad_norm": 0.2353515625, |
| "learning_rate": 0.001, |
| "loss": 1.451, |
| "step": 2338 |
| }, |
| { |
| "epoch": 0.16429457066517647, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.001, |
| "loss": 1.4266, |
| "step": 2352 |
| }, |
| { |
| "epoch": 0.16527251453818348, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.4503, |
| "step": 2366 |
| }, |
| { |
| "epoch": 0.16625045841119046, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.4502, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.16722840228419747, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.001, |
| "loss": 1.4587, |
| "step": 2394 |
| }, |
| { |
| "epoch": 0.16820634615720448, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.001, |
| "loss": 1.4588, |
| "step": 2408 |
| }, |
| { |
| "epoch": 0.1691842900302115, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.4679, |
| "step": 2422 |
| }, |
| { |
| "epoch": 0.1701622339032185, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.4329, |
| "step": 2436 |
| }, |
| { |
| "epoch": 0.17114017777622548, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.001, |
| "loss": 1.4566, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.1721181216492325, |
| "grad_norm": 0.357421875, |
| "learning_rate": 0.001, |
| "loss": 1.4728, |
| "step": 2464 |
| }, |
| { |
| "epoch": 0.1730960655222395, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.001, |
| "loss": 1.4518, |
| "step": 2478 |
| }, |
| { |
| "epoch": 0.1740740093952465, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.4507, |
| "step": 2492 |
| }, |
| { |
| "epoch": 0.17505195326825349, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.001, |
| "loss": 1.4242, |
| "step": 2506 |
| }, |
| { |
| "epoch": 0.1760298971412605, |
| "grad_norm": 0.197265625, |
| "learning_rate": 0.001, |
| "loss": 1.4116, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.1770078410142675, |
| "grad_norm": 0.3671875, |
| "learning_rate": 0.001, |
| "loss": 1.4294, |
| "step": 2534 |
| }, |
| { |
| "epoch": 0.1779857848872745, |
| "grad_norm": 0.451171875, |
| "learning_rate": 0.001, |
| "loss": 1.4448, |
| "step": 2548 |
| }, |
| { |
| "epoch": 0.17896372876028152, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.4468, |
| "step": 2562 |
| }, |
| { |
| "epoch": 0.1799416726332885, |
| "grad_norm": 0.419921875, |
| "learning_rate": 0.001, |
| "loss": 1.4319, |
| "step": 2576 |
| }, |
| { |
| "epoch": 0.1809196165062955, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.001, |
| "loss": 1.4309, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.18189756037930252, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.001, |
| "loss": 1.4324, |
| "step": 2604 |
| }, |
| { |
| "epoch": 0.18287550425230953, |
| "grad_norm": 0.1875, |
| "learning_rate": 0.001, |
| "loss": 1.4143, |
| "step": 2618 |
| }, |
| { |
| "epoch": 0.1838534481253165, |
| "grad_norm": 0.197265625, |
| "learning_rate": 0.001, |
| "loss": 1.417, |
| "step": 2632 |
| }, |
| { |
| "epoch": 0.18483139199832352, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.001, |
| "loss": 1.4318, |
| "step": 2646 |
| }, |
| { |
| "epoch": 0.18580933587133053, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.4151, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.18678727974433754, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.001, |
| "loss": 1.4169, |
| "step": 2674 |
| }, |
| { |
| "epoch": 0.18776522361734455, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.001, |
| "loss": 1.4352, |
| "step": 2688 |
| }, |
| { |
| "epoch": 0.18874316749035153, |
| "grad_norm": 0.4296875, |
| "learning_rate": 0.001, |
| "loss": 1.4346, |
| "step": 2702 |
| }, |
| { |
| "epoch": 0.18972111136335854, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.4253, |
| "step": 2716 |
| }, |
| { |
| "epoch": 0.19069905523636554, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.001, |
| "loss": 1.4272, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.19167699910937255, |
| "grad_norm": 0.8984375, |
| "learning_rate": 0.001, |
| "loss": 1.4618, |
| "step": 2744 |
| }, |
| { |
| "epoch": 0.19265494298237953, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.001, |
| "loss": 1.4616, |
| "step": 2758 |
| }, |
| { |
| "epoch": 0.19363288685538654, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.4231, |
| "step": 2772 |
| }, |
| { |
| "epoch": 0.19461083072839355, |
| "grad_norm": 0.20703125, |
| "learning_rate": 0.001, |
| "loss": 1.4185, |
| "step": 2786 |
| }, |
| { |
| "epoch": 0.19558877460140056, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.4578, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.19656671847440757, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.001, |
| "loss": 1.4134, |
| "step": 2814 |
| }, |
| { |
| "epoch": 0.19754466234741455, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.001, |
| "loss": 1.4253, |
| "step": 2828 |
| }, |
| { |
| "epoch": 0.19852260622042156, |
| "grad_norm": 0.208984375, |
| "learning_rate": 0.001, |
| "loss": 1.414, |
| "step": 2842 |
| }, |
| { |
| "epoch": 0.19950055009342857, |
| "grad_norm": 0.337890625, |
| "learning_rate": 0.001, |
| "loss": 1.4069, |
| "step": 2856 |
| }, |
| { |
| "epoch": 0.19991966889614585, |
| "eval_loss": 1.8960140943527222, |
| "eval_runtime": 9.2129, |
| "eval_samples_per_second": 108.543, |
| "eval_steps_per_second": 1.411, |
| "step": 2862 |
| }, |
| { |
| "epoch": 0.20047849396643558, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.001, |
| "loss": 1.4267, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.20145643783944256, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.001, |
| "loss": 1.4186, |
| "step": 2884 |
| }, |
| { |
| "epoch": 0.20243438171244957, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.001, |
| "loss": 1.4105, |
| "step": 2898 |
| }, |
| { |
| "epoch": 0.20341232558545658, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.4012, |
| "step": 2912 |
| }, |
| { |
| "epoch": 0.20439026945846359, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.001, |
| "loss": 1.3917, |
| "step": 2926 |
| }, |
| { |
| "epoch": 0.2053682133314706, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3952, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.20634615720447758, |
| "grad_norm": 0.1923828125, |
| "learning_rate": 0.001, |
| "loss": 1.396, |
| "step": 2954 |
| }, |
| { |
| "epoch": 0.20732410107748458, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.001, |
| "loss": 1.4556, |
| "step": 2968 |
| }, |
| { |
| "epoch": 0.2083020449504916, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.4215, |
| "step": 2982 |
| }, |
| { |
| "epoch": 0.2092799888234986, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.001, |
| "loss": 1.4166, |
| "step": 2996 |
| }, |
| { |
| "epoch": 0.21025793269650558, |
| "grad_norm": 0.25, |
| "learning_rate": 0.001, |
| "loss": 1.3975, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.2112358765695126, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.001, |
| "loss": 1.4019, |
| "step": 3024 |
| }, |
| { |
| "epoch": 0.2122138204425196, |
| "grad_norm": 0.2578125, |
| "learning_rate": 0.001, |
| "loss": 1.4016, |
| "step": 3038 |
| }, |
| { |
| "epoch": 0.2131917643155266, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.4069, |
| "step": 3052 |
| }, |
| { |
| "epoch": 0.21416970818853362, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.001, |
| "loss": 1.4214, |
| "step": 3066 |
| }, |
| { |
| "epoch": 0.2151476520615406, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.001, |
| "loss": 1.4123, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.2161255959345476, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.3981, |
| "step": 3094 |
| }, |
| { |
| "epoch": 0.21710353980755462, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.001, |
| "loss": 1.4067, |
| "step": 3108 |
| }, |
| { |
| "epoch": 0.21808148368056163, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.001, |
| "loss": 1.3893, |
| "step": 3122 |
| }, |
| { |
| "epoch": 0.2190594275535686, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.4263, |
| "step": 3136 |
| }, |
| { |
| "epoch": 0.22003737142657562, |
| "grad_norm": 0.318359375, |
| "learning_rate": 0.001, |
| "loss": 1.3931, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.22101531529958263, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.001, |
| "loss": 1.3901, |
| "step": 3164 |
| }, |
| { |
| "epoch": 0.22199325917258964, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3805, |
| "step": 3178 |
| }, |
| { |
| "epoch": 0.22297120304559664, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.4044, |
| "step": 3192 |
| }, |
| { |
| "epoch": 0.22394914691860363, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.4046, |
| "step": 3206 |
| }, |
| { |
| "epoch": 0.22492709079161063, |
| "grad_norm": 0.369140625, |
| "learning_rate": 0.001, |
| "loss": 1.4084, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.22590503466461764, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.3929, |
| "step": 3234 |
| }, |
| { |
| "epoch": 0.22688297853762465, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.3462, |
| "step": 3248 |
| }, |
| { |
| "epoch": 0.22786092241063163, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.001, |
| "loss": 1.3721, |
| "step": 3262 |
| }, |
| { |
| "epoch": 0.22883886628363864, |
| "grad_norm": 0.3359375, |
| "learning_rate": 0.001, |
| "loss": 1.4027, |
| "step": 3276 |
| }, |
| { |
| "epoch": 0.22981681015664565, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.001, |
| "loss": 1.4081, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.23079475402965266, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.001, |
| "loss": 1.3706, |
| "step": 3304 |
| }, |
| { |
| "epoch": 0.23177269790265967, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.3652, |
| "step": 3318 |
| }, |
| { |
| "epoch": 0.23275064177566665, |
| "grad_norm": 0.4296875, |
| "learning_rate": 0.001, |
| "loss": 1.3623, |
| "step": 3332 |
| }, |
| { |
| "epoch": 0.23372858564867366, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.001, |
| "loss": 1.37, |
| "step": 3346 |
| }, |
| { |
| "epoch": 0.23470652952168067, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.3627, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.23568447339468768, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.3979, |
| "step": 3374 |
| }, |
| { |
| "epoch": 0.23666241726769469, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.3967, |
| "step": 3388 |
| }, |
| { |
| "epoch": 0.23764036114070167, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.3696, |
| "step": 3402 |
| }, |
| { |
| "epoch": 0.23861830501370868, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.3867, |
| "step": 3416 |
| }, |
| { |
| "epoch": 0.23959624888671568, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.4115, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.2405741927597227, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.3501, |
| "step": 3444 |
| }, |
| { |
| "epoch": 0.24155213663272967, |
| "grad_norm": 0.337890625, |
| "learning_rate": 0.001, |
| "loss": 1.377, |
| "step": 3458 |
| }, |
| { |
| "epoch": 0.24253008050573668, |
| "grad_norm": 0.205078125, |
| "learning_rate": 0.001, |
| "loss": 1.3585, |
| "step": 3472 |
| }, |
| { |
| "epoch": 0.2435080243787437, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.36, |
| "step": 3486 |
| }, |
| { |
| "epoch": 0.2444859682517507, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.001, |
| "loss": 1.3759, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.2454639121247577, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.001, |
| "loss": 1.3782, |
| "step": 3514 |
| }, |
| { |
| "epoch": 0.2464418559977647, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.001, |
| "loss": 1.3489, |
| "step": 3528 |
| }, |
| { |
| "epoch": 0.2474197998707717, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.001, |
| "loss": 1.4092, |
| "step": 3542 |
| }, |
| { |
| "epoch": 0.2483977437437787, |
| "grad_norm": 0.38671875, |
| "learning_rate": 0.001, |
| "loss": 1.3806, |
| "step": 3556 |
| }, |
| { |
| "epoch": 0.24937568761678572, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.001, |
| "loss": 1.3973, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.2503536314897927, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.3725, |
| "step": 3584 |
| }, |
| { |
| "epoch": 0.2513315753627997, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.001, |
| "loss": 1.3523, |
| "step": 3598 |
| }, |
| { |
| "epoch": 0.2523095192358067, |
| "grad_norm": 0.41796875, |
| "learning_rate": 0.001, |
| "loss": 1.3929, |
| "step": 3612 |
| }, |
| { |
| "epoch": 0.2532874631088137, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.001, |
| "loss": 1.3541, |
| "step": 3626 |
| }, |
| { |
| "epoch": 0.2542654069818207, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.001, |
| "loss": 1.3471, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.25524335085482774, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.001, |
| "loss": 1.3887, |
| "step": 3654 |
| }, |
| { |
| "epoch": 0.2562212947278347, |
| "grad_norm": 0.38671875, |
| "learning_rate": 0.001, |
| "loss": 1.3675, |
| "step": 3668 |
| }, |
| { |
| "epoch": 0.2571992386008417, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3591, |
| "step": 3682 |
| }, |
| { |
| "epoch": 0.25817718247384874, |
| "grad_norm": 0.462890625, |
| "learning_rate": 0.001, |
| "loss": 1.3813, |
| "step": 3696 |
| }, |
| { |
| "epoch": 0.2591551263468557, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.001, |
| "loss": 1.3555, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.26013307021986276, |
| "grad_norm": 0.95703125, |
| "learning_rate": 0.001, |
| "loss": 1.3931, |
| "step": 3724 |
| }, |
| { |
| "epoch": 0.26111101409286974, |
| "grad_norm": 0.60546875, |
| "learning_rate": 0.001, |
| "loss": 1.4361, |
| "step": 3738 |
| }, |
| { |
| "epoch": 0.2620889579658767, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3841, |
| "step": 3752 |
| }, |
| { |
| "epoch": 0.26306690183888376, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.001, |
| "loss": 1.3808, |
| "step": 3766 |
| }, |
| { |
| "epoch": 0.26404484571189074, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.3792, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.2650227895848978, |
| "grad_norm": 0.2109375, |
| "learning_rate": 0.001, |
| "loss": 1.3488, |
| "step": 3794 |
| }, |
| { |
| "epoch": 0.26600073345790476, |
| "grad_norm": 0.3359375, |
| "learning_rate": 0.001, |
| "loss": 1.3644, |
| "step": 3808 |
| }, |
| { |
| "epoch": 0.26697867733091174, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.001, |
| "loss": 1.3707, |
| "step": 3822 |
| }, |
| { |
| "epoch": 0.2679566212039188, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.001, |
| "loss": 1.3584, |
| "step": 3836 |
| }, |
| { |
| "epoch": 0.26893456507692576, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3605, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.26991250894993274, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.3703, |
| "step": 3864 |
| }, |
| { |
| "epoch": 0.2708904528229398, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.001, |
| "loss": 1.3507, |
| "step": 3878 |
| }, |
| { |
| "epoch": 0.27186839669594676, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.3601, |
| "step": 3892 |
| }, |
| { |
| "epoch": 0.2728463405689538, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3663, |
| "step": 3906 |
| }, |
| { |
| "epoch": 0.2738242844419608, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.001, |
| "loss": 1.3625, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.27480222831496776, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.3722, |
| "step": 3934 |
| }, |
| { |
| "epoch": 0.2757801721879748, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.342, |
| "step": 3948 |
| }, |
| { |
| "epoch": 0.2767581160609818, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.3682, |
| "step": 3962 |
| }, |
| { |
| "epoch": 0.2777360599339888, |
| "grad_norm": 0.2373046875, |
| "learning_rate": 0.001, |
| "loss": 1.3662, |
| "step": 3976 |
| }, |
| { |
| "epoch": 0.2787140038069958, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.001, |
| "loss": 1.3528, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.2796919476800028, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.001, |
| "loss": 1.3576, |
| "step": 4004 |
| }, |
| { |
| "epoch": 0.2806698915530098, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.3744, |
| "step": 4018 |
| }, |
| { |
| "epoch": 0.2816478354260168, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.001, |
| "loss": 1.3554, |
| "step": 4032 |
| }, |
| { |
| "epoch": 0.2826257792990238, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.3518, |
| "step": 4046 |
| }, |
| { |
| "epoch": 0.2836037231720308, |
| "grad_norm": 0.25, |
| "learning_rate": 0.001, |
| "loss": 1.3595, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.2845816670450378, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.3361, |
| "step": 4074 |
| }, |
| { |
| "epoch": 0.2855596109180448, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.001, |
| "loss": 1.3664, |
| "step": 4088 |
| }, |
| { |
| "epoch": 0.2865375547910518, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.3633, |
| "step": 4102 |
| }, |
| { |
| "epoch": 0.28751549866405884, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.001, |
| "loss": 1.3437, |
| "step": 4116 |
| }, |
| { |
| "epoch": 0.2884934425370658, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.001, |
| "loss": 1.3401, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.2894713864100728, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 0.001, |
| "loss": 1.3465, |
| "step": 4144 |
| }, |
| { |
| "epoch": 0.29044933028307984, |
| "grad_norm": 0.5, |
| "learning_rate": 0.001, |
| "loss": 1.3517, |
| "step": 4158 |
| }, |
| { |
| "epoch": 0.2914272741560868, |
| "grad_norm": 0.36328125, |
| "learning_rate": 0.001, |
| "loss": 1.3272, |
| "step": 4172 |
| }, |
| { |
| "epoch": 0.2924052180290938, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.3676, |
| "step": 4186 |
| }, |
| { |
| "epoch": 0.29338316190210084, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.001, |
| "loss": 1.3501, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.2943611057751078, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.001, |
| "loss": 1.3423, |
| "step": 4214 |
| }, |
| { |
| "epoch": 0.29533904964811486, |
| "grad_norm": 0.3671875, |
| "learning_rate": 0.001, |
| "loss": 1.3396, |
| "step": 4228 |
| }, |
| { |
| "epoch": 0.29631699352112184, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.3416, |
| "step": 4242 |
| }, |
| { |
| "epoch": 0.2972949373941288, |
| "grad_norm": 0.326171875, |
| "learning_rate": 0.001, |
| "loss": 1.3419, |
| "step": 4256 |
| }, |
| { |
| "epoch": 0.29827288126713586, |
| "grad_norm": 0.3671875, |
| "learning_rate": 0.001, |
| "loss": 1.3128, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.29925082514014284, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.001, |
| "loss": 1.3528, |
| "step": 4284 |
| }, |
| { |
| "epoch": 0.29987950334421876, |
| "eval_loss": 1.7873083353042603, |
| "eval_runtime": 9.1065, |
| "eval_samples_per_second": 109.812, |
| "eval_steps_per_second": 1.428, |
| "step": 4293 |
| }, |
| { |
| "epoch": 0.3002287690131499, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.001, |
| "loss": 1.3496, |
| "step": 4298 |
| }, |
| { |
| "epoch": 0.30120671288615686, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.001, |
| "loss": 1.3369, |
| "step": 4312 |
| }, |
| { |
| "epoch": 0.30218465675916384, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.001, |
| "loss": 1.3803, |
| "step": 4326 |
| }, |
| { |
| "epoch": 0.3031626006321709, |
| "grad_norm": 0.41796875, |
| "learning_rate": 0.001, |
| "loss": 1.324, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.30414054450517786, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.001, |
| "loss": 1.3559, |
| "step": 4354 |
| }, |
| { |
| "epoch": 0.3051184883781849, |
| "grad_norm": 0.373046875, |
| "learning_rate": 0.001, |
| "loss": 1.3166, |
| "step": 4368 |
| }, |
| { |
| "epoch": 0.3060964322511919, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.3376, |
| "step": 4382 |
| }, |
| { |
| "epoch": 0.30707437612419886, |
| "grad_norm": 0.2197265625, |
| "learning_rate": 0.001, |
| "loss": 1.3155, |
| "step": 4396 |
| }, |
| { |
| "epoch": 0.3080523199972059, |
| "grad_norm": 0.337890625, |
| "learning_rate": 0.001, |
| "loss": 1.3278, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.3090302638702129, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.34, |
| "step": 4424 |
| }, |
| { |
| "epoch": 0.31000820774321985, |
| "grad_norm": 0.2431640625, |
| "learning_rate": 0.001, |
| "loss": 1.313, |
| "step": 4438 |
| }, |
| { |
| "epoch": 0.3109861516162269, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.001, |
| "loss": 1.3421, |
| "step": 4452 |
| }, |
| { |
| "epoch": 0.3119640954892339, |
| "grad_norm": 0.423828125, |
| "learning_rate": 0.001, |
| "loss": 1.3327, |
| "step": 4466 |
| }, |
| { |
| "epoch": 0.3129420393622409, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.3345, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.3139199832352479, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.3157, |
| "step": 4494 |
| }, |
| { |
| "epoch": 0.31489792710825487, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.001, |
| "loss": 1.334, |
| "step": 4508 |
| }, |
| { |
| "epoch": 0.3158758709812619, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.001, |
| "loss": 1.3526, |
| "step": 4522 |
| }, |
| { |
| "epoch": 0.3168538148542689, |
| "grad_norm": 0.35546875, |
| "learning_rate": 0.001, |
| "loss": 1.3565, |
| "step": 4536 |
| }, |
| { |
| "epoch": 0.3178317587272759, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.001, |
| "loss": 1.3343, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.3188097026002829, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.001, |
| "loss": 1.3372, |
| "step": 4564 |
| }, |
| { |
| "epoch": 0.3197876464732899, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.3105, |
| "step": 4578 |
| }, |
| { |
| "epoch": 0.3207655903462969, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.001, |
| "loss": 1.3297, |
| "step": 4592 |
| }, |
| { |
| "epoch": 0.3217435342193039, |
| "grad_norm": 0.341796875, |
| "learning_rate": 0.001, |
| "loss": 1.3549, |
| "step": 4606 |
| }, |
| { |
| "epoch": 0.32272147809231094, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.001, |
| "loss": 1.3499, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.3236994219653179, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.001, |
| "loss": 1.3642, |
| "step": 4634 |
| }, |
| { |
| "epoch": 0.3246773658383249, |
| "grad_norm": 0.58984375, |
| "learning_rate": 0.001, |
| "loss": 1.3611, |
| "step": 4648 |
| }, |
| { |
| "epoch": 0.32565530971133194, |
| "grad_norm": 0.3359375, |
| "learning_rate": 0.001, |
| "loss": 1.3255, |
| "step": 4662 |
| }, |
| { |
| "epoch": 0.3266332535843389, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.3255, |
| "step": 4676 |
| }, |
| { |
| "epoch": 0.3276111974573459, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.001, |
| "loss": 1.3431, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.32858914133035294, |
| "grad_norm": 0.326171875, |
| "learning_rate": 0.001, |
| "loss": 1.322, |
| "step": 4704 |
| }, |
| { |
| "epoch": 0.3295670852033599, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.001, |
| "loss": 1.3271, |
| "step": 4718 |
| }, |
| { |
| "epoch": 0.33054502907636696, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.001, |
| "loss": 1.3364, |
| "step": 4732 |
| }, |
| { |
| "epoch": 0.33152297294937394, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.3311, |
| "step": 4746 |
| }, |
| { |
| "epoch": 0.3325009168223809, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.001, |
| "loss": 1.3555, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.33347886069538796, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.329, |
| "step": 4774 |
| }, |
| { |
| "epoch": 0.33445680456839494, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.001, |
| "loss": 1.3264, |
| "step": 4788 |
| }, |
| { |
| "epoch": 0.335434748441402, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.001, |
| "loss": 1.3142, |
| "step": 4802 |
| }, |
| { |
| "epoch": 0.33641269231440896, |
| "grad_norm": 0.205078125, |
| "learning_rate": 0.001, |
| "loss": 1.3202, |
| "step": 4816 |
| }, |
| { |
| "epoch": 0.33739063618741594, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.001, |
| "loss": 1.3516, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.338368580060423, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3365, |
| "step": 4844 |
| }, |
| { |
| "epoch": 0.33934652393342996, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3334, |
| "step": 4858 |
| }, |
| { |
| "epoch": 0.340324467806437, |
| "grad_norm": 0.421875, |
| "learning_rate": 0.001, |
| "loss": 1.3485, |
| "step": 4872 |
| }, |
| { |
| "epoch": 0.341302411679444, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.3343, |
| "step": 4886 |
| }, |
| { |
| "epoch": 0.34228035555245095, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.3395, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.343258299425458, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.001, |
| "loss": 1.3193, |
| "step": 4914 |
| }, |
| { |
| "epoch": 0.344236243298465, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.327, |
| "step": 4928 |
| }, |
| { |
| "epoch": 0.34521418717147195, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.001, |
| "loss": 1.2993, |
| "step": 4942 |
| }, |
| { |
| "epoch": 0.346192131044479, |
| "grad_norm": 0.3828125, |
| "learning_rate": 0.001, |
| "loss": 1.3182, |
| "step": 4956 |
| }, |
| { |
| "epoch": 0.34717007491748597, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.001, |
| "loss": 1.3409, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.348148018790493, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.001, |
| "loss": 1.3354, |
| "step": 4984 |
| }, |
| { |
| "epoch": 0.3491259626635, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.001, |
| "loss": 1.305, |
| "step": 4998 |
| }, |
| { |
| "epoch": 0.35010390653650697, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 0.001, |
| "loss": 1.314, |
| "step": 5012 |
| }, |
| { |
| "epoch": 0.351081850409514, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.3267, |
| "step": 5026 |
| }, |
| { |
| "epoch": 0.352059794282521, |
| "grad_norm": 0.42578125, |
| "learning_rate": 0.001, |
| "loss": 1.3327, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.353037738155528, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.3113, |
| "step": 5054 |
| }, |
| { |
| "epoch": 0.354015682028535, |
| "grad_norm": 0.3984375, |
| "learning_rate": 0.001, |
| "loss": 1.3293, |
| "step": 5068 |
| }, |
| { |
| "epoch": 0.354993625901542, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.001, |
| "loss": 1.3094, |
| "step": 5082 |
| }, |
| { |
| "epoch": 0.355971569774549, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.3053, |
| "step": 5096 |
| }, |
| { |
| "epoch": 0.356949513647556, |
| "grad_norm": 0.2109375, |
| "learning_rate": 0.001, |
| "loss": 1.325, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.35792745752056304, |
| "grad_norm": 0.337890625, |
| "learning_rate": 0.001, |
| "loss": 1.3392, |
| "step": 5124 |
| }, |
| { |
| "epoch": 0.35890540139357, |
| "grad_norm": 0.412109375, |
| "learning_rate": 0.001, |
| "loss": 1.3363, |
| "step": 5138 |
| }, |
| { |
| "epoch": 0.359883345266577, |
| "grad_norm": 0.59375, |
| "learning_rate": 0.001, |
| "loss": 1.3693, |
| "step": 5152 |
| }, |
| { |
| "epoch": 0.36086128913958404, |
| "grad_norm": 0.390625, |
| "learning_rate": 0.001, |
| "loss": 1.3341, |
| "step": 5166 |
| }, |
| { |
| "epoch": 0.361839233012591, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.3104, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.362817176885598, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.3277, |
| "step": 5194 |
| }, |
| { |
| "epoch": 0.36379512075860504, |
| "grad_norm": 0.2275390625, |
| "learning_rate": 0.001, |
| "loss": 1.3274, |
| "step": 5208 |
| }, |
| { |
| "epoch": 0.364773064631612, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.001, |
| "loss": 1.3295, |
| "step": 5222 |
| }, |
| { |
| "epoch": 0.36575100850461906, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.001, |
| "loss": 1.334, |
| "step": 5236 |
| }, |
| { |
| "epoch": 0.36672895237762604, |
| "grad_norm": 0.318359375, |
| "learning_rate": 0.001, |
| "loss": 1.3081, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.367706896250633, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.001, |
| "loss": 1.2691, |
| "step": 5264 |
| }, |
| { |
| "epoch": 0.36868484012364006, |
| "grad_norm": 0.47265625, |
| "learning_rate": 0.001, |
| "loss": 1.3241, |
| "step": 5278 |
| }, |
| { |
| "epoch": 0.36966278399664704, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.3015, |
| "step": 5292 |
| }, |
| { |
| "epoch": 0.3706407278696541, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.3269, |
| "step": 5306 |
| }, |
| { |
| "epoch": 0.37161867174266106, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.001, |
| "loss": 1.3249, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.37259661561566804, |
| "grad_norm": 0.384765625, |
| "learning_rate": 0.001, |
| "loss": 1.3091, |
| "step": 5334 |
| }, |
| { |
| "epoch": 0.3735745594886751, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.3251, |
| "step": 5348 |
| }, |
| { |
| "epoch": 0.37455250336168205, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2866, |
| "step": 5362 |
| }, |
| { |
| "epoch": 0.3755304472346891, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.001, |
| "loss": 1.2987, |
| "step": 5376 |
| }, |
| { |
| "epoch": 0.37650839110769607, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.3095, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.37748633498070305, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.001, |
| "loss": 1.304, |
| "step": 5404 |
| }, |
| { |
| "epoch": 0.3784642788537101, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.001, |
| "loss": 1.3158, |
| "step": 5418 |
| }, |
| { |
| "epoch": 0.37944222272671707, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.3164, |
| "step": 5432 |
| }, |
| { |
| "epoch": 0.3804201665997241, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.3078, |
| "step": 5446 |
| }, |
| { |
| "epoch": 0.3813981104727311, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.001, |
| "loss": 1.3603, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.38237605434573807, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.3174, |
| "step": 5474 |
| }, |
| { |
| "epoch": 0.3833539982187451, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.3204, |
| "step": 5488 |
| }, |
| { |
| "epoch": 0.3843319420917521, |
| "grad_norm": 0.33203125, |
| "learning_rate": 0.001, |
| "loss": 1.3005, |
| "step": 5502 |
| }, |
| { |
| "epoch": 0.38530988596475907, |
| "grad_norm": 0.3828125, |
| "learning_rate": 0.001, |
| "loss": 1.3396, |
| "step": 5516 |
| }, |
| { |
| "epoch": 0.3862878298377661, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.3231, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.3872657737107731, |
| "grad_norm": 0.375, |
| "learning_rate": 0.001, |
| "loss": 1.3353, |
| "step": 5544 |
| }, |
| { |
| "epoch": 0.3882437175837801, |
| "grad_norm": 0.33203125, |
| "learning_rate": 0.001, |
| "loss": 1.3314, |
| "step": 5558 |
| }, |
| { |
| "epoch": 0.3892216614567871, |
| "grad_norm": 0.35546875, |
| "learning_rate": 0.001, |
| "loss": 1.3228, |
| "step": 5572 |
| }, |
| { |
| "epoch": 0.3901996053297941, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.001, |
| "loss": 1.3034, |
| "step": 5586 |
| }, |
| { |
| "epoch": 0.3911775492028011, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.001, |
| "loss": 1.3307, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.3921554930758081, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.001, |
| "loss": 1.2888, |
| "step": 5614 |
| }, |
| { |
| "epoch": 0.39313343694881514, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.3107, |
| "step": 5628 |
| }, |
| { |
| "epoch": 0.3941113808218221, |
| "grad_norm": 0.3984375, |
| "learning_rate": 0.001, |
| "loss": 1.32, |
| "step": 5642 |
| }, |
| { |
| "epoch": 0.3950893246948291, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.3383, |
| "step": 5656 |
| }, |
| { |
| "epoch": 0.39606726856783614, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.2903, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.3970452124408431, |
| "grad_norm": 0.330078125, |
| "learning_rate": 0.001, |
| "loss": 1.3338, |
| "step": 5684 |
| }, |
| { |
| "epoch": 0.39802315631385016, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.3371, |
| "step": 5698 |
| }, |
| { |
| "epoch": 0.39900110018685714, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.001, |
| "loss": 1.3169, |
| "step": 5712 |
| }, |
| { |
| "epoch": 0.3998393377922917, |
| "eval_loss": 1.7229478359222412, |
| "eval_runtime": 9.1252, |
| "eval_samples_per_second": 109.587, |
| "eval_steps_per_second": 1.425, |
| "step": 5724 |
| }, |
| { |
| "epoch": 0.3999790440598641, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.3098, |
| "step": 5726 |
| }, |
| { |
| "epoch": 0.40095698793287116, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.3131, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.40193493180587814, |
| "grad_norm": 0.408203125, |
| "learning_rate": 0.001, |
| "loss": 1.3199, |
| "step": 5754 |
| }, |
| { |
| "epoch": 0.4029128756788851, |
| "grad_norm": 0.46875, |
| "learning_rate": 0.001, |
| "loss": 1.3238, |
| "step": 5768 |
| }, |
| { |
| "epoch": 0.40389081955189216, |
| "grad_norm": 0.46484375, |
| "learning_rate": 0.001, |
| "loss": 1.3192, |
| "step": 5782 |
| }, |
| { |
| "epoch": 0.40486876342489914, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.001, |
| "loss": 1.3274, |
| "step": 5796 |
| }, |
| { |
| "epoch": 0.4058467072979062, |
| "grad_norm": 0.318359375, |
| "learning_rate": 0.001, |
| "loss": 1.2836, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.40682465117091315, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.001, |
| "loss": 1.3021, |
| "step": 5824 |
| }, |
| { |
| "epoch": 0.40780259504392014, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.2903, |
| "step": 5838 |
| }, |
| { |
| "epoch": 0.40878053891692717, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.2965, |
| "step": 5852 |
| }, |
| { |
| "epoch": 0.40975848278993415, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.001, |
| "loss": 1.2816, |
| "step": 5866 |
| }, |
| { |
| "epoch": 0.4107364266629412, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.3082, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.41171437053594817, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.3229, |
| "step": 5894 |
| }, |
| { |
| "epoch": 0.41269231440895515, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.3056, |
| "step": 5908 |
| }, |
| { |
| "epoch": 0.4136702582819622, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.2618, |
| "step": 5922 |
| }, |
| { |
| "epoch": 0.41464820215496917, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.001, |
| "loss": 1.29, |
| "step": 5936 |
| }, |
| { |
| "epoch": 0.4156261460279762, |
| "grad_norm": 0.36328125, |
| "learning_rate": 0.001, |
| "loss": 1.3132, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.4166040899009832, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.3083, |
| "step": 5964 |
| }, |
| { |
| "epoch": 0.41758203377399017, |
| "grad_norm": 0.326171875, |
| "learning_rate": 0.001, |
| "loss": 1.3118, |
| "step": 5978 |
| }, |
| { |
| "epoch": 0.4185599776469972, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.3105, |
| "step": 5992 |
| }, |
| { |
| "epoch": 0.4195379215200042, |
| "grad_norm": 0.25, |
| "learning_rate": 0.001, |
| "loss": 1.3032, |
| "step": 6006 |
| }, |
| { |
| "epoch": 0.42051586539301117, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.001, |
| "loss": 1.2815, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.4214938092660182, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2913, |
| "step": 6034 |
| }, |
| { |
| "epoch": 0.4224717531390252, |
| "grad_norm": 0.2353515625, |
| "learning_rate": 0.001, |
| "loss": 1.3015, |
| "step": 6048 |
| }, |
| { |
| "epoch": 0.4234496970120322, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.3072, |
| "step": 6062 |
| }, |
| { |
| "epoch": 0.4244276408850392, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.3412, |
| "step": 6076 |
| }, |
| { |
| "epoch": 0.4254055847580462, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.3024, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.4263835286310532, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.001, |
| "loss": 1.3124, |
| "step": 6104 |
| }, |
| { |
| "epoch": 0.4273614725040602, |
| "grad_norm": 0.55078125, |
| "learning_rate": 0.001, |
| "loss": 1.3066, |
| "step": 6118 |
| }, |
| { |
| "epoch": 0.42833941637706724, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.001, |
| "loss": 1.3194, |
| "step": 6132 |
| }, |
| { |
| "epoch": 0.4293173602500742, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.001, |
| "loss": 1.2943, |
| "step": 6146 |
| }, |
| { |
| "epoch": 0.4302953041230812, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.001, |
| "loss": 1.2726, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.43127324799608824, |
| "grad_norm": 0.2158203125, |
| "learning_rate": 0.001, |
| "loss": 1.2693, |
| "step": 6174 |
| }, |
| { |
| "epoch": 0.4322511918690952, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.001, |
| "loss": 1.2966, |
| "step": 6188 |
| }, |
| { |
| "epoch": 0.43322913574210226, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.001, |
| "loss": 1.3144, |
| "step": 6202 |
| }, |
| { |
| "epoch": 0.43420707961510924, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.2774, |
| "step": 6216 |
| }, |
| { |
| "epoch": 0.4351850234881162, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.3109, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.43616296736112325, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.2897, |
| "step": 6244 |
| }, |
| { |
| "epoch": 0.43714091123413024, |
| "grad_norm": 0.216796875, |
| "learning_rate": 0.001, |
| "loss": 1.2796, |
| "step": 6258 |
| }, |
| { |
| "epoch": 0.4381188551071372, |
| "grad_norm": 0.3125, |
| "learning_rate": 0.001, |
| "loss": 1.3235, |
| "step": 6272 |
| }, |
| { |
| "epoch": 0.43909679898014425, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.2996, |
| "step": 6286 |
| }, |
| { |
| "epoch": 0.44007474285315124, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.001, |
| "loss": 1.3074, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.44105268672615827, |
| "grad_norm": 0.34375, |
| "learning_rate": 0.001, |
| "loss": 1.2851, |
| "step": 6314 |
| }, |
| { |
| "epoch": 0.44203063059916525, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.001, |
| "loss": 1.3315, |
| "step": 6328 |
| }, |
| { |
| "epoch": 0.44300857447217223, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.001, |
| "loss": 1.3049, |
| "step": 6342 |
| }, |
| { |
| "epoch": 0.44398651834517927, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.3186, |
| "step": 6356 |
| }, |
| { |
| "epoch": 0.44496446221818625, |
| "grad_norm": 0.392578125, |
| "learning_rate": 0.001, |
| "loss": 1.276, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.4459424060911933, |
| "grad_norm": 0.3828125, |
| "learning_rate": 0.001, |
| "loss": 1.2827, |
| "step": 6384 |
| }, |
| { |
| "epoch": 0.44692034996420027, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.2925, |
| "step": 6398 |
| }, |
| { |
| "epoch": 0.44789829383720725, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.001, |
| "loss": 1.2788, |
| "step": 6412 |
| }, |
| { |
| "epoch": 0.4488762377102143, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.001, |
| "loss": 1.2982, |
| "step": 6426 |
| }, |
| { |
| "epoch": 0.44985418158322127, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.2977, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.4508321254562283, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.001, |
| "loss": 1.3041, |
| "step": 6454 |
| }, |
| { |
| "epoch": 0.4518100693292353, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.2828, |
| "step": 6468 |
| }, |
| { |
| "epoch": 0.45278801320224227, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.2942, |
| "step": 6482 |
| }, |
| { |
| "epoch": 0.4537659570752493, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.001, |
| "loss": 1.2626, |
| "step": 6496 |
| }, |
| { |
| "epoch": 0.4547439009482563, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.001, |
| "loss": 1.2788, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.45572184482126327, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.001, |
| "loss": 1.2733, |
| "step": 6524 |
| }, |
| { |
| "epoch": 0.4566997886942703, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.3146, |
| "step": 6538 |
| }, |
| { |
| "epoch": 0.4576777325672773, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.001, |
| "loss": 1.3027, |
| "step": 6552 |
| }, |
| { |
| "epoch": 0.4586556764402843, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.001, |
| "loss": 1.2753, |
| "step": 6566 |
| }, |
| { |
| "epoch": 0.4596336203132913, |
| "grad_norm": 0.439453125, |
| "learning_rate": 0.001, |
| "loss": 1.2955, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.4606115641862983, |
| "grad_norm": 0.353515625, |
| "learning_rate": 0.001, |
| "loss": 1.3105, |
| "step": 6594 |
| }, |
| { |
| "epoch": 0.4615895080593053, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.2798, |
| "step": 6608 |
| }, |
| { |
| "epoch": 0.4625674519323123, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.2983, |
| "step": 6622 |
| }, |
| { |
| "epoch": 0.46354539580531934, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.001, |
| "loss": 1.3166, |
| "step": 6636 |
| }, |
| { |
| "epoch": 0.4645233396783263, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.287, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.4655012835513333, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.28, |
| "step": 6664 |
| }, |
| { |
| "epoch": 0.46647922742434034, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.2896, |
| "step": 6678 |
| }, |
| { |
| "epoch": 0.4674571712973473, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.2933, |
| "step": 6692 |
| }, |
| { |
| "epoch": 0.46843511517035435, |
| "grad_norm": 0.34375, |
| "learning_rate": 0.001, |
| "loss": 1.2602, |
| "step": 6706 |
| }, |
| { |
| "epoch": 0.46941305904336134, |
| "grad_norm": 0.25, |
| "learning_rate": 0.001, |
| "loss": 1.267, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.4703910029163683, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.2745, |
| "step": 6734 |
| }, |
| { |
| "epoch": 0.47136894678937535, |
| "grad_norm": 0.453125, |
| "learning_rate": 0.001, |
| "loss": 1.3102, |
| "step": 6748 |
| }, |
| { |
| "epoch": 0.47234689066238233, |
| "grad_norm": 0.36328125, |
| "learning_rate": 0.001, |
| "loss": 1.2844, |
| "step": 6762 |
| }, |
| { |
| "epoch": 0.47332483453538937, |
| "grad_norm": 0.404296875, |
| "learning_rate": 0.001, |
| "loss": 1.2787, |
| "step": 6776 |
| }, |
| { |
| "epoch": 0.47430277840839635, |
| "grad_norm": 0.41796875, |
| "learning_rate": 0.001, |
| "loss": 1.2921, |
| "step": 6790 |
| }, |
| { |
| "epoch": 0.47528072228140333, |
| "grad_norm": 0.447265625, |
| "learning_rate": 0.001, |
| "loss": 1.3157, |
| "step": 6804 |
| }, |
| { |
| "epoch": 0.47625866615441037, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.2932, |
| "step": 6818 |
| }, |
| { |
| "epoch": 0.47723661002741735, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.001, |
| "loss": 1.3011, |
| "step": 6832 |
| }, |
| { |
| "epoch": 0.47821455390042433, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.2883, |
| "step": 6846 |
| }, |
| { |
| "epoch": 0.47919249777343137, |
| "grad_norm": 0.2431640625, |
| "learning_rate": 0.001, |
| "loss": 1.3087, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.48017044164643835, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.2855, |
| "step": 6874 |
| }, |
| { |
| "epoch": 0.4811483855194454, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.3106, |
| "step": 6888 |
| }, |
| { |
| "epoch": 0.48212632939245237, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.001, |
| "loss": 1.2869, |
| "step": 6902 |
| }, |
| { |
| "epoch": 0.48310427326545935, |
| "grad_norm": 0.53125, |
| "learning_rate": 0.001, |
| "loss": 1.2852, |
| "step": 6916 |
| }, |
| { |
| "epoch": 0.4840822171384664, |
| "grad_norm": 0.46484375, |
| "learning_rate": 0.001, |
| "loss": 1.2995, |
| "step": 6930 |
| }, |
| { |
| "epoch": 0.48506016101147337, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.3098, |
| "step": 6944 |
| }, |
| { |
| "epoch": 0.4860381048844804, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.001, |
| "loss": 1.2687, |
| "step": 6958 |
| }, |
| { |
| "epoch": 0.4870160487574874, |
| "grad_norm": 0.400390625, |
| "learning_rate": 0.001, |
| "loss": 1.2885, |
| "step": 6972 |
| }, |
| { |
| "epoch": 0.48799399263049437, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.001, |
| "loss": 1.3135, |
| "step": 6986 |
| }, |
| { |
| "epoch": 0.4889719365035014, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.001, |
| "loss": 1.2776, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.4899498803765084, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2761, |
| "step": 7014 |
| }, |
| { |
| "epoch": 0.4909278242495154, |
| "grad_norm": 0.326171875, |
| "learning_rate": 0.001, |
| "loss": 1.2805, |
| "step": 7028 |
| }, |
| { |
| "epoch": 0.4919057681225224, |
| "grad_norm": 0.310546875, |
| "learning_rate": 0.001, |
| "loss": 1.2836, |
| "step": 7042 |
| }, |
| { |
| "epoch": 0.4928837119955294, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.3029, |
| "step": 7056 |
| }, |
| { |
| "epoch": 0.4938616558685364, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.2929, |
| "step": 7070 |
| }, |
| { |
| "epoch": 0.4948395997415434, |
| "grad_norm": 0.80859375, |
| "learning_rate": 0.001, |
| "loss": 1.2995, |
| "step": 7084 |
| }, |
| { |
| "epoch": 0.4958175436145504, |
| "grad_norm": 0.4609375, |
| "learning_rate": 0.001, |
| "loss": 1.2788, |
| "step": 7098 |
| }, |
| { |
| "epoch": 0.4967954874875574, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.001, |
| "loss": 1.276, |
| "step": 7112 |
| }, |
| { |
| "epoch": 0.4977734313605644, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.2883, |
| "step": 7126 |
| }, |
| { |
| "epoch": 0.49875137523357144, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.2803, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.4997293191065784, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.001, |
| "loss": 1.2576, |
| "step": 7154 |
| }, |
| { |
| "epoch": 0.49979917224036463, |
| "eval_loss": 1.6829583644866943, |
| "eval_runtime": 9.1256, |
| "eval_samples_per_second": 109.582, |
| "eval_steps_per_second": 1.425, |
| "step": 7155 |
| }, |
| { |
| "epoch": 0.5007072629795855, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.2987, |
| "step": 7168 |
| }, |
| { |
| "epoch": 0.5016852068525924, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.2759, |
| "step": 7182 |
| }, |
| { |
| "epoch": 0.5026631507255994, |
| "grad_norm": 0.376953125, |
| "learning_rate": 0.001, |
| "loss": 1.2814, |
| "step": 7196 |
| }, |
| { |
| "epoch": 0.5036410945986064, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.001, |
| "loss": 1.2701, |
| "step": 7210 |
| }, |
| { |
| "epoch": 0.5046190384716134, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.001, |
| "loss": 1.2857, |
| "step": 7224 |
| }, |
| { |
| "epoch": 0.5055969823446205, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.001, |
| "loss": 1.2707, |
| "step": 7238 |
| }, |
| { |
| "epoch": 0.5065749262176275, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.2851, |
| "step": 7252 |
| }, |
| { |
| "epoch": 0.5075528700906344, |
| "grad_norm": 0.337890625, |
| "learning_rate": 0.001, |
| "loss": 1.2722, |
| "step": 7266 |
| }, |
| { |
| "epoch": 0.5085308139636414, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.001, |
| "loss": 1.277, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.5095087578366484, |
| "grad_norm": 0.33203125, |
| "learning_rate": 0.001, |
| "loss": 1.3021, |
| "step": 7294 |
| }, |
| { |
| "epoch": 0.5104867017096555, |
| "grad_norm": 0.2314453125, |
| "learning_rate": 0.001, |
| "loss": 1.2856, |
| "step": 7308 |
| }, |
| { |
| "epoch": 0.5114646455826625, |
| "grad_norm": 0.25, |
| "learning_rate": 0.001, |
| "loss": 1.2704, |
| "step": 7322 |
| }, |
| { |
| "epoch": 0.5124425894556695, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.001, |
| "loss": 1.2837, |
| "step": 7336 |
| }, |
| { |
| "epoch": 0.5134205333286764, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.277, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.5143984772016834, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.001, |
| "loss": 1.2838, |
| "step": 7364 |
| }, |
| { |
| "epoch": 0.5153764210746905, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.001, |
| "loss": 1.2762, |
| "step": 7378 |
| }, |
| { |
| "epoch": 0.5163543649476975, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.2749, |
| "step": 7392 |
| }, |
| { |
| "epoch": 0.5173323088207045, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.001, |
| "loss": 1.2791, |
| "step": 7406 |
| }, |
| { |
| "epoch": 0.5183102526937114, |
| "grad_norm": 0.400390625, |
| "learning_rate": 0.001, |
| "loss": 1.2708, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.5192881965667184, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.2773, |
| "step": 7434 |
| }, |
| { |
| "epoch": 0.5202661404397255, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.2783, |
| "step": 7448 |
| }, |
| { |
| "epoch": 0.5212440843127325, |
| "grad_norm": 0.341796875, |
| "learning_rate": 0.001, |
| "loss": 1.2944, |
| "step": 7462 |
| }, |
| { |
| "epoch": 0.5222220281857395, |
| "grad_norm": 0.369140625, |
| "learning_rate": 0.001, |
| "loss": 1.2714, |
| "step": 7476 |
| }, |
| { |
| "epoch": 0.5231999720587465, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.001, |
| "loss": 1.2711, |
| "step": 7490 |
| }, |
| { |
| "epoch": 0.5241779159317534, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.001, |
| "loss": 1.2808, |
| "step": 7504 |
| }, |
| { |
| "epoch": 0.5251558598047605, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2765, |
| "step": 7518 |
| }, |
| { |
| "epoch": 0.5261338036777675, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.2702, |
| "step": 7532 |
| }, |
| { |
| "epoch": 0.5271117475507745, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.001, |
| "loss": 1.2802, |
| "step": 7546 |
| }, |
| { |
| "epoch": 0.5280896914237815, |
| "grad_norm": 0.57421875, |
| "learning_rate": 0.001, |
| "loss": 1.2733, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.5290676352967885, |
| "grad_norm": 0.494140625, |
| "learning_rate": 0.001, |
| "loss": 1.2575, |
| "step": 7574 |
| }, |
| { |
| "epoch": 0.5300455791697956, |
| "grad_norm": 0.376953125, |
| "learning_rate": 0.001, |
| "loss": 1.2863, |
| "step": 7588 |
| }, |
| { |
| "epoch": 0.5310235230428025, |
| "grad_norm": 0.369140625, |
| "learning_rate": 0.001, |
| "loss": 1.2815, |
| "step": 7602 |
| }, |
| { |
| "epoch": 0.5320014669158095, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.2745, |
| "step": 7616 |
| }, |
| { |
| "epoch": 0.5329794107888165, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.294, |
| "step": 7630 |
| }, |
| { |
| "epoch": 0.5339573546618235, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.001, |
| "loss": 1.2797, |
| "step": 7644 |
| }, |
| { |
| "epoch": 0.5349352985348306, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.2665, |
| "step": 7658 |
| }, |
| { |
| "epoch": 0.5359132424078376, |
| "grad_norm": 0.458984375, |
| "learning_rate": 0.001, |
| "loss": 1.28, |
| "step": 7672 |
| }, |
| { |
| "epoch": 0.5368911862808445, |
| "grad_norm": 0.359375, |
| "learning_rate": 0.001, |
| "loss": 1.3057, |
| "step": 7686 |
| }, |
| { |
| "epoch": 0.5378691301538515, |
| "grad_norm": 0.37109375, |
| "learning_rate": 0.001, |
| "loss": 1.258, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.5388470740268585, |
| "grad_norm": 0.33203125, |
| "learning_rate": 0.001, |
| "loss": 1.2742, |
| "step": 7714 |
| }, |
| { |
| "epoch": 0.5398250178998655, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.001, |
| "loss": 1.277, |
| "step": 7728 |
| }, |
| { |
| "epoch": 0.5408029617728726, |
| "grad_norm": 0.412109375, |
| "learning_rate": 0.001, |
| "loss": 1.2819, |
| "step": 7742 |
| }, |
| { |
| "epoch": 0.5417809056458796, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.3018, |
| "step": 7756 |
| }, |
| { |
| "epoch": 0.5427588495188865, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2619, |
| "step": 7770 |
| }, |
| { |
| "epoch": 0.5437367933918935, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.001, |
| "loss": 1.2513, |
| "step": 7784 |
| }, |
| { |
| "epoch": 0.5447147372649005, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2456, |
| "step": 7798 |
| }, |
| { |
| "epoch": 0.5456926811379076, |
| "grad_norm": 0.62109375, |
| "learning_rate": 0.001, |
| "loss": 1.2768, |
| "step": 7812 |
| }, |
| { |
| "epoch": 0.5466706250109146, |
| "grad_norm": 0.44140625, |
| "learning_rate": 0.001, |
| "loss": 1.265, |
| "step": 7826 |
| }, |
| { |
| "epoch": 0.5476485688839215, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.2683, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.5486265127569285, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.2809, |
| "step": 7854 |
| }, |
| { |
| "epoch": 0.5496044566299355, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2498, |
| "step": 7868 |
| }, |
| { |
| "epoch": 0.5505824005029426, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.2632, |
| "step": 7882 |
| }, |
| { |
| "epoch": 0.5515603443759496, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.001, |
| "loss": 1.2711, |
| "step": 7896 |
| }, |
| { |
| "epoch": 0.5525382882489566, |
| "grad_norm": 0.373046875, |
| "learning_rate": 0.001, |
| "loss": 1.2813, |
| "step": 7910 |
| }, |
| { |
| "epoch": 0.5535162321219635, |
| "grad_norm": 0.427734375, |
| "learning_rate": 0.001, |
| "loss": 1.2993, |
| "step": 7924 |
| }, |
| { |
| "epoch": 0.5544941759949705, |
| "grad_norm": 0.373046875, |
| "learning_rate": 0.001, |
| "loss": 1.3001, |
| "step": 7938 |
| }, |
| { |
| "epoch": 0.5554721198679776, |
| "grad_norm": 0.416015625, |
| "learning_rate": 0.001, |
| "loss": 1.2786, |
| "step": 7952 |
| }, |
| { |
| "epoch": 0.5564500637409846, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.2976, |
| "step": 7966 |
| }, |
| { |
| "epoch": 0.5574280076139916, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.286, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.5584059514869986, |
| "grad_norm": 0.59765625, |
| "learning_rate": 0.001, |
| "loss": 1.282, |
| "step": 7994 |
| }, |
| { |
| "epoch": 0.5593838953600055, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.001, |
| "loss": 1.2853, |
| "step": 8008 |
| }, |
| { |
| "epoch": 0.5603618392330126, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.001, |
| "loss": 1.2572, |
| "step": 8022 |
| }, |
| { |
| "epoch": 0.5613397831060196, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.001, |
| "loss": 1.2572, |
| "step": 8036 |
| }, |
| { |
| "epoch": 0.5623177269790266, |
| "grad_norm": 0.63671875, |
| "learning_rate": 0.001, |
| "loss": 1.315, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.5632956708520336, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.3007, |
| "step": 8064 |
| }, |
| { |
| "epoch": 0.5642736147250406, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2737, |
| "step": 8078 |
| }, |
| { |
| "epoch": 0.5652515585980477, |
| "grad_norm": 0.24609375, |
| "learning_rate": 0.001, |
| "loss": 1.2766, |
| "step": 8092 |
| }, |
| { |
| "epoch": 0.5662295024710546, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.001, |
| "loss": 1.3102, |
| "step": 8106 |
| }, |
| { |
| "epoch": 0.5672074463440616, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.3044, |
| "step": 8120 |
| }, |
| { |
| "epoch": 0.5681853902170686, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.2612, |
| "step": 8134 |
| }, |
| { |
| "epoch": 0.5691633340900756, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.2701, |
| "step": 8148 |
| }, |
| { |
| "epoch": 0.5701412779630827, |
| "grad_norm": 0.21875, |
| "learning_rate": 0.001, |
| "loss": 1.2649, |
| "step": 8162 |
| }, |
| { |
| "epoch": 0.5711192218360897, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.001, |
| "loss": 1.2761, |
| "step": 8176 |
| }, |
| { |
| "epoch": 0.5720971657090966, |
| "grad_norm": 0.29296875, |
| "learning_rate": 0.001, |
| "loss": 1.2668, |
| "step": 8190 |
| }, |
| { |
| "epoch": 0.5730751095821036, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.001, |
| "loss": 1.2847, |
| "step": 8204 |
| }, |
| { |
| "epoch": 0.5740530534551106, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.2722, |
| "step": 8218 |
| }, |
| { |
| "epoch": 0.5750309973281177, |
| "grad_norm": 0.224609375, |
| "learning_rate": 0.001, |
| "loss": 1.253, |
| "step": 8232 |
| }, |
| { |
| "epoch": 0.5760089412011247, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.001, |
| "loss": 1.2454, |
| "step": 8246 |
| }, |
| { |
| "epoch": 0.5769868850741317, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.2558, |
| "step": 8260 |
| }, |
| { |
| "epoch": 0.5779648289471386, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.2765, |
| "step": 8274 |
| }, |
| { |
| "epoch": 0.5789427728201456, |
| "grad_norm": 0.314453125, |
| "learning_rate": 0.001, |
| "loss": 1.289, |
| "step": 8288 |
| }, |
| { |
| "epoch": 0.5799207166931526, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.2724, |
| "step": 8302 |
| }, |
| { |
| "epoch": 0.5808986605661597, |
| "grad_norm": 0.44140625, |
| "learning_rate": 0.001, |
| "loss": 1.2753, |
| "step": 8316 |
| }, |
| { |
| "epoch": 0.5818766044391667, |
| "grad_norm": 0.326171875, |
| "learning_rate": 0.001, |
| "loss": 1.2558, |
| "step": 8330 |
| }, |
| { |
| "epoch": 0.5828545483121736, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2697, |
| "step": 8344 |
| }, |
| { |
| "epoch": 0.5838324921851806, |
| "grad_norm": 0.34375, |
| "learning_rate": 0.001, |
| "loss": 1.2685, |
| "step": 8358 |
| }, |
| { |
| "epoch": 0.5848104360581876, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.001, |
| "loss": 1.2724, |
| "step": 8372 |
| }, |
| { |
| "epoch": 0.5857883799311947, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 0.001, |
| "loss": 1.2287, |
| "step": 8386 |
| }, |
| { |
| "epoch": 0.5867663238042017, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.001, |
| "loss": 1.2363, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.5877442676772087, |
| "grad_norm": 0.279296875, |
| "learning_rate": 0.001, |
| "loss": 1.2648, |
| "step": 8414 |
| }, |
| { |
| "epoch": 0.5887222115502156, |
| "grad_norm": 0.427734375, |
| "learning_rate": 0.001, |
| "loss": 1.2949, |
| "step": 8428 |
| }, |
| { |
| "epoch": 0.5897001554232226, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.001, |
| "loss": 1.2571, |
| "step": 8442 |
| }, |
| { |
| "epoch": 0.5906780992962297, |
| "grad_norm": 0.349609375, |
| "learning_rate": 0.001, |
| "loss": 1.2831, |
| "step": 8456 |
| }, |
| { |
| "epoch": 0.5916560431692367, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.001, |
| "loss": 1.2965, |
| "step": 8470 |
| }, |
| { |
| "epoch": 0.5926339870422437, |
| "grad_norm": 0.412109375, |
| "learning_rate": 0.001, |
| "loss": 1.2685, |
| "step": 8484 |
| }, |
| { |
| "epoch": 0.5936119309152507, |
| "grad_norm": 0.439453125, |
| "learning_rate": 0.001, |
| "loss": 1.2637, |
| "step": 8498 |
| }, |
| { |
| "epoch": 0.5945898747882576, |
| "grad_norm": 0.3671875, |
| "learning_rate": 0.001, |
| "loss": 1.28, |
| "step": 8512 |
| }, |
| { |
| "epoch": 0.5955678186612647, |
| "grad_norm": 0.43359375, |
| "learning_rate": 0.001, |
| "loss": 1.2636, |
| "step": 8526 |
| }, |
| { |
| "epoch": 0.5965457625342717, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.251, |
| "step": 8540 |
| }, |
| { |
| "epoch": 0.5975237064072787, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.262, |
| "step": 8554 |
| }, |
| { |
| "epoch": 0.5985016502802857, |
| "grad_norm": 0.365234375, |
| "learning_rate": 0.001, |
| "loss": 1.2696, |
| "step": 8568 |
| }, |
| { |
| "epoch": 0.5994795941532927, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.2872, |
| "step": 8582 |
| }, |
| { |
| "epoch": 0.5997590066884375, |
| "eval_loss": 1.661841869354248, |
| "eval_runtime": 9.1193, |
| "eval_samples_per_second": 109.657, |
| "eval_steps_per_second": 1.426, |
| "step": 8586 |
| }, |
| { |
| "epoch": 0.6004575380262998, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.001, |
| "loss": 1.2767, |
| "step": 8596 |
| }, |
| { |
| "epoch": 0.6014354818993067, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.001, |
| "loss": 1.2623, |
| "step": 8610 |
| }, |
| { |
| "epoch": 0.6024134257723137, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.001, |
| "loss": 1.2617, |
| "step": 8624 |
| }, |
| { |
| "epoch": 0.6033913696453207, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.001, |
| "loss": 1.2514, |
| "step": 8638 |
| }, |
| { |
| "epoch": 0.6043693135183277, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.2664, |
| "step": 8652 |
| }, |
| { |
| "epoch": 0.6053472573913348, |
| "grad_norm": 0.357421875, |
| "learning_rate": 0.001, |
| "loss": 1.2421, |
| "step": 8666 |
| }, |
| { |
| "epoch": 0.6063252012643418, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2386, |
| "step": 8680 |
| }, |
| { |
| "epoch": 0.6073031451373487, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.2601, |
| "step": 8694 |
| }, |
| { |
| "epoch": 0.6082810890103557, |
| "grad_norm": 0.94921875, |
| "learning_rate": 0.001, |
| "loss": 1.2715, |
| "step": 8708 |
| }, |
| { |
| "epoch": 0.6092590328833627, |
| "grad_norm": 0.43359375, |
| "learning_rate": 0.001, |
| "loss": 1.2848, |
| "step": 8722 |
| }, |
| { |
| "epoch": 0.6102369767563698, |
| "grad_norm": 0.34375, |
| "learning_rate": 0.001, |
| "loss": 1.2632, |
| "step": 8736 |
| }, |
| { |
| "epoch": 0.6112149206293768, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.001, |
| "loss": 1.2912, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.6121928645023837, |
| "grad_norm": 0.388671875, |
| "learning_rate": 0.001, |
| "loss": 1.2613, |
| "step": 8764 |
| }, |
| { |
| "epoch": 0.6131708083753907, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.001, |
| "loss": 1.2357, |
| "step": 8778 |
| }, |
| { |
| "epoch": 0.6141487522483977, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.2541, |
| "step": 8792 |
| }, |
| { |
| "epoch": 0.6151266961214047, |
| "grad_norm": 0.32421875, |
| "learning_rate": 0.001, |
| "loss": 1.2746, |
| "step": 8806 |
| }, |
| { |
| "epoch": 0.6161046399944118, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2445, |
| "step": 8820 |
| }, |
| { |
| "epoch": 0.6170825838674188, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.001, |
| "loss": 1.2854, |
| "step": 8834 |
| }, |
| { |
| "epoch": 0.6180605277404257, |
| "grad_norm": 0.54296875, |
| "learning_rate": 0.001, |
| "loss": 1.2746, |
| "step": 8848 |
| }, |
| { |
| "epoch": 0.6190384716134327, |
| "grad_norm": 0.35546875, |
| "learning_rate": 0.001, |
| "loss": 1.2837, |
| "step": 8862 |
| }, |
| { |
| "epoch": 0.6200164154864397, |
| "grad_norm": 0.341796875, |
| "learning_rate": 0.001, |
| "loss": 1.2682, |
| "step": 8876 |
| }, |
| { |
| "epoch": 0.6209943593594468, |
| "grad_norm": 0.51953125, |
| "learning_rate": 0.001, |
| "loss": 1.2751, |
| "step": 8890 |
| }, |
| { |
| "epoch": 0.6219723032324538, |
| "grad_norm": 0.41015625, |
| "learning_rate": 0.001, |
| "loss": 1.2666, |
| "step": 8904 |
| }, |
| { |
| "epoch": 0.6229502471054608, |
| "grad_norm": 0.4140625, |
| "learning_rate": 0.001, |
| "loss": 1.2618, |
| "step": 8918 |
| }, |
| { |
| "epoch": 0.6239281909784677, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.2721, |
| "step": 8932 |
| }, |
| { |
| "epoch": 0.6249061348514747, |
| "grad_norm": 0.3515625, |
| "learning_rate": 0.001, |
| "loss": 1.2528, |
| "step": 8946 |
| }, |
| { |
| "epoch": 0.6258840787244818, |
| "grad_norm": 0.34375, |
| "learning_rate": 0.001, |
| "loss": 1.2771, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.6268620225974888, |
| "grad_norm": 0.275390625, |
| "learning_rate": 0.001, |
| "loss": 1.2751, |
| "step": 8974 |
| }, |
| { |
| "epoch": 0.6278399664704958, |
| "grad_norm": 0.28515625, |
| "learning_rate": 0.001, |
| "loss": 1.2749, |
| "step": 8988 |
| }, |
| { |
| "epoch": 0.6288179103435028, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.001, |
| "loss": 1.2851, |
| "step": 9002 |
| }, |
| { |
| "epoch": 0.6297958542165097, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.001, |
| "loss": 1.2529, |
| "step": 9016 |
| }, |
| { |
| "epoch": 0.6307737980895168, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.001, |
| "loss": 1.2673, |
| "step": 9030 |
| }, |
| { |
| "epoch": 0.6317517419625238, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.2746, |
| "step": 9044 |
| }, |
| { |
| "epoch": 0.6327296858355308, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.2649, |
| "step": 9058 |
| }, |
| { |
| "epoch": 0.6337076297085378, |
| "grad_norm": 0.318359375, |
| "learning_rate": 0.001, |
| "loss": 1.2849, |
| "step": 9072 |
| }, |
| { |
| "epoch": 0.6346855735815448, |
| "grad_norm": 0.30078125, |
| "learning_rate": 0.001, |
| "loss": 1.243, |
| "step": 9086 |
| }, |
| { |
| "epoch": 0.6356635174545519, |
| "grad_norm": 0.375, |
| "learning_rate": 0.001, |
| "loss": 1.2641, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.6366414613275588, |
| "grad_norm": 0.361328125, |
| "learning_rate": 0.001, |
| "loss": 1.2554, |
| "step": 9114 |
| }, |
| { |
| "epoch": 0.6376194052005658, |
| "grad_norm": 0.396484375, |
| "learning_rate": 0.001, |
| "loss": 1.2396, |
| "step": 9128 |
| }, |
| { |
| "epoch": 0.6385973490735728, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2508, |
| "step": 9142 |
| }, |
| { |
| "epoch": 0.6395752929465798, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.001, |
| "loss": 1.2772, |
| "step": 9156 |
| }, |
| { |
| "epoch": 0.6405532368195869, |
| "grad_norm": 0.53515625, |
| "learning_rate": 0.001, |
| "loss": 1.2453, |
| "step": 9170 |
| }, |
| { |
| "epoch": 0.6415311806925938, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.001, |
| "loss": 1.2764, |
| "step": 9184 |
| }, |
| { |
| "epoch": 0.6425091245656008, |
| "grad_norm": 0.2333984375, |
| "learning_rate": 0.001, |
| "loss": 1.251, |
| "step": 9198 |
| }, |
| { |
| "epoch": 0.6434870684386078, |
| "grad_norm": 0.35546875, |
| "learning_rate": 0.001, |
| "loss": 1.2855, |
| "step": 9212 |
| }, |
| { |
| "epoch": 0.6444650123116148, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.001, |
| "loss": 1.3198, |
| "step": 9226 |
| }, |
| { |
| "epoch": 0.6454429561846219, |
| "grad_norm": 0.427734375, |
| "learning_rate": 0.001, |
| "loss": 1.2773, |
| "step": 9240 |
| }, |
| { |
| "epoch": 0.6464209000576289, |
| "grad_norm": 0.5703125, |
| "learning_rate": 0.001, |
| "loss": 1.2786, |
| "step": 9254 |
| }, |
| { |
| "epoch": 0.6473988439306358, |
| "grad_norm": 0.3125, |
| "learning_rate": 0.001, |
| "loss": 1.2389, |
| "step": 9268 |
| }, |
| { |
| "epoch": 0.6483767878036428, |
| "grad_norm": 0.36328125, |
| "learning_rate": 0.001, |
| "loss": 1.2587, |
| "step": 9282 |
| }, |
| { |
| "epoch": 0.6493547316766498, |
| "grad_norm": 0.380859375, |
| "learning_rate": 0.001, |
| "loss": 1.2806, |
| "step": 9296 |
| }, |
| { |
| "epoch": 0.6503326755496568, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.2307, |
| "step": 9310 |
| }, |
| { |
| "epoch": 0.6513106194226639, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2657, |
| "step": 9324 |
| }, |
| { |
| "epoch": 0.6522885632956709, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.001, |
| "loss": 1.2605, |
| "step": 9338 |
| }, |
| { |
| "epoch": 0.6532665071686778, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.001, |
| "loss": 1.2538, |
| "step": 9352 |
| }, |
| { |
| "epoch": 0.6542444510416848, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.2633, |
| "step": 9366 |
| }, |
| { |
| "epoch": 0.6552223949146918, |
| "grad_norm": 0.31640625, |
| "learning_rate": 0.001, |
| "loss": 1.2582, |
| "step": 9380 |
| }, |
| { |
| "epoch": 0.6562003387876989, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.2515, |
| "step": 9394 |
| }, |
| { |
| "epoch": 0.6571782826607059, |
| "grad_norm": 0.33984375, |
| "learning_rate": 0.001, |
| "loss": 1.2679, |
| "step": 9408 |
| }, |
| { |
| "epoch": 0.6581562265337129, |
| "grad_norm": 0.3828125, |
| "learning_rate": 0.001, |
| "loss": 1.2539, |
| "step": 9422 |
| }, |
| { |
| "epoch": 0.6591341704067198, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.001, |
| "loss": 1.2632, |
| "step": 9436 |
| }, |
| { |
| "epoch": 0.6601121142797268, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.001, |
| "loss": 1.2946, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.6610900581527339, |
| "grad_norm": 0.38671875, |
| "learning_rate": 0.001, |
| "loss": 1.2691, |
| "step": 9464 |
| }, |
| { |
| "epoch": 0.6620680020257409, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.001, |
| "loss": 1.246, |
| "step": 9478 |
| }, |
| { |
| "epoch": 0.6630459458987479, |
| "grad_norm": 0.431640625, |
| "learning_rate": 0.001, |
| "loss": 1.2606, |
| "step": 9492 |
| }, |
| { |
| "epoch": 0.6640238897717549, |
| "grad_norm": 0.8671875, |
| "learning_rate": 0.001, |
| "loss": 1.2782, |
| "step": 9506 |
| }, |
| { |
| "epoch": 0.6650018336447618, |
| "grad_norm": 0.4375, |
| "learning_rate": 0.001, |
| "loss": 1.2687, |
| "step": 9520 |
| }, |
| { |
| "epoch": 0.6659797775177689, |
| "grad_norm": 0.37109375, |
| "learning_rate": 0.001, |
| "loss": 1.2778, |
| "step": 9534 |
| }, |
| { |
| "epoch": 0.6669577213907759, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.001, |
| "loss": 1.2544, |
| "step": 9548 |
| }, |
| { |
| "epoch": 0.6679356652637829, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.246, |
| "step": 9562 |
| }, |
| { |
| "epoch": 0.6689136091367899, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.001, |
| "loss": 1.2577, |
| "step": 9576 |
| }, |
| { |
| "epoch": 0.6698915530097969, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.2639, |
| "step": 9590 |
| }, |
| { |
| "epoch": 0.670869496882804, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2493, |
| "step": 9604 |
| }, |
| { |
| "epoch": 0.6718474407558109, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.001, |
| "loss": 1.2586, |
| "step": 9618 |
| }, |
| { |
| "epoch": 0.6728253846288179, |
| "grad_norm": 0.240234375, |
| "learning_rate": 0.001, |
| "loss": 1.2719, |
| "step": 9632 |
| }, |
| { |
| "epoch": 0.6738033285018249, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.001, |
| "loss": 1.2272, |
| "step": 9646 |
| }, |
| { |
| "epoch": 0.6747812723748319, |
| "grad_norm": 0.423828125, |
| "learning_rate": 0.001, |
| "loss": 1.2667, |
| "step": 9660 |
| }, |
| { |
| "epoch": 0.675759216247839, |
| "grad_norm": 0.30859375, |
| "learning_rate": 0.001, |
| "loss": 1.2567, |
| "step": 9674 |
| }, |
| { |
| "epoch": 0.676737160120846, |
| "grad_norm": 0.21875, |
| "learning_rate": 0.001, |
| "loss": 1.2403, |
| "step": 9688 |
| }, |
| { |
| "epoch": 0.6777151039938529, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.001, |
| "loss": 1.2642, |
| "step": 9702 |
| }, |
| { |
| "epoch": 0.6786930478668599, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.001, |
| "loss": 1.2123, |
| "step": 9716 |
| }, |
| { |
| "epoch": 0.6796709917398669, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2413, |
| "step": 9730 |
| }, |
| { |
| "epoch": 0.680648935612874, |
| "grad_norm": 0.259765625, |
| "learning_rate": 0.001, |
| "loss": 1.2442, |
| "step": 9744 |
| }, |
| { |
| "epoch": 0.681626879485881, |
| "grad_norm": 0.271484375, |
| "learning_rate": 0.001, |
| "loss": 1.2298, |
| "step": 9758 |
| }, |
| { |
| "epoch": 0.682604823358888, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.001, |
| "loss": 1.2421, |
| "step": 9772 |
| }, |
| { |
| "epoch": 0.6835827672318949, |
| "grad_norm": 0.302734375, |
| "learning_rate": 0.001, |
| "loss": 1.2725, |
| "step": 9786 |
| }, |
| { |
| "epoch": 0.6845607111049019, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.001, |
| "loss": 1.2417, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.685538654977909, |
| "grad_norm": 0.232421875, |
| "learning_rate": 0.001, |
| "loss": 1.2526, |
| "step": 9814 |
| }, |
| { |
| "epoch": 0.686516598850916, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.2352, |
| "step": 9828 |
| }, |
| { |
| "epoch": 0.687494542723923, |
| "grad_norm": 0.361328125, |
| "learning_rate": 0.001, |
| "loss": 1.2653, |
| "step": 9842 |
| }, |
| { |
| "epoch": 0.68847248659693, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.001, |
| "loss": 1.2569, |
| "step": 9856 |
| }, |
| { |
| "epoch": 0.6894504304699369, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.001, |
| "loss": 1.248, |
| "step": 9870 |
| }, |
| { |
| "epoch": 0.6904283743429439, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.2864, |
| "step": 9884 |
| }, |
| { |
| "epoch": 0.691406318215951, |
| "grad_norm": 0.345703125, |
| "learning_rate": 0.001, |
| "loss": 1.2663, |
| "step": 9898 |
| }, |
| { |
| "epoch": 0.692384262088958, |
| "grad_norm": 0.36328125, |
| "learning_rate": 0.001, |
| "loss": 1.2534, |
| "step": 9912 |
| }, |
| { |
| "epoch": 0.693362205961965, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.001, |
| "loss": 1.2439, |
| "step": 9926 |
| }, |
| { |
| "epoch": 0.6943401498349719, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.001, |
| "loss": 1.2592, |
| "step": 9940 |
| }, |
| { |
| "epoch": 0.6953180937079789, |
| "grad_norm": 0.3046875, |
| "learning_rate": 0.001, |
| "loss": 1.2655, |
| "step": 9954 |
| }, |
| { |
| "epoch": 0.696296037580986, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.001, |
| "loss": 1.2569, |
| "step": 9968 |
| }, |
| { |
| "epoch": 0.697273981453993, |
| "grad_norm": 0.376953125, |
| "learning_rate": 0.001, |
| "loss": 1.2663, |
| "step": 9982 |
| }, |
| { |
| "epoch": 0.698251925327, |
| "grad_norm": 0.291015625, |
| "learning_rate": 0.001, |
| "loss": 1.2419, |
| "step": 9996 |
| }, |
| { |
| "epoch": 0.699229869200007, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.001, |
| "loss": 1.2406, |
| "step": 10010 |
| }, |
| { |
| "epoch": 0.6997188411365105, |
| "eval_loss": 1.6404287815093994, |
| "eval_runtime": 9.1224, |
| "eval_samples_per_second": 109.621, |
| "eval_steps_per_second": 1.425, |
| "step": 10017 |
| } |
| ], |
| "logging_steps": 14, |
| "max_steps": 14315, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1431, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.66909141699448e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|