{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6997188411365105,
"eval_steps": 1431,
"global_step": 10017,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009779438730070028,
"grad_norm": 0.52734375,
"learning_rate": 0.001,
"loss": 5.7482,
"step": 14
},
{
"epoch": 0.0019558877460140057,
"grad_norm": 0.1259765625,
"learning_rate": 0.001,
"loss": 3.6003,
"step": 28
},
{
"epoch": 0.0029338316190210085,
"grad_norm": 0.12109375,
"learning_rate": 0.001,
"loss": 3.4079,
"step": 42
},
{
"epoch": 0.003911775492028011,
"grad_norm": 0.22265625,
"learning_rate": 0.001,
"loss": 3.201,
"step": 56
},
{
"epoch": 0.004889719365035014,
"grad_norm": 0.21484375,
"learning_rate": 0.001,
"loss": 2.955,
"step": 70
},
{
"epoch": 0.005867663238042017,
"grad_norm": 0.2119140625,
"learning_rate": 0.001,
"loss": 2.8744,
"step": 84
},
{
"epoch": 0.006845607111049019,
"grad_norm": 0.15625,
"learning_rate": 0.001,
"loss": 2.8267,
"step": 98
},
{
"epoch": 0.007823550984056023,
"grad_norm": 0.1884765625,
"learning_rate": 0.001,
"loss": 2.7249,
"step": 112
},
{
"epoch": 0.008801494857063025,
"grad_norm": 2.625,
"learning_rate": 0.001,
"loss": 2.6441,
"step": 126
},
{
"epoch": 0.009779438730070028,
"grad_norm": 0.173828125,
"learning_rate": 0.001,
"loss": 2.6126,
"step": 140
},
{
"epoch": 0.01075738260307703,
"grad_norm": 0.10693359375,
"learning_rate": 0.001,
"loss": 2.5241,
"step": 154
},
{
"epoch": 0.011735326476084034,
"grad_norm": 0.158203125,
"learning_rate": 0.001,
"loss": 2.4803,
"step": 168
},
{
"epoch": 0.012713270349091036,
"grad_norm": 0.119140625,
"learning_rate": 0.001,
"loss": 2.3886,
"step": 182
},
{
"epoch": 0.013691214222098038,
"grad_norm": 0.1923828125,
"learning_rate": 0.001,
"loss": 2.3931,
"step": 196
},
{
"epoch": 0.014669158095105042,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 2.3493,
"step": 210
},
{
"epoch": 0.015647101968112045,
"grad_norm": 0.234375,
"learning_rate": 0.001,
"loss": 2.3317,
"step": 224
},
{
"epoch": 0.016625045841119047,
"grad_norm": 0.2138671875,
"learning_rate": 0.001,
"loss": 2.3284,
"step": 238
},
{
"epoch": 0.01760298971412605,
"grad_norm": 0.16796875,
"learning_rate": 0.001,
"loss": 2.2322,
"step": 252
},
{
"epoch": 0.01858093358713305,
"grad_norm": 0.1650390625,
"learning_rate": 0.001,
"loss": 2.2145,
"step": 266
},
{
"epoch": 0.019558877460140057,
"grad_norm": 0.2890625,
"learning_rate": 0.001,
"loss": 2.202,
"step": 280
},
{
"epoch": 0.02053682133314706,
"grad_norm": 0.201171875,
"learning_rate": 0.001,
"loss": 2.2156,
"step": 294
},
{
"epoch": 0.02151476520615406,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 2.1436,
"step": 308
},
{
"epoch": 0.022492709079161063,
"grad_norm": 0.390625,
"learning_rate": 0.001,
"loss": 2.1198,
"step": 322
},
{
"epoch": 0.023470652952168068,
"grad_norm": 0.234375,
"learning_rate": 0.001,
"loss": 2.1031,
"step": 336
},
{
"epoch": 0.02444859682517507,
"grad_norm": 0.1298828125,
"learning_rate": 0.001,
"loss": 2.0943,
"step": 350
},
{
"epoch": 0.025426540698182072,
"grad_norm": 0.10693359375,
"learning_rate": 0.001,
"loss": 2.0352,
"step": 364
},
{
"epoch": 0.026404484571189074,
"grad_norm": 0.158203125,
"learning_rate": 0.001,
"loss": 2.0439,
"step": 378
},
{
"epoch": 0.027382428444196076,
"grad_norm": 0.16796875,
"learning_rate": 0.001,
"loss": 2.0176,
"step": 392
},
{
"epoch": 0.02836037231720308,
"grad_norm": 0.474609375,
"learning_rate": 0.001,
"loss": 2.0212,
"step": 406
},
{
"epoch": 0.029338316190210083,
"grad_norm": 0.240234375,
"learning_rate": 0.001,
"loss": 1.9963,
"step": 420
},
{
"epoch": 0.030316260063217085,
"grad_norm": 0.1689453125,
"learning_rate": 0.001,
"loss": 1.9738,
"step": 434
},
{
"epoch": 0.03129420393622409,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.9362,
"step": 448
},
{
"epoch": 0.03227214780923109,
"grad_norm": 0.1982421875,
"learning_rate": 0.001,
"loss": 1.9393,
"step": 462
},
{
"epoch": 0.033250091682238095,
"grad_norm": 0.400390625,
"learning_rate": 0.001,
"loss": 1.9876,
"step": 476
},
{
"epoch": 0.0342280355552451,
"grad_norm": 0.23046875,
"learning_rate": 0.001,
"loss": 1.9551,
"step": 490
},
{
"epoch": 0.0352059794282521,
"grad_norm": 0.1591796875,
"learning_rate": 0.001,
"loss": 1.9103,
"step": 504
},
{
"epoch": 0.0361839233012591,
"grad_norm": 0.1767578125,
"learning_rate": 0.001,
"loss": 1.9243,
"step": 518
},
{
"epoch": 0.0371618671742661,
"grad_norm": 0.478515625,
"learning_rate": 0.001,
"loss": 1.9207,
"step": 532
},
{
"epoch": 0.03813981104727311,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.9029,
"step": 546
},
{
"epoch": 0.039117754920280114,
"grad_norm": 0.2294921875,
"learning_rate": 0.001,
"loss": 1.8988,
"step": 560
},
{
"epoch": 0.040095698793287116,
"grad_norm": 0.189453125,
"learning_rate": 0.001,
"loss": 1.8653,
"step": 574
},
{
"epoch": 0.04107364266629412,
"grad_norm": 0.2158203125,
"learning_rate": 0.001,
"loss": 1.8383,
"step": 588
},
{
"epoch": 0.04205158653930112,
"grad_norm": 0.236328125,
"learning_rate": 0.001,
"loss": 1.8191,
"step": 602
},
{
"epoch": 0.04302953041230812,
"grad_norm": 0.55859375,
"learning_rate": 0.001,
"loss": 1.8837,
"step": 616
},
{
"epoch": 0.044007474285315124,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.8838,
"step": 630
},
{
"epoch": 0.044985418158322125,
"grad_norm": 0.1513671875,
"learning_rate": 0.001,
"loss": 1.8423,
"step": 644
},
{
"epoch": 0.04596336203132913,
"grad_norm": 0.134765625,
"learning_rate": 0.001,
"loss": 1.785,
"step": 658
},
{
"epoch": 0.046941305904336136,
"grad_norm": 0.2265625,
"learning_rate": 0.001,
"loss": 1.8072,
"step": 672
},
{
"epoch": 0.04791924977734314,
"grad_norm": 0.1669921875,
"learning_rate": 0.001,
"loss": 1.7771,
"step": 686
},
{
"epoch": 0.04889719365035014,
"grad_norm": 0.33203125,
"learning_rate": 0.001,
"loss": 1.8465,
"step": 700
},
{
"epoch": 0.04987513752335714,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.7964,
"step": 714
},
{
"epoch": 0.050853081396364144,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.81,
"step": 728
},
{
"epoch": 0.051831025269371146,
"grad_norm": 0.33984375,
"learning_rate": 0.001,
"loss": 1.7867,
"step": 742
},
{
"epoch": 0.05280896914237815,
"grad_norm": 0.19140625,
"learning_rate": 0.001,
"loss": 1.7581,
"step": 756
},
{
"epoch": 0.05378691301538515,
"grad_norm": 0.2158203125,
"learning_rate": 0.001,
"loss": 1.7116,
"step": 770
},
{
"epoch": 0.05476485688839215,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.7431,
"step": 784
},
{
"epoch": 0.05574280076139916,
"grad_norm": 0.2255859375,
"learning_rate": 0.001,
"loss": 1.7647,
"step": 798
},
{
"epoch": 0.05672074463440616,
"grad_norm": 0.177734375,
"learning_rate": 0.001,
"loss": 1.709,
"step": 812
},
{
"epoch": 0.057698688507413165,
"grad_norm": 0.384765625,
"learning_rate": 0.001,
"loss": 1.7235,
"step": 826
},
{
"epoch": 0.05867663238042017,
"grad_norm": 0.26171875,
"learning_rate": 0.001,
"loss": 1.7161,
"step": 840
},
{
"epoch": 0.05965457625342717,
"grad_norm": 0.455078125,
"learning_rate": 0.001,
"loss": 1.812,
"step": 854
},
{
"epoch": 0.06063252012643417,
"grad_norm": 0.2099609375,
"learning_rate": 0.001,
"loss": 1.7164,
"step": 868
},
{
"epoch": 0.06161046399944117,
"grad_norm": 0.154296875,
"learning_rate": 0.001,
"loss": 1.7677,
"step": 882
},
{
"epoch": 0.06258840787244818,
"grad_norm": 0.1796875,
"learning_rate": 0.001,
"loss": 1.7001,
"step": 896
},
{
"epoch": 0.06356635174545518,
"grad_norm": 0.173828125,
"learning_rate": 0.001,
"loss": 1.7015,
"step": 910
},
{
"epoch": 0.06454429561846219,
"grad_norm": 0.203125,
"learning_rate": 0.001,
"loss": 1.6904,
"step": 924
},
{
"epoch": 0.06552223949146918,
"grad_norm": 0.2236328125,
"learning_rate": 0.001,
"loss": 1.6692,
"step": 938
},
{
"epoch": 0.06650018336447619,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.6802,
"step": 952
},
{
"epoch": 0.06747812723748318,
"grad_norm": 0.228515625,
"learning_rate": 0.001,
"loss": 1.6566,
"step": 966
},
{
"epoch": 0.0684560711104902,
"grad_norm": 0.275390625,
"learning_rate": 0.001,
"loss": 1.7042,
"step": 980
},
{
"epoch": 0.0694340149834972,
"grad_norm": 0.2158203125,
"learning_rate": 0.001,
"loss": 1.6653,
"step": 994
},
{
"epoch": 0.0704119588565042,
"grad_norm": 0.357421875,
"learning_rate": 0.001,
"loss": 1.6729,
"step": 1008
},
{
"epoch": 0.0713899027295112,
"grad_norm": 0.28125,
"learning_rate": 0.001,
"loss": 1.6864,
"step": 1022
},
{
"epoch": 0.0723678466025182,
"grad_norm": 0.1513671875,
"learning_rate": 0.001,
"loss": 1.6541,
"step": 1036
},
{
"epoch": 0.07334579047552521,
"grad_norm": 1.0390625,
"learning_rate": 0.001,
"loss": 1.7122,
"step": 1050
},
{
"epoch": 0.0743237343485322,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.7231,
"step": 1064
},
{
"epoch": 0.07530167822153921,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.6781,
"step": 1078
},
{
"epoch": 0.07627962209454622,
"grad_norm": 0.23828125,
"learning_rate": 0.001,
"loss": 1.667,
"step": 1092
},
{
"epoch": 0.07725756596755322,
"grad_norm": 0.2333984375,
"learning_rate": 0.001,
"loss": 1.6667,
"step": 1106
},
{
"epoch": 0.07823550984056023,
"grad_norm": 0.2119140625,
"learning_rate": 0.001,
"loss": 1.622,
"step": 1120
},
{
"epoch": 0.07921345371356722,
"grad_norm": 0.1533203125,
"learning_rate": 0.001,
"loss": 1.636,
"step": 1134
},
{
"epoch": 0.08019139758657423,
"grad_norm": 0.1943359375,
"learning_rate": 0.001,
"loss": 1.6423,
"step": 1148
},
{
"epoch": 0.08116934145958123,
"grad_norm": 0.1806640625,
"learning_rate": 0.001,
"loss": 1.6136,
"step": 1162
},
{
"epoch": 0.08214728533258824,
"grad_norm": 0.224609375,
"learning_rate": 0.001,
"loss": 1.5988,
"step": 1176
},
{
"epoch": 0.08312522920559523,
"grad_norm": 0.2890625,
"learning_rate": 0.001,
"loss": 1.6216,
"step": 1190
},
{
"epoch": 0.08410317307860224,
"grad_norm": 0.2060546875,
"learning_rate": 0.001,
"loss": 1.6296,
"step": 1204
},
{
"epoch": 0.08508111695160925,
"grad_norm": 0.2451171875,
"learning_rate": 0.001,
"loss": 1.5944,
"step": 1218
},
{
"epoch": 0.08605906082461624,
"grad_norm": 0.2392578125,
"learning_rate": 0.001,
"loss": 1.6144,
"step": 1232
},
{
"epoch": 0.08703700469762325,
"grad_norm": 0.357421875,
"learning_rate": 0.001,
"loss": 1.585,
"step": 1246
},
{
"epoch": 0.08801494857063025,
"grad_norm": 0.2470703125,
"learning_rate": 0.001,
"loss": 1.6033,
"step": 1260
},
{
"epoch": 0.08899289244363726,
"grad_norm": 0.201171875,
"learning_rate": 0.001,
"loss": 1.6122,
"step": 1274
},
{
"epoch": 0.08997083631664425,
"grad_norm": 0.396484375,
"learning_rate": 0.001,
"loss": 1.5978,
"step": 1288
},
{
"epoch": 0.09094878018965126,
"grad_norm": 0.4453125,
"learning_rate": 0.001,
"loss": 1.6392,
"step": 1302
},
{
"epoch": 0.09192672406265825,
"grad_norm": 0.2470703125,
"learning_rate": 0.001,
"loss": 1.6247,
"step": 1316
},
{
"epoch": 0.09290466793566526,
"grad_norm": 0.193359375,
"learning_rate": 0.001,
"loss": 1.5888,
"step": 1330
},
{
"epoch": 0.09388261180867227,
"grad_norm": 0.2392578125,
"learning_rate": 0.001,
"loss": 1.572,
"step": 1344
},
{
"epoch": 0.09486055568167927,
"grad_norm": 0.17578125,
"learning_rate": 0.001,
"loss": 1.5725,
"step": 1358
},
{
"epoch": 0.09583849955468628,
"grad_norm": 0.314453125,
"learning_rate": 0.001,
"loss": 1.6131,
"step": 1372
},
{
"epoch": 0.09681644342769327,
"grad_norm": 0.22265625,
"learning_rate": 0.001,
"loss": 1.5467,
"step": 1386
},
{
"epoch": 0.09779438730070028,
"grad_norm": 0.296875,
"learning_rate": 0.001,
"loss": 1.5843,
"step": 1400
},
{
"epoch": 0.09877233117370728,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.5519,
"step": 1414
},
{
"epoch": 0.09975027504671428,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.5531,
"step": 1428
},
{
"epoch": 0.09995983444807292,
"eval_loss": 2.1689391136169434,
"eval_runtime": 9.1273,
"eval_samples_per_second": 109.561,
"eval_steps_per_second": 1.424,
"step": 1431
},
{
"epoch": 0.10072821891972128,
"grad_norm": 0.2333984375,
"learning_rate": 0.001,
"loss": 1.561,
"step": 1442
},
{
"epoch": 0.10170616279272829,
"grad_norm": 0.302734375,
"learning_rate": 0.001,
"loss": 1.5818,
"step": 1456
},
{
"epoch": 0.1026841066657353,
"grad_norm": 0.294921875,
"learning_rate": 0.001,
"loss": 1.5653,
"step": 1470
},
{
"epoch": 0.10366205053874229,
"grad_norm": 0.296875,
"learning_rate": 0.001,
"loss": 1.585,
"step": 1484
},
{
"epoch": 0.1046399944117493,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.5502,
"step": 1498
},
{
"epoch": 0.1056179382847563,
"grad_norm": 0.5703125,
"learning_rate": 0.001,
"loss": 1.5873,
"step": 1512
},
{
"epoch": 0.1065958821577633,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.5825,
"step": 1526
},
{
"epoch": 0.1075738260307703,
"grad_norm": 0.2578125,
"learning_rate": 0.001,
"loss": 1.5712,
"step": 1540
},
{
"epoch": 0.10855176990377731,
"grad_norm": 0.2392578125,
"learning_rate": 0.001,
"loss": 1.5443,
"step": 1554
},
{
"epoch": 0.1095297137767843,
"grad_norm": 0.95703125,
"learning_rate": 0.001,
"loss": 1.5989,
"step": 1568
},
{
"epoch": 0.11050765764979131,
"grad_norm": 0.318359375,
"learning_rate": 0.001,
"loss": 1.6003,
"step": 1582
},
{
"epoch": 0.11148560152279832,
"grad_norm": 0.2578125,
"learning_rate": 0.001,
"loss": 1.5907,
"step": 1596
},
{
"epoch": 0.11246354539580532,
"grad_norm": 0.330078125,
"learning_rate": 0.001,
"loss": 1.5373,
"step": 1610
},
{
"epoch": 0.11344148926881233,
"grad_norm": 0.2392578125,
"learning_rate": 0.001,
"loss": 1.5531,
"step": 1624
},
{
"epoch": 0.11441943314181932,
"grad_norm": 0.232421875,
"learning_rate": 0.001,
"loss": 1.5317,
"step": 1638
},
{
"epoch": 0.11539737701482633,
"grad_norm": 0.3671875,
"learning_rate": 0.001,
"loss": 1.5157,
"step": 1652
},
{
"epoch": 0.11637532088783333,
"grad_norm": 0.2001953125,
"learning_rate": 0.001,
"loss": 1.5462,
"step": 1666
},
{
"epoch": 0.11735326476084033,
"grad_norm": 0.2236328125,
"learning_rate": 0.001,
"loss": 1.5598,
"step": 1680
},
{
"epoch": 0.11833120863384734,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.5345,
"step": 1694
},
{
"epoch": 0.11930915250685434,
"grad_norm": 0.470703125,
"learning_rate": 0.001,
"loss": 1.5324,
"step": 1708
},
{
"epoch": 0.12028709637986135,
"grad_norm": 0.2158203125,
"learning_rate": 0.001,
"loss": 1.498,
"step": 1722
},
{
"epoch": 0.12126504025286834,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.5072,
"step": 1736
},
{
"epoch": 0.12224298412587535,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.5364,
"step": 1750
},
{
"epoch": 0.12322092799888235,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.5,
"step": 1764
},
{
"epoch": 0.12419887187188935,
"grad_norm": 0.31640625,
"learning_rate": 0.001,
"loss": 1.5113,
"step": 1778
},
{
"epoch": 0.12517681574489636,
"grad_norm": 0.400390625,
"learning_rate": 0.001,
"loss": 1.5099,
"step": 1792
},
{
"epoch": 0.12615475961790334,
"grad_norm": 0.2294921875,
"learning_rate": 0.001,
"loss": 1.5076,
"step": 1806
},
{
"epoch": 0.12713270349091035,
"grad_norm": 0.205078125,
"learning_rate": 0.001,
"loss": 1.4994,
"step": 1820
},
{
"epoch": 0.12811064736391736,
"grad_norm": 0.349609375,
"learning_rate": 0.001,
"loss": 1.5135,
"step": 1834
},
{
"epoch": 0.12908859123692437,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.5214,
"step": 1848
},
{
"epoch": 0.13006653510993138,
"grad_norm": 0.326171875,
"learning_rate": 0.001,
"loss": 1.4525,
"step": 1862
},
{
"epoch": 0.13104447898293836,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.51,
"step": 1876
},
{
"epoch": 0.13202242285594537,
"grad_norm": 0.294921875,
"learning_rate": 0.001,
"loss": 1.4965,
"step": 1890
},
{
"epoch": 0.13300036672895238,
"grad_norm": 0.1943359375,
"learning_rate": 0.001,
"loss": 1.4854,
"step": 1904
},
{
"epoch": 0.1339783106019594,
"grad_norm": 0.1943359375,
"learning_rate": 0.001,
"loss": 1.4926,
"step": 1918
},
{
"epoch": 0.13495625447496637,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.4897,
"step": 1932
},
{
"epoch": 0.13593419834797338,
"grad_norm": 0.365234375,
"learning_rate": 0.001,
"loss": 1.498,
"step": 1946
},
{
"epoch": 0.1369121422209804,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.5029,
"step": 1960
},
{
"epoch": 0.1378900860939874,
"grad_norm": 0.419921875,
"learning_rate": 0.001,
"loss": 1.482,
"step": 1974
},
{
"epoch": 0.1388680299669944,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.5212,
"step": 1988
},
{
"epoch": 0.1398459738400014,
"grad_norm": 0.80859375,
"learning_rate": 0.001,
"loss": 1.549,
"step": 2002
},
{
"epoch": 0.1408239177130084,
"grad_norm": 0.36328125,
"learning_rate": 0.001,
"loss": 1.5148,
"step": 2016
},
{
"epoch": 0.1418018615860154,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.4993,
"step": 2030
},
{
"epoch": 0.1427798054590224,
"grad_norm": 0.220703125,
"learning_rate": 0.001,
"loss": 1.4985,
"step": 2044
},
{
"epoch": 0.14375774933202942,
"grad_norm": 0.25,
"learning_rate": 0.001,
"loss": 1.4938,
"step": 2058
},
{
"epoch": 0.1447356932050364,
"grad_norm": 0.177734375,
"learning_rate": 0.001,
"loss": 1.4777,
"step": 2072
},
{
"epoch": 0.1457136370780434,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.4865,
"step": 2086
},
{
"epoch": 0.14669158095105042,
"grad_norm": 0.19140625,
"learning_rate": 0.001,
"loss": 1.4567,
"step": 2100
},
{
"epoch": 0.14766952482405743,
"grad_norm": 0.2099609375,
"learning_rate": 0.001,
"loss": 1.4706,
"step": 2114
},
{
"epoch": 0.1486474686970644,
"grad_norm": 0.216796875,
"learning_rate": 0.001,
"loss": 1.4959,
"step": 2128
},
{
"epoch": 0.14962541257007142,
"grad_norm": 0.224609375,
"learning_rate": 0.001,
"loss": 1.4631,
"step": 2142
},
{
"epoch": 0.15060335644307843,
"grad_norm": 0.2490234375,
"learning_rate": 0.001,
"loss": 1.4669,
"step": 2156
},
{
"epoch": 0.15158130031608544,
"grad_norm": 0.2099609375,
"learning_rate": 0.001,
"loss": 1.4479,
"step": 2170
},
{
"epoch": 0.15255924418909245,
"grad_norm": 0.25,
"learning_rate": 0.001,
"loss": 1.4584,
"step": 2184
},
{
"epoch": 0.15353718806209943,
"grad_norm": 0.345703125,
"learning_rate": 0.001,
"loss": 1.4609,
"step": 2198
},
{
"epoch": 0.15451513193510644,
"grad_norm": 0.27734375,
"learning_rate": 0.001,
"loss": 1.4499,
"step": 2212
},
{
"epoch": 0.15549307580811345,
"grad_norm": 0.2373046875,
"learning_rate": 0.001,
"loss": 1.4562,
"step": 2226
},
{
"epoch": 0.15647101968112045,
"grad_norm": 0.302734375,
"learning_rate": 0.001,
"loss": 1.4745,
"step": 2240
},
{
"epoch": 0.15744896355412744,
"grad_norm": 0.330078125,
"learning_rate": 0.001,
"loss": 1.4773,
"step": 2254
},
{
"epoch": 0.15842690742713444,
"grad_norm": 0.4609375,
"learning_rate": 0.001,
"loss": 1.4501,
"step": 2268
},
{
"epoch": 0.15940485130014145,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.4378,
"step": 2282
},
{
"epoch": 0.16038279517314846,
"grad_norm": 0.19921875,
"learning_rate": 0.001,
"loss": 1.4452,
"step": 2296
},
{
"epoch": 0.16136073904615547,
"grad_norm": 0.322265625,
"learning_rate": 0.001,
"loss": 1.4536,
"step": 2310
},
{
"epoch": 0.16233868291916245,
"grad_norm": 0.302734375,
"learning_rate": 0.001,
"loss": 1.462,
"step": 2324
},
{
"epoch": 0.16331662679216946,
"grad_norm": 0.2353515625,
"learning_rate": 0.001,
"loss": 1.451,
"step": 2338
},
{
"epoch": 0.16429457066517647,
"grad_norm": 0.345703125,
"learning_rate": 0.001,
"loss": 1.4266,
"step": 2352
},
{
"epoch": 0.16527251453818348,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.4503,
"step": 2366
},
{
"epoch": 0.16625045841119046,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.4502,
"step": 2380
},
{
"epoch": 0.16722840228419747,
"grad_norm": 0.279296875,
"learning_rate": 0.001,
"loss": 1.4587,
"step": 2394
},
{
"epoch": 0.16820634615720448,
"grad_norm": 0.279296875,
"learning_rate": 0.001,
"loss": 1.4588,
"step": 2408
},
{
"epoch": 0.1691842900302115,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.4679,
"step": 2422
},
{
"epoch": 0.1701622339032185,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.4329,
"step": 2436
},
{
"epoch": 0.17114017777622548,
"grad_norm": 1.109375,
"learning_rate": 0.001,
"loss": 1.4566,
"step": 2450
},
{
"epoch": 0.1721181216492325,
"grad_norm": 0.357421875,
"learning_rate": 0.001,
"loss": 1.4728,
"step": 2464
},
{
"epoch": 0.1730960655222395,
"grad_norm": 0.2734375,
"learning_rate": 0.001,
"loss": 1.4518,
"step": 2478
},
{
"epoch": 0.1740740093952465,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.4507,
"step": 2492
},
{
"epoch": 0.17505195326825349,
"grad_norm": 0.2412109375,
"learning_rate": 0.001,
"loss": 1.4242,
"step": 2506
},
{
"epoch": 0.1760298971412605,
"grad_norm": 0.197265625,
"learning_rate": 0.001,
"loss": 1.4116,
"step": 2520
},
{
"epoch": 0.1770078410142675,
"grad_norm": 0.3671875,
"learning_rate": 0.001,
"loss": 1.4294,
"step": 2534
},
{
"epoch": 0.1779857848872745,
"grad_norm": 0.451171875,
"learning_rate": 0.001,
"loss": 1.4448,
"step": 2548
},
{
"epoch": 0.17896372876028152,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.4468,
"step": 2562
},
{
"epoch": 0.1799416726332885,
"grad_norm": 0.419921875,
"learning_rate": 0.001,
"loss": 1.4319,
"step": 2576
},
{
"epoch": 0.1809196165062955,
"grad_norm": 0.2275390625,
"learning_rate": 0.001,
"loss": 1.4309,
"step": 2590
},
{
"epoch": 0.18189756037930252,
"grad_norm": 0.236328125,
"learning_rate": 0.001,
"loss": 1.4324,
"step": 2604
},
{
"epoch": 0.18287550425230953,
"grad_norm": 0.1875,
"learning_rate": 0.001,
"loss": 1.4143,
"step": 2618
},
{
"epoch": 0.1838534481253165,
"grad_norm": 0.197265625,
"learning_rate": 0.001,
"loss": 1.417,
"step": 2632
},
{
"epoch": 0.18483139199832352,
"grad_norm": 0.2099609375,
"learning_rate": 0.001,
"loss": 1.4318,
"step": 2646
},
{
"epoch": 0.18580933587133053,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.4151,
"step": 2660
},
{
"epoch": 0.18678727974433754,
"grad_norm": 0.275390625,
"learning_rate": 0.001,
"loss": 1.4169,
"step": 2674
},
{
"epoch": 0.18776522361734455,
"grad_norm": 0.302734375,
"learning_rate": 0.001,
"loss": 1.4352,
"step": 2688
},
{
"epoch": 0.18874316749035153,
"grad_norm": 0.4296875,
"learning_rate": 0.001,
"loss": 1.4346,
"step": 2702
},
{
"epoch": 0.18972111136335854,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.4253,
"step": 2716
},
{
"epoch": 0.19069905523636554,
"grad_norm": 0.31640625,
"learning_rate": 0.001,
"loss": 1.4272,
"step": 2730
},
{
"epoch": 0.19167699910937255,
"grad_norm": 0.8984375,
"learning_rate": 0.001,
"loss": 1.4618,
"step": 2744
},
{
"epoch": 0.19265494298237953,
"grad_norm": 0.322265625,
"learning_rate": 0.001,
"loss": 1.4616,
"step": 2758
},
{
"epoch": 0.19363288685538654,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.4231,
"step": 2772
},
{
"epoch": 0.19461083072839355,
"grad_norm": 0.20703125,
"learning_rate": 0.001,
"loss": 1.4185,
"step": 2786
},
{
"epoch": 0.19558877460140056,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.4578,
"step": 2800
},
{
"epoch": 0.19656671847440757,
"grad_norm": 0.26953125,
"learning_rate": 0.001,
"loss": 1.4134,
"step": 2814
},
{
"epoch": 0.19754466234741455,
"grad_norm": 0.2099609375,
"learning_rate": 0.001,
"loss": 1.4253,
"step": 2828
},
{
"epoch": 0.19852260622042156,
"grad_norm": 0.208984375,
"learning_rate": 0.001,
"loss": 1.414,
"step": 2842
},
{
"epoch": 0.19950055009342857,
"grad_norm": 0.337890625,
"learning_rate": 0.001,
"loss": 1.4069,
"step": 2856
},
{
"epoch": 0.19991966889614585,
"eval_loss": 1.8960140943527222,
"eval_runtime": 9.2129,
"eval_samples_per_second": 108.543,
"eval_steps_per_second": 1.411,
"step": 2862
},
{
"epoch": 0.20047849396643558,
"grad_norm": 0.31640625,
"learning_rate": 0.001,
"loss": 1.4267,
"step": 2870
},
{
"epoch": 0.20145643783944256,
"grad_norm": 0.2373046875,
"learning_rate": 0.001,
"loss": 1.4186,
"step": 2884
},
{
"epoch": 0.20243438171244957,
"grad_norm": 0.2236328125,
"learning_rate": 0.001,
"loss": 1.4105,
"step": 2898
},
{
"epoch": 0.20341232558545658,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.4012,
"step": 2912
},
{
"epoch": 0.20439026945846359,
"grad_norm": 0.234375,
"learning_rate": 0.001,
"loss": 1.3917,
"step": 2926
},
{
"epoch": 0.2053682133314706,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3952,
"step": 2940
},
{
"epoch": 0.20634615720447758,
"grad_norm": 0.1923828125,
"learning_rate": 0.001,
"loss": 1.396,
"step": 2954
},
{
"epoch": 0.20732410107748458,
"grad_norm": 0.453125,
"learning_rate": 0.001,
"loss": 1.4556,
"step": 2968
},
{
"epoch": 0.2083020449504916,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.4215,
"step": 2982
},
{
"epoch": 0.2092799888234986,
"grad_norm": 0.2158203125,
"learning_rate": 0.001,
"loss": 1.4166,
"step": 2996
},
{
"epoch": 0.21025793269650558,
"grad_norm": 0.25,
"learning_rate": 0.001,
"loss": 1.3975,
"step": 3010
},
{
"epoch": 0.2112358765695126,
"grad_norm": 0.2392578125,
"learning_rate": 0.001,
"loss": 1.4019,
"step": 3024
},
{
"epoch": 0.2122138204425196,
"grad_norm": 0.2578125,
"learning_rate": 0.001,
"loss": 1.4016,
"step": 3038
},
{
"epoch": 0.2131917643155266,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.4069,
"step": 3052
},
{
"epoch": 0.21416970818853362,
"grad_norm": 0.275390625,
"learning_rate": 0.001,
"loss": 1.4214,
"step": 3066
},
{
"epoch": 0.2151476520615406,
"grad_norm": 0.23046875,
"learning_rate": 0.001,
"loss": 1.4123,
"step": 3080
},
{
"epoch": 0.2161255959345476,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.3981,
"step": 3094
},
{
"epoch": 0.21710353980755462,
"grad_norm": 0.2451171875,
"learning_rate": 0.001,
"loss": 1.4067,
"step": 3108
},
{
"epoch": 0.21808148368056163,
"grad_norm": 0.2373046875,
"learning_rate": 0.001,
"loss": 1.3893,
"step": 3122
},
{
"epoch": 0.2190594275535686,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.4263,
"step": 3136
},
{
"epoch": 0.22003737142657562,
"grad_norm": 0.318359375,
"learning_rate": 0.001,
"loss": 1.3931,
"step": 3150
},
{
"epoch": 0.22101531529958263,
"grad_norm": 0.24609375,
"learning_rate": 0.001,
"loss": 1.3901,
"step": 3164
},
{
"epoch": 0.22199325917258964,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3805,
"step": 3178
},
{
"epoch": 0.22297120304559664,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.4044,
"step": 3192
},
{
"epoch": 0.22394914691860363,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.4046,
"step": 3206
},
{
"epoch": 0.22492709079161063,
"grad_norm": 0.369140625,
"learning_rate": 0.001,
"loss": 1.4084,
"step": 3220
},
{
"epoch": 0.22590503466461764,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.3929,
"step": 3234
},
{
"epoch": 0.22688297853762465,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.3462,
"step": 3248
},
{
"epoch": 0.22786092241063163,
"grad_norm": 0.29296875,
"learning_rate": 0.001,
"loss": 1.3721,
"step": 3262
},
{
"epoch": 0.22883886628363864,
"grad_norm": 0.3359375,
"learning_rate": 0.001,
"loss": 1.4027,
"step": 3276
},
{
"epoch": 0.22981681015664565,
"grad_norm": 0.2890625,
"learning_rate": 0.001,
"loss": 1.4081,
"step": 3290
},
{
"epoch": 0.23079475402965266,
"grad_norm": 0.34765625,
"learning_rate": 0.001,
"loss": 1.3706,
"step": 3304
},
{
"epoch": 0.23177269790265967,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.3652,
"step": 3318
},
{
"epoch": 0.23275064177566665,
"grad_norm": 0.4296875,
"learning_rate": 0.001,
"loss": 1.3623,
"step": 3332
},
{
"epoch": 0.23372858564867366,
"grad_norm": 0.236328125,
"learning_rate": 0.001,
"loss": 1.37,
"step": 3346
},
{
"epoch": 0.23470652952168067,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.3627,
"step": 3360
},
{
"epoch": 0.23568447339468768,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.3979,
"step": 3374
},
{
"epoch": 0.23666241726769469,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.3967,
"step": 3388
},
{
"epoch": 0.23764036114070167,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.3696,
"step": 3402
},
{
"epoch": 0.23861830501370868,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.3867,
"step": 3416
},
{
"epoch": 0.23959624888671568,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.4115,
"step": 3430
},
{
"epoch": 0.2405741927597227,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.3501,
"step": 3444
},
{
"epoch": 0.24155213663272967,
"grad_norm": 0.337890625,
"learning_rate": 0.001,
"loss": 1.377,
"step": 3458
},
{
"epoch": 0.24253008050573668,
"grad_norm": 0.205078125,
"learning_rate": 0.001,
"loss": 1.3585,
"step": 3472
},
{
"epoch": 0.2435080243787437,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.36,
"step": 3486
},
{
"epoch": 0.2444859682517507,
"grad_norm": 0.296875,
"learning_rate": 0.001,
"loss": 1.3759,
"step": 3500
},
{
"epoch": 0.2454639121247577,
"grad_norm": 0.24609375,
"learning_rate": 0.001,
"loss": 1.3782,
"step": 3514
},
{
"epoch": 0.2464418559977647,
"grad_norm": 0.294921875,
"learning_rate": 0.001,
"loss": 1.3489,
"step": 3528
},
{
"epoch": 0.2474197998707717,
"grad_norm": 1.203125,
"learning_rate": 0.001,
"loss": 1.4092,
"step": 3542
},
{
"epoch": 0.2483977437437787,
"grad_norm": 0.38671875,
"learning_rate": 0.001,
"loss": 1.3806,
"step": 3556
},
{
"epoch": 0.24937568761678572,
"grad_norm": 0.353515625,
"learning_rate": 0.001,
"loss": 1.3973,
"step": 3570
},
{
"epoch": 0.2503536314897927,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.3725,
"step": 3584
},
{
"epoch": 0.2513315753627997,
"grad_norm": 0.28125,
"learning_rate": 0.001,
"loss": 1.3523,
"step": 3598
},
{
"epoch": 0.2523095192358067,
"grad_norm": 0.41796875,
"learning_rate": 0.001,
"loss": 1.3929,
"step": 3612
},
{
"epoch": 0.2532874631088137,
"grad_norm": 0.359375,
"learning_rate": 0.001,
"loss": 1.3541,
"step": 3626
},
{
"epoch": 0.2542654069818207,
"grad_norm": 0.2197265625,
"learning_rate": 0.001,
"loss": 1.3471,
"step": 3640
},
{
"epoch": 0.25524335085482774,
"grad_norm": 0.248046875,
"learning_rate": 0.001,
"loss": 1.3887,
"step": 3654
},
{
"epoch": 0.2562212947278347,
"grad_norm": 0.38671875,
"learning_rate": 0.001,
"loss": 1.3675,
"step": 3668
},
{
"epoch": 0.2571992386008417,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3591,
"step": 3682
},
{
"epoch": 0.25817718247384874,
"grad_norm": 0.462890625,
"learning_rate": 0.001,
"loss": 1.3813,
"step": 3696
},
{
"epoch": 0.2591551263468557,
"grad_norm": 0.2099609375,
"learning_rate": 0.001,
"loss": 1.3555,
"step": 3710
},
{
"epoch": 0.26013307021986276,
"grad_norm": 0.95703125,
"learning_rate": 0.001,
"loss": 1.3931,
"step": 3724
},
{
"epoch": 0.26111101409286974,
"grad_norm": 0.60546875,
"learning_rate": 0.001,
"loss": 1.4361,
"step": 3738
},
{
"epoch": 0.2620889579658767,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3841,
"step": 3752
},
{
"epoch": 0.26306690183888376,
"grad_norm": 0.2490234375,
"learning_rate": 0.001,
"loss": 1.3808,
"step": 3766
},
{
"epoch": 0.26404484571189074,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.3792,
"step": 3780
},
{
"epoch": 0.2650227895848978,
"grad_norm": 0.2109375,
"learning_rate": 0.001,
"loss": 1.3488,
"step": 3794
},
{
"epoch": 0.26600073345790476,
"grad_norm": 0.3359375,
"learning_rate": 0.001,
"loss": 1.3644,
"step": 3808
},
{
"epoch": 0.26697867733091174,
"grad_norm": 0.359375,
"learning_rate": 0.001,
"loss": 1.3707,
"step": 3822
},
{
"epoch": 0.2679566212039188,
"grad_norm": 0.2236328125,
"learning_rate": 0.001,
"loss": 1.3584,
"step": 3836
},
{
"epoch": 0.26893456507692576,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3605,
"step": 3850
},
{
"epoch": 0.26991250894993274,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.3703,
"step": 3864
},
{
"epoch": 0.2708904528229398,
"grad_norm": 0.353515625,
"learning_rate": 0.001,
"loss": 1.3507,
"step": 3878
},
{
"epoch": 0.27186839669594676,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.3601,
"step": 3892
},
{
"epoch": 0.2728463405689538,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3663,
"step": 3906
},
{
"epoch": 0.2738242844419608,
"grad_norm": 0.2490234375,
"learning_rate": 0.001,
"loss": 1.3625,
"step": 3920
},
{
"epoch": 0.27480222831496776,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.3722,
"step": 3934
},
{
"epoch": 0.2757801721879748,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.342,
"step": 3948
},
{
"epoch": 0.2767581160609818,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.3682,
"step": 3962
},
{
"epoch": 0.2777360599339888,
"grad_norm": 0.2373046875,
"learning_rate": 0.001,
"loss": 1.3662,
"step": 3976
},
{
"epoch": 0.2787140038069958,
"grad_norm": 0.24609375,
"learning_rate": 0.001,
"loss": 1.3528,
"step": 3990
},
{
"epoch": 0.2796919476800028,
"grad_norm": 0.27734375,
"learning_rate": 0.001,
"loss": 1.3576,
"step": 4004
},
{
"epoch": 0.2806698915530098,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.3744,
"step": 4018
},
{
"epoch": 0.2816478354260168,
"grad_norm": 0.2333984375,
"learning_rate": 0.001,
"loss": 1.3554,
"step": 4032
},
{
"epoch": 0.2826257792990238,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.3518,
"step": 4046
},
{
"epoch": 0.2836037231720308,
"grad_norm": 0.25,
"learning_rate": 0.001,
"loss": 1.3595,
"step": 4060
},
{
"epoch": 0.2845816670450378,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.3361,
"step": 4074
},
{
"epoch": 0.2855596109180448,
"grad_norm": 0.275390625,
"learning_rate": 0.001,
"loss": 1.3664,
"step": 4088
},
{
"epoch": 0.2865375547910518,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.3633,
"step": 4102
},
{
"epoch": 0.28751549866405884,
"grad_norm": 0.2275390625,
"learning_rate": 0.001,
"loss": 1.3437,
"step": 4116
},
{
"epoch": 0.2884934425370658,
"grad_norm": 0.1943359375,
"learning_rate": 0.001,
"loss": 1.3401,
"step": 4130
},
{
"epoch": 0.2894713864100728,
"grad_norm": 0.2392578125,
"learning_rate": 0.001,
"loss": 1.3465,
"step": 4144
},
{
"epoch": 0.29044933028307984,
"grad_norm": 0.5,
"learning_rate": 0.001,
"loss": 1.3517,
"step": 4158
},
{
"epoch": 0.2914272741560868,
"grad_norm": 0.36328125,
"learning_rate": 0.001,
"loss": 1.3272,
"step": 4172
},
{
"epoch": 0.2924052180290938,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.3676,
"step": 4186
},
{
"epoch": 0.29338316190210084,
"grad_norm": 0.216796875,
"learning_rate": 0.001,
"loss": 1.3501,
"step": 4200
},
{
"epoch": 0.2943611057751078,
"grad_norm": 0.24609375,
"learning_rate": 0.001,
"loss": 1.3423,
"step": 4214
},
{
"epoch": 0.29533904964811486,
"grad_norm": 0.3671875,
"learning_rate": 0.001,
"loss": 1.3396,
"step": 4228
},
{
"epoch": 0.29631699352112184,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.3416,
"step": 4242
},
{
"epoch": 0.2972949373941288,
"grad_norm": 0.326171875,
"learning_rate": 0.001,
"loss": 1.3419,
"step": 4256
},
{
"epoch": 0.29827288126713586,
"grad_norm": 0.3671875,
"learning_rate": 0.001,
"loss": 1.3128,
"step": 4270
},
{
"epoch": 0.29925082514014284,
"grad_norm": 0.28125,
"learning_rate": 0.001,
"loss": 1.3528,
"step": 4284
},
{
"epoch": 0.29987950334421876,
"eval_loss": 1.7873083353042603,
"eval_runtime": 9.1065,
"eval_samples_per_second": 109.812,
"eval_steps_per_second": 1.428,
"step": 4293
},
{
"epoch": 0.3002287690131499,
"grad_norm": 0.330078125,
"learning_rate": 0.001,
"loss": 1.3496,
"step": 4298
},
{
"epoch": 0.30120671288615686,
"grad_norm": 0.2451171875,
"learning_rate": 0.001,
"loss": 1.3369,
"step": 4312
},
{
"epoch": 0.30218465675916384,
"grad_norm": 0.34765625,
"learning_rate": 0.001,
"loss": 1.3803,
"step": 4326
},
{
"epoch": 0.3031626006321709,
"grad_norm": 0.41796875,
"learning_rate": 0.001,
"loss": 1.324,
"step": 4340
},
{
"epoch": 0.30414054450517786,
"grad_norm": 0.42578125,
"learning_rate": 0.001,
"loss": 1.3559,
"step": 4354
},
{
"epoch": 0.3051184883781849,
"grad_norm": 0.373046875,
"learning_rate": 0.001,
"loss": 1.3166,
"step": 4368
},
{
"epoch": 0.3060964322511919,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.3376,
"step": 4382
},
{
"epoch": 0.30707437612419886,
"grad_norm": 0.2197265625,
"learning_rate": 0.001,
"loss": 1.3155,
"step": 4396
},
{
"epoch": 0.3080523199972059,
"grad_norm": 0.337890625,
"learning_rate": 0.001,
"loss": 1.3278,
"step": 4410
},
{
"epoch": 0.3090302638702129,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.34,
"step": 4424
},
{
"epoch": 0.31000820774321985,
"grad_norm": 0.2431640625,
"learning_rate": 0.001,
"loss": 1.313,
"step": 4438
},
{
"epoch": 0.3109861516162269,
"grad_norm": 0.359375,
"learning_rate": 0.001,
"loss": 1.3421,
"step": 4452
},
{
"epoch": 0.3119640954892339,
"grad_norm": 0.423828125,
"learning_rate": 0.001,
"loss": 1.3327,
"step": 4466
},
{
"epoch": 0.3129420393622409,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.3345,
"step": 4480
},
{
"epoch": 0.3139199832352479,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.3157,
"step": 4494
},
{
"epoch": 0.31489792710825487,
"grad_norm": 0.19921875,
"learning_rate": 0.001,
"loss": 1.334,
"step": 4508
},
{
"epoch": 0.3158758709812619,
"grad_norm": 0.26171875,
"learning_rate": 0.001,
"loss": 1.3526,
"step": 4522
},
{
"epoch": 0.3168538148542689,
"grad_norm": 0.35546875,
"learning_rate": 0.001,
"loss": 1.3565,
"step": 4536
},
{
"epoch": 0.3178317587272759,
"grad_norm": 0.33984375,
"learning_rate": 0.001,
"loss": 1.3343,
"step": 4550
},
{
"epoch": 0.3188097026002829,
"grad_norm": 0.28125,
"learning_rate": 0.001,
"loss": 1.3372,
"step": 4564
},
{
"epoch": 0.3197876464732899,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.3105,
"step": 4578
},
{
"epoch": 0.3207655903462969,
"grad_norm": 0.34765625,
"learning_rate": 0.001,
"loss": 1.3297,
"step": 4592
},
{
"epoch": 0.3217435342193039,
"grad_norm": 0.341796875,
"learning_rate": 0.001,
"loss": 1.3549,
"step": 4606
},
{
"epoch": 0.32272147809231094,
"grad_norm": 0.52734375,
"learning_rate": 0.001,
"loss": 1.3499,
"step": 4620
},
{
"epoch": 0.3236994219653179,
"grad_norm": 0.58984375,
"learning_rate": 0.001,
"loss": 1.3642,
"step": 4634
},
{
"epoch": 0.3246773658383249,
"grad_norm": 0.58984375,
"learning_rate": 0.001,
"loss": 1.3611,
"step": 4648
},
{
"epoch": 0.32565530971133194,
"grad_norm": 0.3359375,
"learning_rate": 0.001,
"loss": 1.3255,
"step": 4662
},
{
"epoch": 0.3266332535843389,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.3255,
"step": 4676
},
{
"epoch": 0.3276111974573459,
"grad_norm": 0.330078125,
"learning_rate": 0.001,
"loss": 1.3431,
"step": 4690
},
{
"epoch": 0.32858914133035294,
"grad_norm": 0.326171875,
"learning_rate": 0.001,
"loss": 1.322,
"step": 4704
},
{
"epoch": 0.3295670852033599,
"grad_norm": 0.26171875,
"learning_rate": 0.001,
"loss": 1.3271,
"step": 4718
},
{
"epoch": 0.33054502907636696,
"grad_norm": 0.353515625,
"learning_rate": 0.001,
"loss": 1.3364,
"step": 4732
},
{
"epoch": 0.33152297294937394,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.3311,
"step": 4746
},
{
"epoch": 0.3325009168223809,
"grad_norm": 0.33984375,
"learning_rate": 0.001,
"loss": 1.3555,
"step": 4760
},
{
"epoch": 0.33347886069538796,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.329,
"step": 4774
},
{
"epoch": 0.33445680456839494,
"grad_norm": 0.330078125,
"learning_rate": 0.001,
"loss": 1.3264,
"step": 4788
},
{
"epoch": 0.335434748441402,
"grad_norm": 0.279296875,
"learning_rate": 0.001,
"loss": 1.3142,
"step": 4802
},
{
"epoch": 0.33641269231440896,
"grad_norm": 0.205078125,
"learning_rate": 0.001,
"loss": 1.3202,
"step": 4816
},
{
"epoch": 0.33739063618741594,
"grad_norm": 0.234375,
"learning_rate": 0.001,
"loss": 1.3516,
"step": 4830
},
{
"epoch": 0.338368580060423,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3365,
"step": 4844
},
{
"epoch": 0.33934652393342996,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3334,
"step": 4858
},
{
"epoch": 0.340324467806437,
"grad_norm": 0.421875,
"learning_rate": 0.001,
"loss": 1.3485,
"step": 4872
},
{
"epoch": 0.341302411679444,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.3343,
"step": 4886
},
{
"epoch": 0.34228035555245095,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.3395,
"step": 4900
},
{
"epoch": 0.343258299425458,
"grad_norm": 0.359375,
"learning_rate": 0.001,
"loss": 1.3193,
"step": 4914
},
{
"epoch": 0.344236243298465,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.327,
"step": 4928
},
{
"epoch": 0.34521418717147195,
"grad_norm": 0.27734375,
"learning_rate": 0.001,
"loss": 1.2993,
"step": 4942
},
{
"epoch": 0.346192131044479,
"grad_norm": 0.3828125,
"learning_rate": 0.001,
"loss": 1.3182,
"step": 4956
},
{
"epoch": 0.34717007491748597,
"grad_norm": 0.2333984375,
"learning_rate": 0.001,
"loss": 1.3409,
"step": 4970
},
{
"epoch": 0.348148018790493,
"grad_norm": 0.345703125,
"learning_rate": 0.001,
"loss": 1.3354,
"step": 4984
},
{
"epoch": 0.3491259626635,
"grad_norm": 0.28125,
"learning_rate": 0.001,
"loss": 1.305,
"step": 4998
},
{
"epoch": 0.35010390653650697,
"grad_norm": 0.2216796875,
"learning_rate": 0.001,
"loss": 1.314,
"step": 5012
},
{
"epoch": 0.351081850409514,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.3267,
"step": 5026
},
{
"epoch": 0.352059794282521,
"grad_norm": 0.42578125,
"learning_rate": 0.001,
"loss": 1.3327,
"step": 5040
},
{
"epoch": 0.353037738155528,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.3113,
"step": 5054
},
{
"epoch": 0.354015682028535,
"grad_norm": 0.3984375,
"learning_rate": 0.001,
"loss": 1.3293,
"step": 5068
},
{
"epoch": 0.354993625901542,
"grad_norm": 0.294921875,
"learning_rate": 0.001,
"loss": 1.3094,
"step": 5082
},
{
"epoch": 0.355971569774549,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.3053,
"step": 5096
},
{
"epoch": 0.356949513647556,
"grad_norm": 0.2109375,
"learning_rate": 0.001,
"loss": 1.325,
"step": 5110
},
{
"epoch": 0.35792745752056304,
"grad_norm": 0.337890625,
"learning_rate": 0.001,
"loss": 1.3392,
"step": 5124
},
{
"epoch": 0.35890540139357,
"grad_norm": 0.412109375,
"learning_rate": 0.001,
"loss": 1.3363,
"step": 5138
},
{
"epoch": 0.359883345266577,
"grad_norm": 0.59375,
"learning_rate": 0.001,
"loss": 1.3693,
"step": 5152
},
{
"epoch": 0.36086128913958404,
"grad_norm": 0.390625,
"learning_rate": 0.001,
"loss": 1.3341,
"step": 5166
},
{
"epoch": 0.361839233012591,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.3104,
"step": 5180
},
{
"epoch": 0.362817176885598,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.3277,
"step": 5194
},
{
"epoch": 0.36379512075860504,
"grad_norm": 0.2275390625,
"learning_rate": 0.001,
"loss": 1.3274,
"step": 5208
},
{
"epoch": 0.364773064631612,
"grad_norm": 0.26953125,
"learning_rate": 0.001,
"loss": 1.3295,
"step": 5222
},
{
"epoch": 0.36575100850461906,
"grad_norm": 0.2490234375,
"learning_rate": 0.001,
"loss": 1.334,
"step": 5236
},
{
"epoch": 0.36672895237762604,
"grad_norm": 0.318359375,
"learning_rate": 0.001,
"loss": 1.3081,
"step": 5250
},
{
"epoch": 0.367706896250633,
"grad_norm": 0.232421875,
"learning_rate": 0.001,
"loss": 1.2691,
"step": 5264
},
{
"epoch": 0.36868484012364006,
"grad_norm": 0.47265625,
"learning_rate": 0.001,
"loss": 1.3241,
"step": 5278
},
{
"epoch": 0.36966278399664704,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.3015,
"step": 5292
},
{
"epoch": 0.3706407278696541,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.3269,
"step": 5306
},
{
"epoch": 0.37161867174266106,
"grad_norm": 0.23046875,
"learning_rate": 0.001,
"loss": 1.3249,
"step": 5320
},
{
"epoch": 0.37259661561566804,
"grad_norm": 0.384765625,
"learning_rate": 0.001,
"loss": 1.3091,
"step": 5334
},
{
"epoch": 0.3735745594886751,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.3251,
"step": 5348
},
{
"epoch": 0.37455250336168205,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2866,
"step": 5362
},
{
"epoch": 0.3755304472346891,
"grad_norm": 0.29296875,
"learning_rate": 0.001,
"loss": 1.2987,
"step": 5376
},
{
"epoch": 0.37650839110769607,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.3095,
"step": 5390
},
{
"epoch": 0.37748633498070305,
"grad_norm": 0.236328125,
"learning_rate": 0.001,
"loss": 1.304,
"step": 5404
},
{
"epoch": 0.3784642788537101,
"grad_norm": 0.21484375,
"learning_rate": 0.001,
"loss": 1.3158,
"step": 5418
},
{
"epoch": 0.37944222272671707,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.3164,
"step": 5432
},
{
"epoch": 0.3804201665997241,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.3078,
"step": 5446
},
{
"epoch": 0.3813981104727311,
"grad_norm": 0.365234375,
"learning_rate": 0.001,
"loss": 1.3603,
"step": 5460
},
{
"epoch": 0.38237605434573807,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.3174,
"step": 5474
},
{
"epoch": 0.3833539982187451,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.3204,
"step": 5488
},
{
"epoch": 0.3843319420917521,
"grad_norm": 0.33203125,
"learning_rate": 0.001,
"loss": 1.3005,
"step": 5502
},
{
"epoch": 0.38530988596475907,
"grad_norm": 0.3828125,
"learning_rate": 0.001,
"loss": 1.3396,
"step": 5516
},
{
"epoch": 0.3862878298377661,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.3231,
"step": 5530
},
{
"epoch": 0.3872657737107731,
"grad_norm": 0.375,
"learning_rate": 0.001,
"loss": 1.3353,
"step": 5544
},
{
"epoch": 0.3882437175837801,
"grad_norm": 0.33203125,
"learning_rate": 0.001,
"loss": 1.3314,
"step": 5558
},
{
"epoch": 0.3892216614567871,
"grad_norm": 0.35546875,
"learning_rate": 0.001,
"loss": 1.3228,
"step": 5572
},
{
"epoch": 0.3901996053297941,
"grad_norm": 0.294921875,
"learning_rate": 0.001,
"loss": 1.3034,
"step": 5586
},
{
"epoch": 0.3911775492028011,
"grad_norm": 0.349609375,
"learning_rate": 0.001,
"loss": 1.3307,
"step": 5600
},
{
"epoch": 0.3921554930758081,
"grad_norm": 0.26171875,
"learning_rate": 0.001,
"loss": 1.2888,
"step": 5614
},
{
"epoch": 0.39313343694881514,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.3107,
"step": 5628
},
{
"epoch": 0.3941113808218221,
"grad_norm": 0.3984375,
"learning_rate": 0.001,
"loss": 1.32,
"step": 5642
},
{
"epoch": 0.3950893246948291,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.3383,
"step": 5656
},
{
"epoch": 0.39606726856783614,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.2903,
"step": 5670
},
{
"epoch": 0.3970452124408431,
"grad_norm": 0.330078125,
"learning_rate": 0.001,
"loss": 1.3338,
"step": 5684
},
{
"epoch": 0.39802315631385016,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.3371,
"step": 5698
},
{
"epoch": 0.39900110018685714,
"grad_norm": 0.2265625,
"learning_rate": 0.001,
"loss": 1.3169,
"step": 5712
},
{
"epoch": 0.3998393377922917,
"eval_loss": 1.7229478359222412,
"eval_runtime": 9.1252,
"eval_samples_per_second": 109.587,
"eval_steps_per_second": 1.425,
"step": 5724
},
{
"epoch": 0.3999790440598641,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.3098,
"step": 5726
},
{
"epoch": 0.40095698793287116,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.3131,
"step": 5740
},
{
"epoch": 0.40193493180587814,
"grad_norm": 0.408203125,
"learning_rate": 0.001,
"loss": 1.3199,
"step": 5754
},
{
"epoch": 0.4029128756788851,
"grad_norm": 0.46875,
"learning_rate": 0.001,
"loss": 1.3238,
"step": 5768
},
{
"epoch": 0.40389081955189216,
"grad_norm": 0.46484375,
"learning_rate": 0.001,
"loss": 1.3192,
"step": 5782
},
{
"epoch": 0.40486876342489914,
"grad_norm": 0.31640625,
"learning_rate": 0.001,
"loss": 1.3274,
"step": 5796
},
{
"epoch": 0.4058467072979062,
"grad_norm": 0.318359375,
"learning_rate": 0.001,
"loss": 1.2836,
"step": 5810
},
{
"epoch": 0.40682465117091315,
"grad_norm": 0.353515625,
"learning_rate": 0.001,
"loss": 1.3021,
"step": 5824
},
{
"epoch": 0.40780259504392014,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.2903,
"step": 5838
},
{
"epoch": 0.40878053891692717,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.2965,
"step": 5852
},
{
"epoch": 0.40975848278993415,
"grad_norm": 0.2421875,
"learning_rate": 0.001,
"loss": 1.2816,
"step": 5866
},
{
"epoch": 0.4107364266629412,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.3082,
"step": 5880
},
{
"epoch": 0.41171437053594817,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.3229,
"step": 5894
},
{
"epoch": 0.41269231440895515,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.3056,
"step": 5908
},
{
"epoch": 0.4136702582819622,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.2618,
"step": 5922
},
{
"epoch": 0.41464820215496917,
"grad_norm": 0.24609375,
"learning_rate": 0.001,
"loss": 1.29,
"step": 5936
},
{
"epoch": 0.4156261460279762,
"grad_norm": 0.36328125,
"learning_rate": 0.001,
"loss": 1.3132,
"step": 5950
},
{
"epoch": 0.4166040899009832,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.3083,
"step": 5964
},
{
"epoch": 0.41758203377399017,
"grad_norm": 0.326171875,
"learning_rate": 0.001,
"loss": 1.3118,
"step": 5978
},
{
"epoch": 0.4185599776469972,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.3105,
"step": 5992
},
{
"epoch": 0.4195379215200042,
"grad_norm": 0.25,
"learning_rate": 0.001,
"loss": 1.3032,
"step": 6006
},
{
"epoch": 0.42051586539301117,
"grad_norm": 0.232421875,
"learning_rate": 0.001,
"loss": 1.2815,
"step": 6020
},
{
"epoch": 0.4214938092660182,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2913,
"step": 6034
},
{
"epoch": 0.4224717531390252,
"grad_norm": 0.2353515625,
"learning_rate": 0.001,
"loss": 1.3015,
"step": 6048
},
{
"epoch": 0.4234496970120322,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.3072,
"step": 6062
},
{
"epoch": 0.4244276408850392,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.3412,
"step": 6076
},
{
"epoch": 0.4254055847580462,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.3024,
"step": 6090
},
{
"epoch": 0.4263835286310532,
"grad_norm": 0.2255859375,
"learning_rate": 0.001,
"loss": 1.3124,
"step": 6104
},
{
"epoch": 0.4273614725040602,
"grad_norm": 0.55078125,
"learning_rate": 0.001,
"loss": 1.3066,
"step": 6118
},
{
"epoch": 0.42833941637706724,
"grad_norm": 0.353515625,
"learning_rate": 0.001,
"loss": 1.3194,
"step": 6132
},
{
"epoch": 0.4293173602500742,
"grad_norm": 0.314453125,
"learning_rate": 0.001,
"loss": 1.2943,
"step": 6146
},
{
"epoch": 0.4302953041230812,
"grad_norm": 0.279296875,
"learning_rate": 0.001,
"loss": 1.2726,
"step": 6160
},
{
"epoch": 0.43127324799608824,
"grad_norm": 0.2158203125,
"learning_rate": 0.001,
"loss": 1.2693,
"step": 6174
},
{
"epoch": 0.4322511918690952,
"grad_norm": 0.240234375,
"learning_rate": 0.001,
"loss": 1.2966,
"step": 6188
},
{
"epoch": 0.43322913574210226,
"grad_norm": 0.345703125,
"learning_rate": 0.001,
"loss": 1.3144,
"step": 6202
},
{
"epoch": 0.43420707961510924,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.2774,
"step": 6216
},
{
"epoch": 0.4351850234881162,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.3109,
"step": 6230
},
{
"epoch": 0.43616296736112325,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.2897,
"step": 6244
},
{
"epoch": 0.43714091123413024,
"grad_norm": 0.216796875,
"learning_rate": 0.001,
"loss": 1.2796,
"step": 6258
},
{
"epoch": 0.4381188551071372,
"grad_norm": 0.3125,
"learning_rate": 0.001,
"loss": 1.3235,
"step": 6272
},
{
"epoch": 0.43909679898014425,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.2996,
"step": 6286
},
{
"epoch": 0.44007474285315124,
"grad_norm": 0.32421875,
"learning_rate": 0.001,
"loss": 1.3074,
"step": 6300
},
{
"epoch": 0.44105268672615827,
"grad_norm": 0.34375,
"learning_rate": 0.001,
"loss": 1.2851,
"step": 6314
},
{
"epoch": 0.44203063059916525,
"grad_norm": 0.345703125,
"learning_rate": 0.001,
"loss": 1.3315,
"step": 6328
},
{
"epoch": 0.44300857447217223,
"grad_norm": 0.2421875,
"learning_rate": 0.001,
"loss": 1.3049,
"step": 6342
},
{
"epoch": 0.44398651834517927,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.3186,
"step": 6356
},
{
"epoch": 0.44496446221818625,
"grad_norm": 0.392578125,
"learning_rate": 0.001,
"loss": 1.276,
"step": 6370
},
{
"epoch": 0.4459424060911933,
"grad_norm": 0.3828125,
"learning_rate": 0.001,
"loss": 1.2827,
"step": 6384
},
{
"epoch": 0.44692034996420027,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.2925,
"step": 6398
},
{
"epoch": 0.44789829383720725,
"grad_norm": 0.2294921875,
"learning_rate": 0.001,
"loss": 1.2788,
"step": 6412
},
{
"epoch": 0.4488762377102143,
"grad_norm": 0.32421875,
"learning_rate": 0.001,
"loss": 1.2982,
"step": 6426
},
{
"epoch": 0.44985418158322127,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.2977,
"step": 6440
},
{
"epoch": 0.4508321254562283,
"grad_norm": 0.349609375,
"learning_rate": 0.001,
"loss": 1.3041,
"step": 6454
},
{
"epoch": 0.4518100693292353,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.2828,
"step": 6468
},
{
"epoch": 0.45278801320224227,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.2942,
"step": 6482
},
{
"epoch": 0.4537659570752493,
"grad_norm": 0.26953125,
"learning_rate": 0.001,
"loss": 1.2626,
"step": 6496
},
{
"epoch": 0.4547439009482563,
"grad_norm": 0.32421875,
"learning_rate": 0.001,
"loss": 1.2788,
"step": 6510
},
{
"epoch": 0.45572184482126327,
"grad_norm": 0.349609375,
"learning_rate": 0.001,
"loss": 1.2733,
"step": 6524
},
{
"epoch": 0.4566997886942703,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.3146,
"step": 6538
},
{
"epoch": 0.4576777325672773,
"grad_norm": 0.302734375,
"learning_rate": 0.001,
"loss": 1.3027,
"step": 6552
},
{
"epoch": 0.4586556764402843,
"grad_norm": 0.28125,
"learning_rate": 0.001,
"loss": 1.2753,
"step": 6566
},
{
"epoch": 0.4596336203132913,
"grad_norm": 0.439453125,
"learning_rate": 0.001,
"loss": 1.2955,
"step": 6580
},
{
"epoch": 0.4606115641862983,
"grad_norm": 0.353515625,
"learning_rate": 0.001,
"loss": 1.3105,
"step": 6594
},
{
"epoch": 0.4615895080593053,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.2798,
"step": 6608
},
{
"epoch": 0.4625674519323123,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.2983,
"step": 6622
},
{
"epoch": 0.46354539580531934,
"grad_norm": 0.52734375,
"learning_rate": 0.001,
"loss": 1.3166,
"step": 6636
},
{
"epoch": 0.4645233396783263,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.287,
"step": 6650
},
{
"epoch": 0.4655012835513333,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.28,
"step": 6664
},
{
"epoch": 0.46647922742434034,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.2896,
"step": 6678
},
{
"epoch": 0.4674571712973473,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.2933,
"step": 6692
},
{
"epoch": 0.46843511517035435,
"grad_norm": 0.34375,
"learning_rate": 0.001,
"loss": 1.2602,
"step": 6706
},
{
"epoch": 0.46941305904336134,
"grad_norm": 0.25,
"learning_rate": 0.001,
"loss": 1.267,
"step": 6720
},
{
"epoch": 0.4703910029163683,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.2745,
"step": 6734
},
{
"epoch": 0.47136894678937535,
"grad_norm": 0.453125,
"learning_rate": 0.001,
"loss": 1.3102,
"step": 6748
},
{
"epoch": 0.47234689066238233,
"grad_norm": 0.36328125,
"learning_rate": 0.001,
"loss": 1.2844,
"step": 6762
},
{
"epoch": 0.47332483453538937,
"grad_norm": 0.404296875,
"learning_rate": 0.001,
"loss": 1.2787,
"step": 6776
},
{
"epoch": 0.47430277840839635,
"grad_norm": 0.41796875,
"learning_rate": 0.001,
"loss": 1.2921,
"step": 6790
},
{
"epoch": 0.47528072228140333,
"grad_norm": 0.447265625,
"learning_rate": 0.001,
"loss": 1.3157,
"step": 6804
},
{
"epoch": 0.47625866615441037,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.2932,
"step": 6818
},
{
"epoch": 0.47723661002741735,
"grad_norm": 0.296875,
"learning_rate": 0.001,
"loss": 1.3011,
"step": 6832
},
{
"epoch": 0.47821455390042433,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.2883,
"step": 6846
},
{
"epoch": 0.47919249777343137,
"grad_norm": 0.2431640625,
"learning_rate": 0.001,
"loss": 1.3087,
"step": 6860
},
{
"epoch": 0.48017044164643835,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.2855,
"step": 6874
},
{
"epoch": 0.4811483855194454,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.3106,
"step": 6888
},
{
"epoch": 0.48212632939245237,
"grad_norm": 0.29296875,
"learning_rate": 0.001,
"loss": 1.2869,
"step": 6902
},
{
"epoch": 0.48310427326545935,
"grad_norm": 0.53125,
"learning_rate": 0.001,
"loss": 1.2852,
"step": 6916
},
{
"epoch": 0.4840822171384664,
"grad_norm": 0.46484375,
"learning_rate": 0.001,
"loss": 1.2995,
"step": 6930
},
{
"epoch": 0.48506016101147337,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.3098,
"step": 6944
},
{
"epoch": 0.4860381048844804,
"grad_norm": 0.322265625,
"learning_rate": 0.001,
"loss": 1.2687,
"step": 6958
},
{
"epoch": 0.4870160487574874,
"grad_norm": 0.400390625,
"learning_rate": 0.001,
"loss": 1.2885,
"step": 6972
},
{
"epoch": 0.48799399263049437,
"grad_norm": 0.302734375,
"learning_rate": 0.001,
"loss": 1.3135,
"step": 6986
},
{
"epoch": 0.4889719365035014,
"grad_norm": 0.314453125,
"learning_rate": 0.001,
"loss": 1.2776,
"step": 7000
},
{
"epoch": 0.4899498803765084,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2761,
"step": 7014
},
{
"epoch": 0.4909278242495154,
"grad_norm": 0.326171875,
"learning_rate": 0.001,
"loss": 1.2805,
"step": 7028
},
{
"epoch": 0.4919057681225224,
"grad_norm": 0.310546875,
"learning_rate": 0.001,
"loss": 1.2836,
"step": 7042
},
{
"epoch": 0.4928837119955294,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.3029,
"step": 7056
},
{
"epoch": 0.4938616558685364,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.2929,
"step": 7070
},
{
"epoch": 0.4948395997415434,
"grad_norm": 0.80859375,
"learning_rate": 0.001,
"loss": 1.2995,
"step": 7084
},
{
"epoch": 0.4958175436145504,
"grad_norm": 0.4609375,
"learning_rate": 0.001,
"loss": 1.2788,
"step": 7098
},
{
"epoch": 0.4967954874875574,
"grad_norm": 0.279296875,
"learning_rate": 0.001,
"loss": 1.276,
"step": 7112
},
{
"epoch": 0.4977734313605644,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.2883,
"step": 7126
},
{
"epoch": 0.49875137523357144,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.2803,
"step": 7140
},
{
"epoch": 0.4997293191065784,
"grad_norm": 0.22265625,
"learning_rate": 0.001,
"loss": 1.2576,
"step": 7154
},
{
"epoch": 0.49979917224036463,
"eval_loss": 1.6829583644866943,
"eval_runtime": 9.1256,
"eval_samples_per_second": 109.582,
"eval_steps_per_second": 1.425,
"step": 7155
},
{
"epoch": 0.5007072629795855,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.2987,
"step": 7168
},
{
"epoch": 0.5016852068525924,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.2759,
"step": 7182
},
{
"epoch": 0.5026631507255994,
"grad_norm": 0.376953125,
"learning_rate": 0.001,
"loss": 1.2814,
"step": 7196
},
{
"epoch": 0.5036410945986064,
"grad_norm": 0.359375,
"learning_rate": 0.001,
"loss": 1.2701,
"step": 7210
},
{
"epoch": 0.5046190384716134,
"grad_norm": 0.314453125,
"learning_rate": 0.001,
"loss": 1.2857,
"step": 7224
},
{
"epoch": 0.5055969823446205,
"grad_norm": 0.234375,
"learning_rate": 0.001,
"loss": 1.2707,
"step": 7238
},
{
"epoch": 0.5065749262176275,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.2851,
"step": 7252
},
{
"epoch": 0.5075528700906344,
"grad_norm": 0.337890625,
"learning_rate": 0.001,
"loss": 1.2722,
"step": 7266
},
{
"epoch": 0.5085308139636414,
"grad_norm": 0.345703125,
"learning_rate": 0.001,
"loss": 1.277,
"step": 7280
},
{
"epoch": 0.5095087578366484,
"grad_norm": 0.33203125,
"learning_rate": 0.001,
"loss": 1.3021,
"step": 7294
},
{
"epoch": 0.5104867017096555,
"grad_norm": 0.2314453125,
"learning_rate": 0.001,
"loss": 1.2856,
"step": 7308
},
{
"epoch": 0.5114646455826625,
"grad_norm": 0.25,
"learning_rate": 0.001,
"loss": 1.2704,
"step": 7322
},
{
"epoch": 0.5124425894556695,
"grad_norm": 0.27734375,
"learning_rate": 0.001,
"loss": 1.2837,
"step": 7336
},
{
"epoch": 0.5134205333286764,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.277,
"step": 7350
},
{
"epoch": 0.5143984772016834,
"grad_norm": 0.2734375,
"learning_rate": 0.001,
"loss": 1.2838,
"step": 7364
},
{
"epoch": 0.5153764210746905,
"grad_norm": 0.296875,
"learning_rate": 0.001,
"loss": 1.2762,
"step": 7378
},
{
"epoch": 0.5163543649476975,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.2749,
"step": 7392
},
{
"epoch": 0.5173323088207045,
"grad_norm": 0.28125,
"learning_rate": 0.001,
"loss": 1.2791,
"step": 7406
},
{
"epoch": 0.5183102526937114,
"grad_norm": 0.400390625,
"learning_rate": 0.001,
"loss": 1.2708,
"step": 7420
},
{
"epoch": 0.5192881965667184,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.2773,
"step": 7434
},
{
"epoch": 0.5202661404397255,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.2783,
"step": 7448
},
{
"epoch": 0.5212440843127325,
"grad_norm": 0.341796875,
"learning_rate": 0.001,
"loss": 1.2944,
"step": 7462
},
{
"epoch": 0.5222220281857395,
"grad_norm": 0.369140625,
"learning_rate": 0.001,
"loss": 1.2714,
"step": 7476
},
{
"epoch": 0.5231999720587465,
"grad_norm": 0.2421875,
"learning_rate": 0.001,
"loss": 1.2711,
"step": 7490
},
{
"epoch": 0.5241779159317534,
"grad_norm": 0.51953125,
"learning_rate": 0.001,
"loss": 1.2808,
"step": 7504
},
{
"epoch": 0.5251558598047605,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2765,
"step": 7518
},
{
"epoch": 0.5261338036777675,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.2702,
"step": 7532
},
{
"epoch": 0.5271117475507745,
"grad_norm": 0.26953125,
"learning_rate": 0.001,
"loss": 1.2802,
"step": 7546
},
{
"epoch": 0.5280896914237815,
"grad_norm": 0.57421875,
"learning_rate": 0.001,
"loss": 1.2733,
"step": 7560
},
{
"epoch": 0.5290676352967885,
"grad_norm": 0.494140625,
"learning_rate": 0.001,
"loss": 1.2575,
"step": 7574
},
{
"epoch": 0.5300455791697956,
"grad_norm": 0.376953125,
"learning_rate": 0.001,
"loss": 1.2863,
"step": 7588
},
{
"epoch": 0.5310235230428025,
"grad_norm": 0.369140625,
"learning_rate": 0.001,
"loss": 1.2815,
"step": 7602
},
{
"epoch": 0.5320014669158095,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.2745,
"step": 7616
},
{
"epoch": 0.5329794107888165,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.294,
"step": 7630
},
{
"epoch": 0.5339573546618235,
"grad_norm": 0.2265625,
"learning_rate": 0.001,
"loss": 1.2797,
"step": 7644
},
{
"epoch": 0.5349352985348306,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.2665,
"step": 7658
},
{
"epoch": 0.5359132424078376,
"grad_norm": 0.458984375,
"learning_rate": 0.001,
"loss": 1.28,
"step": 7672
},
{
"epoch": 0.5368911862808445,
"grad_norm": 0.359375,
"learning_rate": 0.001,
"loss": 1.3057,
"step": 7686
},
{
"epoch": 0.5378691301538515,
"grad_norm": 0.37109375,
"learning_rate": 0.001,
"loss": 1.258,
"step": 7700
},
{
"epoch": 0.5388470740268585,
"grad_norm": 0.33203125,
"learning_rate": 0.001,
"loss": 1.2742,
"step": 7714
},
{
"epoch": 0.5398250178998655,
"grad_norm": 0.365234375,
"learning_rate": 0.001,
"loss": 1.277,
"step": 7728
},
{
"epoch": 0.5408029617728726,
"grad_norm": 0.412109375,
"learning_rate": 0.001,
"loss": 1.2819,
"step": 7742
},
{
"epoch": 0.5417809056458796,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.3018,
"step": 7756
},
{
"epoch": 0.5427588495188865,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2619,
"step": 7770
},
{
"epoch": 0.5437367933918935,
"grad_norm": 0.296875,
"learning_rate": 0.001,
"loss": 1.2513,
"step": 7784
},
{
"epoch": 0.5447147372649005,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2456,
"step": 7798
},
{
"epoch": 0.5456926811379076,
"grad_norm": 0.62109375,
"learning_rate": 0.001,
"loss": 1.2768,
"step": 7812
},
{
"epoch": 0.5466706250109146,
"grad_norm": 0.44140625,
"learning_rate": 0.001,
"loss": 1.265,
"step": 7826
},
{
"epoch": 0.5476485688839215,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.2683,
"step": 7840
},
{
"epoch": 0.5486265127569285,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.2809,
"step": 7854
},
{
"epoch": 0.5496044566299355,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2498,
"step": 7868
},
{
"epoch": 0.5505824005029426,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.2632,
"step": 7882
},
{
"epoch": 0.5515603443759496,
"grad_norm": 0.29296875,
"learning_rate": 0.001,
"loss": 1.2711,
"step": 7896
},
{
"epoch": 0.5525382882489566,
"grad_norm": 0.373046875,
"learning_rate": 0.001,
"loss": 1.2813,
"step": 7910
},
{
"epoch": 0.5535162321219635,
"grad_norm": 0.427734375,
"learning_rate": 0.001,
"loss": 1.2993,
"step": 7924
},
{
"epoch": 0.5544941759949705,
"grad_norm": 0.373046875,
"learning_rate": 0.001,
"loss": 1.3001,
"step": 7938
},
{
"epoch": 0.5554721198679776,
"grad_norm": 0.416015625,
"learning_rate": 0.001,
"loss": 1.2786,
"step": 7952
},
{
"epoch": 0.5564500637409846,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.2976,
"step": 7966
},
{
"epoch": 0.5574280076139916,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.286,
"step": 7980
},
{
"epoch": 0.5584059514869986,
"grad_norm": 0.59765625,
"learning_rate": 0.001,
"loss": 1.282,
"step": 7994
},
{
"epoch": 0.5593838953600055,
"grad_norm": 0.244140625,
"learning_rate": 0.001,
"loss": 1.2853,
"step": 8008
},
{
"epoch": 0.5603618392330126,
"grad_norm": 0.2265625,
"learning_rate": 0.001,
"loss": 1.2572,
"step": 8022
},
{
"epoch": 0.5613397831060196,
"grad_norm": 0.306640625,
"learning_rate": 0.001,
"loss": 1.2572,
"step": 8036
},
{
"epoch": 0.5623177269790266,
"grad_norm": 0.63671875,
"learning_rate": 0.001,
"loss": 1.315,
"step": 8050
},
{
"epoch": 0.5632956708520336,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.3007,
"step": 8064
},
{
"epoch": 0.5642736147250406,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2737,
"step": 8078
},
{
"epoch": 0.5652515585980477,
"grad_norm": 0.24609375,
"learning_rate": 0.001,
"loss": 1.2766,
"step": 8092
},
{
"epoch": 0.5662295024710546,
"grad_norm": 0.314453125,
"learning_rate": 0.001,
"loss": 1.3102,
"step": 8106
},
{
"epoch": 0.5672074463440616,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.3044,
"step": 8120
},
{
"epoch": 0.5681853902170686,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.2612,
"step": 8134
},
{
"epoch": 0.5691633340900756,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.2701,
"step": 8148
},
{
"epoch": 0.5701412779630827,
"grad_norm": 0.21875,
"learning_rate": 0.001,
"loss": 1.2649,
"step": 8162
},
{
"epoch": 0.5711192218360897,
"grad_norm": 0.2236328125,
"learning_rate": 0.001,
"loss": 1.2761,
"step": 8176
},
{
"epoch": 0.5720971657090966,
"grad_norm": 0.29296875,
"learning_rate": 0.001,
"loss": 1.2668,
"step": 8190
},
{
"epoch": 0.5730751095821036,
"grad_norm": 0.31640625,
"learning_rate": 0.001,
"loss": 1.2847,
"step": 8204
},
{
"epoch": 0.5740530534551106,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.2722,
"step": 8218
},
{
"epoch": 0.5750309973281177,
"grad_norm": 0.224609375,
"learning_rate": 0.001,
"loss": 1.253,
"step": 8232
},
{
"epoch": 0.5760089412011247,
"grad_norm": 0.2890625,
"learning_rate": 0.001,
"loss": 1.2454,
"step": 8246
},
{
"epoch": 0.5769868850741317,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.2558,
"step": 8260
},
{
"epoch": 0.5779648289471386,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.2765,
"step": 8274
},
{
"epoch": 0.5789427728201456,
"grad_norm": 0.314453125,
"learning_rate": 0.001,
"loss": 1.289,
"step": 8288
},
{
"epoch": 0.5799207166931526,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.2724,
"step": 8302
},
{
"epoch": 0.5808986605661597,
"grad_norm": 0.44140625,
"learning_rate": 0.001,
"loss": 1.2753,
"step": 8316
},
{
"epoch": 0.5818766044391667,
"grad_norm": 0.326171875,
"learning_rate": 0.001,
"loss": 1.2558,
"step": 8330
},
{
"epoch": 0.5828545483121736,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2697,
"step": 8344
},
{
"epoch": 0.5838324921851806,
"grad_norm": 0.34375,
"learning_rate": 0.001,
"loss": 1.2685,
"step": 8358
},
{
"epoch": 0.5848104360581876,
"grad_norm": 0.275390625,
"learning_rate": 0.001,
"loss": 1.2724,
"step": 8372
},
{
"epoch": 0.5857883799311947,
"grad_norm": 0.2255859375,
"learning_rate": 0.001,
"loss": 1.2287,
"step": 8386
},
{
"epoch": 0.5867663238042017,
"grad_norm": 0.212890625,
"learning_rate": 0.001,
"loss": 1.2363,
"step": 8400
},
{
"epoch": 0.5877442676772087,
"grad_norm": 0.279296875,
"learning_rate": 0.001,
"loss": 1.2648,
"step": 8414
},
{
"epoch": 0.5887222115502156,
"grad_norm": 0.427734375,
"learning_rate": 0.001,
"loss": 1.2949,
"step": 8428
},
{
"epoch": 0.5897001554232226,
"grad_norm": 0.23828125,
"learning_rate": 0.001,
"loss": 1.2571,
"step": 8442
},
{
"epoch": 0.5906780992962297,
"grad_norm": 0.349609375,
"learning_rate": 0.001,
"loss": 1.2831,
"step": 8456
},
{
"epoch": 0.5916560431692367,
"grad_norm": 0.34765625,
"learning_rate": 0.001,
"loss": 1.2965,
"step": 8470
},
{
"epoch": 0.5926339870422437,
"grad_norm": 0.412109375,
"learning_rate": 0.001,
"loss": 1.2685,
"step": 8484
},
{
"epoch": 0.5936119309152507,
"grad_norm": 0.439453125,
"learning_rate": 0.001,
"loss": 1.2637,
"step": 8498
},
{
"epoch": 0.5945898747882576,
"grad_norm": 0.3671875,
"learning_rate": 0.001,
"loss": 1.28,
"step": 8512
},
{
"epoch": 0.5955678186612647,
"grad_norm": 0.43359375,
"learning_rate": 0.001,
"loss": 1.2636,
"step": 8526
},
{
"epoch": 0.5965457625342717,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.251,
"step": 8540
},
{
"epoch": 0.5975237064072787,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.262,
"step": 8554
},
{
"epoch": 0.5985016502802857,
"grad_norm": 0.365234375,
"learning_rate": 0.001,
"loss": 1.2696,
"step": 8568
},
{
"epoch": 0.5994795941532927,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.2872,
"step": 8582
},
{
"epoch": 0.5997590066884375,
"eval_loss": 1.661841869354248,
"eval_runtime": 9.1193,
"eval_samples_per_second": 109.657,
"eval_steps_per_second": 1.426,
"step": 8586
},
{
"epoch": 0.6004575380262998,
"grad_norm": 0.2451171875,
"learning_rate": 0.001,
"loss": 1.2767,
"step": 8596
},
{
"epoch": 0.6014354818993067,
"grad_norm": 0.33984375,
"learning_rate": 0.001,
"loss": 1.2623,
"step": 8610
},
{
"epoch": 0.6024134257723137,
"grad_norm": 0.26953125,
"learning_rate": 0.001,
"loss": 1.2617,
"step": 8624
},
{
"epoch": 0.6033913696453207,
"grad_norm": 0.25390625,
"learning_rate": 0.001,
"loss": 1.2514,
"step": 8638
},
{
"epoch": 0.6043693135183277,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.2664,
"step": 8652
},
{
"epoch": 0.6053472573913348,
"grad_norm": 0.357421875,
"learning_rate": 0.001,
"loss": 1.2421,
"step": 8666
},
{
"epoch": 0.6063252012643418,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2386,
"step": 8680
},
{
"epoch": 0.6073031451373487,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.2601,
"step": 8694
},
{
"epoch": 0.6082810890103557,
"grad_norm": 0.94921875,
"learning_rate": 0.001,
"loss": 1.2715,
"step": 8708
},
{
"epoch": 0.6092590328833627,
"grad_norm": 0.43359375,
"learning_rate": 0.001,
"loss": 1.2848,
"step": 8722
},
{
"epoch": 0.6102369767563698,
"grad_norm": 0.34375,
"learning_rate": 0.001,
"loss": 1.2632,
"step": 8736
},
{
"epoch": 0.6112149206293768,
"grad_norm": 0.283203125,
"learning_rate": 0.001,
"loss": 1.2912,
"step": 8750
},
{
"epoch": 0.6121928645023837,
"grad_norm": 0.388671875,
"learning_rate": 0.001,
"loss": 1.2613,
"step": 8764
},
{
"epoch": 0.6131708083753907,
"grad_norm": 0.27734375,
"learning_rate": 0.001,
"loss": 1.2357,
"step": 8778
},
{
"epoch": 0.6141487522483977,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.2541,
"step": 8792
},
{
"epoch": 0.6151266961214047,
"grad_norm": 0.32421875,
"learning_rate": 0.001,
"loss": 1.2746,
"step": 8806
},
{
"epoch": 0.6161046399944118,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2445,
"step": 8820
},
{
"epoch": 0.6170825838674188,
"grad_norm": 0.255859375,
"learning_rate": 0.001,
"loss": 1.2854,
"step": 8834
},
{
"epoch": 0.6180605277404257,
"grad_norm": 0.54296875,
"learning_rate": 0.001,
"loss": 1.2746,
"step": 8848
},
{
"epoch": 0.6190384716134327,
"grad_norm": 0.35546875,
"learning_rate": 0.001,
"loss": 1.2837,
"step": 8862
},
{
"epoch": 0.6200164154864397,
"grad_norm": 0.341796875,
"learning_rate": 0.001,
"loss": 1.2682,
"step": 8876
},
{
"epoch": 0.6209943593594468,
"grad_norm": 0.51953125,
"learning_rate": 0.001,
"loss": 1.2751,
"step": 8890
},
{
"epoch": 0.6219723032324538,
"grad_norm": 0.41015625,
"learning_rate": 0.001,
"loss": 1.2666,
"step": 8904
},
{
"epoch": 0.6229502471054608,
"grad_norm": 0.4140625,
"learning_rate": 0.001,
"loss": 1.2618,
"step": 8918
},
{
"epoch": 0.6239281909784677,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.2721,
"step": 8932
},
{
"epoch": 0.6249061348514747,
"grad_norm": 0.3515625,
"learning_rate": 0.001,
"loss": 1.2528,
"step": 8946
},
{
"epoch": 0.6258840787244818,
"grad_norm": 0.34375,
"learning_rate": 0.001,
"loss": 1.2771,
"step": 8960
},
{
"epoch": 0.6268620225974888,
"grad_norm": 0.275390625,
"learning_rate": 0.001,
"loss": 1.2751,
"step": 8974
},
{
"epoch": 0.6278399664704958,
"grad_norm": 0.28515625,
"learning_rate": 0.001,
"loss": 1.2749,
"step": 8988
},
{
"epoch": 0.6288179103435028,
"grad_norm": 0.27734375,
"learning_rate": 0.001,
"loss": 1.2851,
"step": 9002
},
{
"epoch": 0.6297958542165097,
"grad_norm": 0.23828125,
"learning_rate": 0.001,
"loss": 1.2529,
"step": 9016
},
{
"epoch": 0.6307737980895168,
"grad_norm": 0.2890625,
"learning_rate": 0.001,
"loss": 1.2673,
"step": 9030
},
{
"epoch": 0.6317517419625238,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.2746,
"step": 9044
},
{
"epoch": 0.6327296858355308,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.2649,
"step": 9058
},
{
"epoch": 0.6337076297085378,
"grad_norm": 0.318359375,
"learning_rate": 0.001,
"loss": 1.2849,
"step": 9072
},
{
"epoch": 0.6346855735815448,
"grad_norm": 0.30078125,
"learning_rate": 0.001,
"loss": 1.243,
"step": 9086
},
{
"epoch": 0.6356635174545519,
"grad_norm": 0.375,
"learning_rate": 0.001,
"loss": 1.2641,
"step": 9100
},
{
"epoch": 0.6366414613275588,
"grad_norm": 0.361328125,
"learning_rate": 0.001,
"loss": 1.2554,
"step": 9114
},
{
"epoch": 0.6376194052005658,
"grad_norm": 0.396484375,
"learning_rate": 0.001,
"loss": 1.2396,
"step": 9128
},
{
"epoch": 0.6385973490735728,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2508,
"step": 9142
},
{
"epoch": 0.6395752929465798,
"grad_norm": 0.33984375,
"learning_rate": 0.001,
"loss": 1.2772,
"step": 9156
},
{
"epoch": 0.6405532368195869,
"grad_norm": 0.53515625,
"learning_rate": 0.001,
"loss": 1.2453,
"step": 9170
},
{
"epoch": 0.6415311806925938,
"grad_norm": 0.2099609375,
"learning_rate": 0.001,
"loss": 1.2764,
"step": 9184
},
{
"epoch": 0.6425091245656008,
"grad_norm": 0.2333984375,
"learning_rate": 0.001,
"loss": 1.251,
"step": 9198
},
{
"epoch": 0.6434870684386078,
"grad_norm": 0.35546875,
"learning_rate": 0.001,
"loss": 1.2855,
"step": 9212
},
{
"epoch": 0.6444650123116148,
"grad_norm": 1.1953125,
"learning_rate": 0.001,
"loss": 1.3198,
"step": 9226
},
{
"epoch": 0.6454429561846219,
"grad_norm": 0.427734375,
"learning_rate": 0.001,
"loss": 1.2773,
"step": 9240
},
{
"epoch": 0.6464209000576289,
"grad_norm": 0.5703125,
"learning_rate": 0.001,
"loss": 1.2786,
"step": 9254
},
{
"epoch": 0.6473988439306358,
"grad_norm": 0.3125,
"learning_rate": 0.001,
"loss": 1.2389,
"step": 9268
},
{
"epoch": 0.6483767878036428,
"grad_norm": 0.36328125,
"learning_rate": 0.001,
"loss": 1.2587,
"step": 9282
},
{
"epoch": 0.6493547316766498,
"grad_norm": 0.380859375,
"learning_rate": 0.001,
"loss": 1.2806,
"step": 9296
},
{
"epoch": 0.6503326755496568,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.2307,
"step": 9310
},
{
"epoch": 0.6513106194226639,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2657,
"step": 9324
},
{
"epoch": 0.6522885632956709,
"grad_norm": 0.263671875,
"learning_rate": 0.001,
"loss": 1.2605,
"step": 9338
},
{
"epoch": 0.6532665071686778,
"grad_norm": 0.2412109375,
"learning_rate": 0.001,
"loss": 1.2538,
"step": 9352
},
{
"epoch": 0.6542444510416848,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.2633,
"step": 9366
},
{
"epoch": 0.6552223949146918,
"grad_norm": 0.31640625,
"learning_rate": 0.001,
"loss": 1.2582,
"step": 9380
},
{
"epoch": 0.6562003387876989,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.2515,
"step": 9394
},
{
"epoch": 0.6571782826607059,
"grad_norm": 0.33984375,
"learning_rate": 0.001,
"loss": 1.2679,
"step": 9408
},
{
"epoch": 0.6581562265337129,
"grad_norm": 0.3828125,
"learning_rate": 0.001,
"loss": 1.2539,
"step": 9422
},
{
"epoch": 0.6591341704067198,
"grad_norm": 0.26171875,
"learning_rate": 0.001,
"loss": 1.2632,
"step": 9436
},
{
"epoch": 0.6601121142797268,
"grad_norm": 0.3203125,
"learning_rate": 0.001,
"loss": 1.2946,
"step": 9450
},
{
"epoch": 0.6610900581527339,
"grad_norm": 0.38671875,
"learning_rate": 0.001,
"loss": 1.2691,
"step": 9464
},
{
"epoch": 0.6620680020257409,
"grad_norm": 0.2890625,
"learning_rate": 0.001,
"loss": 1.246,
"step": 9478
},
{
"epoch": 0.6630459458987479,
"grad_norm": 0.431640625,
"learning_rate": 0.001,
"loss": 1.2606,
"step": 9492
},
{
"epoch": 0.6640238897717549,
"grad_norm": 0.8671875,
"learning_rate": 0.001,
"loss": 1.2782,
"step": 9506
},
{
"epoch": 0.6650018336447618,
"grad_norm": 0.4375,
"learning_rate": 0.001,
"loss": 1.2687,
"step": 9520
},
{
"epoch": 0.6659797775177689,
"grad_norm": 0.37109375,
"learning_rate": 0.001,
"loss": 1.2778,
"step": 9534
},
{
"epoch": 0.6669577213907759,
"grad_norm": 0.2490234375,
"learning_rate": 0.001,
"loss": 1.2544,
"step": 9548
},
{
"epoch": 0.6679356652637829,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.246,
"step": 9562
},
{
"epoch": 0.6689136091367899,
"grad_norm": 0.287109375,
"learning_rate": 0.001,
"loss": 1.2577,
"step": 9576
},
{
"epoch": 0.6698915530097969,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.2639,
"step": 9590
},
{
"epoch": 0.670869496882804,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2493,
"step": 9604
},
{
"epoch": 0.6718474407558109,
"grad_norm": 0.2119140625,
"learning_rate": 0.001,
"loss": 1.2586,
"step": 9618
},
{
"epoch": 0.6728253846288179,
"grad_norm": 0.240234375,
"learning_rate": 0.001,
"loss": 1.2719,
"step": 9632
},
{
"epoch": 0.6738033285018249,
"grad_norm": 0.294921875,
"learning_rate": 0.001,
"loss": 1.2272,
"step": 9646
},
{
"epoch": 0.6747812723748319,
"grad_norm": 0.423828125,
"learning_rate": 0.001,
"loss": 1.2667,
"step": 9660
},
{
"epoch": 0.675759216247839,
"grad_norm": 0.30859375,
"learning_rate": 0.001,
"loss": 1.2567,
"step": 9674
},
{
"epoch": 0.676737160120846,
"grad_norm": 0.21875,
"learning_rate": 0.001,
"loss": 1.2403,
"step": 9688
},
{
"epoch": 0.6777151039938529,
"grad_norm": 0.2490234375,
"learning_rate": 0.001,
"loss": 1.2642,
"step": 9702
},
{
"epoch": 0.6786930478668599,
"grad_norm": 0.248046875,
"learning_rate": 0.001,
"loss": 1.2123,
"step": 9716
},
{
"epoch": 0.6796709917398669,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2413,
"step": 9730
},
{
"epoch": 0.680648935612874,
"grad_norm": 0.259765625,
"learning_rate": 0.001,
"loss": 1.2442,
"step": 9744
},
{
"epoch": 0.681626879485881,
"grad_norm": 0.271484375,
"learning_rate": 0.001,
"loss": 1.2298,
"step": 9758
},
{
"epoch": 0.682604823358888,
"grad_norm": 0.232421875,
"learning_rate": 0.001,
"loss": 1.2421,
"step": 9772
},
{
"epoch": 0.6835827672318949,
"grad_norm": 0.302734375,
"learning_rate": 0.001,
"loss": 1.2725,
"step": 9786
},
{
"epoch": 0.6845607111049019,
"grad_norm": 0.267578125,
"learning_rate": 0.001,
"loss": 1.2417,
"step": 9800
},
{
"epoch": 0.685538654977909,
"grad_norm": 0.232421875,
"learning_rate": 0.001,
"loss": 1.2526,
"step": 9814
},
{
"epoch": 0.686516598850916,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.2352,
"step": 9828
},
{
"epoch": 0.687494542723923,
"grad_norm": 0.361328125,
"learning_rate": 0.001,
"loss": 1.2653,
"step": 9842
},
{
"epoch": 0.68847248659693,
"grad_norm": 0.328125,
"learning_rate": 0.001,
"loss": 1.2569,
"step": 9856
},
{
"epoch": 0.6894504304699369,
"grad_norm": 0.26171875,
"learning_rate": 0.001,
"loss": 1.248,
"step": 9870
},
{
"epoch": 0.6904283743429439,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.2864,
"step": 9884
},
{
"epoch": 0.691406318215951,
"grad_norm": 0.345703125,
"learning_rate": 0.001,
"loss": 1.2663,
"step": 9898
},
{
"epoch": 0.692384262088958,
"grad_norm": 0.36328125,
"learning_rate": 0.001,
"loss": 1.2534,
"step": 9912
},
{
"epoch": 0.693362205961965,
"grad_norm": 0.333984375,
"learning_rate": 0.001,
"loss": 1.2439,
"step": 9926
},
{
"epoch": 0.6943401498349719,
"grad_norm": 0.298828125,
"learning_rate": 0.001,
"loss": 1.2592,
"step": 9940
},
{
"epoch": 0.6953180937079789,
"grad_norm": 0.3046875,
"learning_rate": 0.001,
"loss": 1.2655,
"step": 9954
},
{
"epoch": 0.696296037580986,
"grad_norm": 0.265625,
"learning_rate": 0.001,
"loss": 1.2569,
"step": 9968
},
{
"epoch": 0.697273981453993,
"grad_norm": 0.376953125,
"learning_rate": 0.001,
"loss": 1.2663,
"step": 9982
},
{
"epoch": 0.698251925327,
"grad_norm": 0.291015625,
"learning_rate": 0.001,
"loss": 1.2419,
"step": 9996
},
{
"epoch": 0.699229869200007,
"grad_norm": 0.251953125,
"learning_rate": 0.001,
"loss": 1.2406,
"step": 10010
},
{
"epoch": 0.6997188411365105,
"eval_loss": 1.6404287815093994,
"eval_runtime": 9.1224,
"eval_samples_per_second": 109.621,
"eval_steps_per_second": 1.425,
"step": 10017
}
],
"logging_steps": 14,
"max_steps": 14315,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1431,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.66909141699448e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}