| { |
| "best_global_step": 6156, |
| "best_metric": 1.9586824178695679, |
| "best_model_checkpoint": "saves/prompt-tuning/llama-3-8b-instruct/train_stsb_1752763924/checkpoint-6156", |
| "epoch": 10.0, |
| "eval_steps": 324, |
| "global_step": 6470, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0077279752704791345, |
| "grad_norm": 0.7005897760391235, |
| "learning_rate": 3.0911901081916536e-07, |
| "loss": 8.2653, |
| "num_input_tokens_seen": 3904, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.015455950540958269, |
| "grad_norm": 0.6296595335006714, |
| "learning_rate": 6.955177743431221e-07, |
| "loss": 8.431, |
| "num_input_tokens_seen": 7296, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.023183925811437404, |
| "grad_norm": 0.6692777276039124, |
| "learning_rate": 1.0819165378670788e-06, |
| "loss": 8.5446, |
| "num_input_tokens_seen": 11136, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.030911901081916538, |
| "grad_norm": 0.6222425699234009, |
| "learning_rate": 1.4683153013910356e-06, |
| "loss": 8.4578, |
| "num_input_tokens_seen": 15040, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.03863987635239567, |
| "grad_norm": 0.6727356910705566, |
| "learning_rate": 1.8547140649149923e-06, |
| "loss": 8.2819, |
| "num_input_tokens_seen": 18496, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.04636785162287481, |
| "grad_norm": 0.5868141651153564, |
| "learning_rate": 2.241112828438949e-06, |
| "loss": 8.2987, |
| "num_input_tokens_seen": 21824, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05409582689335394, |
| "grad_norm": 1.1792558431625366, |
| "learning_rate": 2.627511591962906e-06, |
| "loss": 8.4134, |
| "num_input_tokens_seen": 25792, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.061823802163833076, |
| "grad_norm": 0.7558006048202515, |
| "learning_rate": 3.0139103554868627e-06, |
| "loss": 8.4503, |
| "num_input_tokens_seen": 29632, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0695517774343122, |
| "grad_norm": 1.1656103134155273, |
| "learning_rate": 3.4003091190108196e-06, |
| "loss": 8.4214, |
| "num_input_tokens_seen": 33536, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.07727975270479134, |
| "grad_norm": 0.6609169840812683, |
| "learning_rate": 3.7867078825347765e-06, |
| "loss": 8.5704, |
| "num_input_tokens_seen": 37056, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08500772797527048, |
| "grad_norm": 0.6289478540420532, |
| "learning_rate": 4.173106646058733e-06, |
| "loss": 8.4276, |
| "num_input_tokens_seen": 40384, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.09273570324574962, |
| "grad_norm": 0.7771320939064026, |
| "learning_rate": 4.559505409582689e-06, |
| "loss": 8.0075, |
| "num_input_tokens_seen": 44032, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.10046367851622875, |
| "grad_norm": 0.5663495659828186, |
| "learning_rate": 4.945904173106646e-06, |
| "loss": 8.4612, |
| "num_input_tokens_seen": 47552, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.10819165378670788, |
| "grad_norm": 0.7151011228561401, |
| "learning_rate": 5.332302936630603e-06, |
| "loss": 8.4386, |
| "num_input_tokens_seen": 51520, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.11591962905718702, |
| "grad_norm": 0.4895608425140381, |
| "learning_rate": 5.71870170015456e-06, |
| "loss": 8.5596, |
| "num_input_tokens_seen": 55360, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.12364760432766615, |
| "grad_norm": 0.4930502772331238, |
| "learning_rate": 6.1051004636785165e-06, |
| "loss": 8.4408, |
| "num_input_tokens_seen": 58944, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.13137557959814528, |
| "grad_norm": 0.619640052318573, |
| "learning_rate": 6.491499227202473e-06, |
| "loss": 8.4171, |
| "num_input_tokens_seen": 62336, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.1391035548686244, |
| "grad_norm": 0.5968901515007019, |
| "learning_rate": 6.87789799072643e-06, |
| "loss": 8.4728, |
| "num_input_tokens_seen": 66176, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.14683153013910355, |
| "grad_norm": 0.5749031901359558, |
| "learning_rate": 7.264296754250387e-06, |
| "loss": 8.4344, |
| "num_input_tokens_seen": 69888, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.1545595054095827, |
| "grad_norm": 0.7264779806137085, |
| "learning_rate": 7.650695517774343e-06, |
| "loss": 8.2166, |
| "num_input_tokens_seen": 73600, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16228748068006182, |
| "grad_norm": 2.099539041519165, |
| "learning_rate": 8.0370942812983e-06, |
| "loss": 8.3009, |
| "num_input_tokens_seen": 77568, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.17001545595054096, |
| "grad_norm": 0.7334800362586975, |
| "learning_rate": 8.423493044822257e-06, |
| "loss": 8.1876, |
| "num_input_tokens_seen": 80768, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1777434312210201, |
| "grad_norm": 0.6308206915855408, |
| "learning_rate": 8.809891808346214e-06, |
| "loss": 8.2654, |
| "num_input_tokens_seen": 84288, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.18547140649149924, |
| "grad_norm": 0.5811961889266968, |
| "learning_rate": 9.19629057187017e-06, |
| "loss": 8.1847, |
| "num_input_tokens_seen": 87424, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.19319938176197837, |
| "grad_norm": 0.5790714025497437, |
| "learning_rate": 9.582689335394126e-06, |
| "loss": 8.1837, |
| "num_input_tokens_seen": 91584, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2009273570324575, |
| "grad_norm": 0.6126049757003784, |
| "learning_rate": 9.969088098918083e-06, |
| "loss": 8.2103, |
| "num_input_tokens_seen": 95616, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.20865533230293662, |
| "grad_norm": 0.5547794699668884, |
| "learning_rate": 1.035548686244204e-05, |
| "loss": 8.2972, |
| "num_input_tokens_seen": 99136, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.21638330757341576, |
| "grad_norm": 0.6736480593681335, |
| "learning_rate": 1.0741885625965996e-05, |
| "loss": 8.2168, |
| "num_input_tokens_seen": 102336, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.2241112828438949, |
| "grad_norm": 0.8961809873580933, |
| "learning_rate": 1.1128284389489953e-05, |
| "loss": 8.2389, |
| "num_input_tokens_seen": 106560, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.23183925811437403, |
| "grad_norm": 0.5742205381393433, |
| "learning_rate": 1.151468315301391e-05, |
| "loss": 8.029, |
| "num_input_tokens_seen": 110528, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.23956723338485317, |
| "grad_norm": 0.9602662324905396, |
| "learning_rate": 1.1901081916537867e-05, |
| "loss": 8.3119, |
| "num_input_tokens_seen": 114048, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.2472952086553323, |
| "grad_norm": 0.6280139684677124, |
| "learning_rate": 1.2287480680061824e-05, |
| "loss": 8.254, |
| "num_input_tokens_seen": 118336, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2550231839258114, |
| "grad_norm": 0.7177520990371704, |
| "learning_rate": 1.2673879443585781e-05, |
| "loss": 8.5888, |
| "num_input_tokens_seen": 122304, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.26275115919629055, |
| "grad_norm": 0.546525776386261, |
| "learning_rate": 1.3060278207109738e-05, |
| "loss": 8.2285, |
| "num_input_tokens_seen": 126080, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2704791344667697, |
| "grad_norm": 0.4941937029361725, |
| "learning_rate": 1.3446676970633695e-05, |
| "loss": 8.4309, |
| "num_input_tokens_seen": 129920, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.2782071097372488, |
| "grad_norm": 0.5484105944633484, |
| "learning_rate": 1.3833075734157651e-05, |
| "loss": 8.194, |
| "num_input_tokens_seen": 133696, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.28593508500772796, |
| "grad_norm": 0.7775905132293701, |
| "learning_rate": 1.4219474497681608e-05, |
| "loss": 8.4067, |
| "num_input_tokens_seen": 137664, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.2936630602782071, |
| "grad_norm": 0.6176138520240784, |
| "learning_rate": 1.4605873261205565e-05, |
| "loss": 8.2687, |
| "num_input_tokens_seen": 141184, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.30139103554868624, |
| "grad_norm": 0.6117033958435059, |
| "learning_rate": 1.4992272024729522e-05, |
| "loss": 8.2188, |
| "num_input_tokens_seen": 144768, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.3091190108191654, |
| "grad_norm": 0.6097720265388489, |
| "learning_rate": 1.5378670788253476e-05, |
| "loss": 8.3016, |
| "num_input_tokens_seen": 148032, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3168469860896445, |
| "grad_norm": 0.5379249453544617, |
| "learning_rate": 1.5765069551777432e-05, |
| "loss": 8.0923, |
| "num_input_tokens_seen": 152000, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.32457496136012365, |
| "grad_norm": 0.7796866297721863, |
| "learning_rate": 1.615146831530139e-05, |
| "loss": 8.2276, |
| "num_input_tokens_seen": 156096, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3323029366306028, |
| "grad_norm": 0.4686180651187897, |
| "learning_rate": 1.6537867078825346e-05, |
| "loss": 8.251, |
| "num_input_tokens_seen": 160128, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.3400309119010819, |
| "grad_norm": 0.6390528678894043, |
| "learning_rate": 1.6924265842349303e-05, |
| "loss": 8.2704, |
| "num_input_tokens_seen": 163712, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.34775888717156106, |
| "grad_norm": 0.7915067076683044, |
| "learning_rate": 1.731066460587326e-05, |
| "loss": 7.7831, |
| "num_input_tokens_seen": 167424, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.3554868624420402, |
| "grad_norm": 0.7012531161308289, |
| "learning_rate": 1.7697063369397217e-05, |
| "loss": 8.1578, |
| "num_input_tokens_seen": 170816, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.36321483771251933, |
| "grad_norm": 0.5768135190010071, |
| "learning_rate": 1.8083462132921174e-05, |
| "loss": 7.7408, |
| "num_input_tokens_seen": 174528, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.37094281298299847, |
| "grad_norm": 0.5718309283256531, |
| "learning_rate": 1.846986089644513e-05, |
| "loss": 7.9793, |
| "num_input_tokens_seen": 178432, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3786707882534776, |
| "grad_norm": 0.5248563885688782, |
| "learning_rate": 1.8856259659969088e-05, |
| "loss": 7.256, |
| "num_input_tokens_seen": 181888, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.38639876352395675, |
| "grad_norm": 0.565098226070404, |
| "learning_rate": 1.9242658423493044e-05, |
| "loss": 8.3203, |
| "num_input_tokens_seen": 185984, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.3941267387944359, |
| "grad_norm": 0.46190837025642395, |
| "learning_rate": 1.9629057187017e-05, |
| "loss": 7.9005, |
| "num_input_tokens_seen": 189504, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.401854714064915, |
| "grad_norm": 0.45425835251808167, |
| "learning_rate": 2.0015455950540958e-05, |
| "loss": 8.028, |
| "num_input_tokens_seen": 193408, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4095826893353941, |
| "grad_norm": 0.6547238230705261, |
| "learning_rate": 2.0401854714064915e-05, |
| "loss": 7.8846, |
| "num_input_tokens_seen": 196672, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.41731066460587324, |
| "grad_norm": 0.4780774712562561, |
| "learning_rate": 2.0788253477588872e-05, |
| "loss": 8.1814, |
| "num_input_tokens_seen": 200960, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4250386398763524, |
| "grad_norm": 0.5580217242240906, |
| "learning_rate": 2.117465224111283e-05, |
| "loss": 7.7065, |
| "num_input_tokens_seen": 204352, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.4327666151468315, |
| "grad_norm": 0.4923495650291443, |
| "learning_rate": 2.1561051004636786e-05, |
| "loss": 8.0245, |
| "num_input_tokens_seen": 208000, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.44049459041731065, |
| "grad_norm": 0.6452226042747498, |
| "learning_rate": 2.1947449768160743e-05, |
| "loss": 7.8578, |
| "num_input_tokens_seen": 211584, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.4482225656877898, |
| "grad_norm": 0.7374160289764404, |
| "learning_rate": 2.23338485316847e-05, |
| "loss": 7.7108, |
| "num_input_tokens_seen": 215680, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4559505409582689, |
| "grad_norm": 0.5521262288093567, |
| "learning_rate": 2.2720247295208656e-05, |
| "loss": 7.7653, |
| "num_input_tokens_seen": 219328, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.46367851622874806, |
| "grad_norm": 0.5362392067909241, |
| "learning_rate": 2.3106646058732613e-05, |
| "loss": 7.6045, |
| "num_input_tokens_seen": 223232, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4714064914992272, |
| "grad_norm": 0.5953294634819031, |
| "learning_rate": 2.349304482225657e-05, |
| "loss": 7.6481, |
| "num_input_tokens_seen": 226816, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.47913446676970634, |
| "grad_norm": 0.7014301419258118, |
| "learning_rate": 2.3879443585780527e-05, |
| "loss": 7.7099, |
| "num_input_tokens_seen": 230464, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.4868624420401855, |
| "grad_norm": 0.510015070438385, |
| "learning_rate": 2.4265842349304484e-05, |
| "loss": 7.5586, |
| "num_input_tokens_seen": 234368, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.4945904173106646, |
| "grad_norm": 0.5965865254402161, |
| "learning_rate": 2.465224111282844e-05, |
| "loss": 7.6904, |
| "num_input_tokens_seen": 238464, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.500772797527048, |
| "eval_loss": 7.569452285766602, |
| "eval_runtime": 9.8312, |
| "eval_samples_per_second": 58.487, |
| "eval_steps_per_second": 7.324, |
| "num_input_tokens_seen": 241664, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.5023183925811437, |
| "grad_norm": 0.49373796582221985, |
| "learning_rate": 2.5038639876352398e-05, |
| "loss": 7.495, |
| "num_input_tokens_seen": 242304, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.5100463678516228, |
| "grad_norm": 0.4991404116153717, |
| "learning_rate": 2.5425038639876354e-05, |
| "loss": 7.3584, |
| "num_input_tokens_seen": 246080, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.517774343122102, |
| "grad_norm": 0.5048945546150208, |
| "learning_rate": 2.581143740340031e-05, |
| "loss": 8.1246, |
| "num_input_tokens_seen": 250304, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.5255023183925811, |
| "grad_norm": 0.5030365586280823, |
| "learning_rate": 2.6197836166924268e-05, |
| "loss": 7.56, |
| "num_input_tokens_seen": 253504, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5332302936630603, |
| "grad_norm": 0.47446444630622864, |
| "learning_rate": 2.6584234930448225e-05, |
| "loss": 7.0644, |
| "num_input_tokens_seen": 257088, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.5409582689335394, |
| "grad_norm": 0.5291785001754761, |
| "learning_rate": 2.6970633693972182e-05, |
| "loss": 7.4655, |
| "num_input_tokens_seen": 261440, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5486862442040186, |
| "grad_norm": 0.46279534697532654, |
| "learning_rate": 2.735703245749614e-05, |
| "loss": 7.4421, |
| "num_input_tokens_seen": 265280, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.5564142194744977, |
| "grad_norm": 0.6287479400634766, |
| "learning_rate": 2.7743431221020096e-05, |
| "loss": 7.6079, |
| "num_input_tokens_seen": 268928, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5641421947449768, |
| "grad_norm": 0.5496251583099365, |
| "learning_rate": 2.812982998454405e-05, |
| "loss": 7.3586, |
| "num_input_tokens_seen": 272704, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.5718701700154559, |
| "grad_norm": 0.5213049650192261, |
| "learning_rate": 2.851622874806801e-05, |
| "loss": 7.2783, |
| "num_input_tokens_seen": 276992, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5795981452859351, |
| "grad_norm": 0.487604558467865, |
| "learning_rate": 2.8902627511591963e-05, |
| "loss": 7.483, |
| "num_input_tokens_seen": 280640, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.5873261205564142, |
| "grad_norm": 0.5995193123817444, |
| "learning_rate": 2.9289026275115923e-05, |
| "loss": 7.3264, |
| "num_input_tokens_seen": 284608, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5950540958268934, |
| "grad_norm": 0.5379067063331604, |
| "learning_rate": 2.9675425038639877e-05, |
| "loss": 7.362, |
| "num_input_tokens_seen": 288640, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.6027820710973725, |
| "grad_norm": 0.5467705130577087, |
| "learning_rate": 3.0061823802163837e-05, |
| "loss": 7.5526, |
| "num_input_tokens_seen": 292416, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6105100463678517, |
| "grad_norm": 0.4332791566848755, |
| "learning_rate": 3.044822256568779e-05, |
| "loss": 7.6352, |
| "num_input_tokens_seen": 296000, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.6182380216383307, |
| "grad_norm": 0.5871270895004272, |
| "learning_rate": 3.083462132921175e-05, |
| "loss": 7.2932, |
| "num_input_tokens_seen": 299712, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6259659969088099, |
| "grad_norm": 0.4454021751880646, |
| "learning_rate": 3.12210200927357e-05, |
| "loss": 7.6205, |
| "num_input_tokens_seen": 303360, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.633693972179289, |
| "grad_norm": 0.5419654250144958, |
| "learning_rate": 3.1607418856259664e-05, |
| "loss": 7.3044, |
| "num_input_tokens_seen": 306752, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6414219474497682, |
| "grad_norm": 0.4751136302947998, |
| "learning_rate": 3.1993817619783615e-05, |
| "loss": 7.3805, |
| "num_input_tokens_seen": 310592, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.6491499227202473, |
| "grad_norm": 0.5459133386611938, |
| "learning_rate": 3.238021638330758e-05, |
| "loss": 7.5579, |
| "num_input_tokens_seen": 314688, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6568778979907264, |
| "grad_norm": 0.45621687173843384, |
| "learning_rate": 3.276661514683153e-05, |
| "loss": 7.2994, |
| "num_input_tokens_seen": 318592, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.6646058732612056, |
| "grad_norm": 0.6815227270126343, |
| "learning_rate": 3.315301391035549e-05, |
| "loss": 6.8075, |
| "num_input_tokens_seen": 322368, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6723338485316847, |
| "grad_norm": 0.46193644404411316, |
| "learning_rate": 3.353941267387944e-05, |
| "loss": 6.5451, |
| "num_input_tokens_seen": 325824, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.6800618238021638, |
| "grad_norm": 0.6927512884140015, |
| "learning_rate": 3.3925811437403406e-05, |
| "loss": 6.85, |
| "num_input_tokens_seen": 329216, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6877897990726429, |
| "grad_norm": 0.5209140181541443, |
| "learning_rate": 3.4312210200927356e-05, |
| "loss": 6.8214, |
| "num_input_tokens_seen": 332992, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.6955177743431221, |
| "grad_norm": 0.4288826882839203, |
| "learning_rate": 3.469860896445132e-05, |
| "loss": 7.0542, |
| "num_input_tokens_seen": 336640, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7032457496136012, |
| "grad_norm": 0.5550535917282104, |
| "learning_rate": 3.508500772797527e-05, |
| "loss": 7.034, |
| "num_input_tokens_seen": 340352, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.7109737248840804, |
| "grad_norm": 0.4508901536464691, |
| "learning_rate": 3.547140649149923e-05, |
| "loss": 7.4092, |
| "num_input_tokens_seen": 344128, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7187017001545595, |
| "grad_norm": 0.5066952109336853, |
| "learning_rate": 3.585780525502318e-05, |
| "loss": 7.0113, |
| "num_input_tokens_seen": 347584, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.7264296754250387, |
| "grad_norm": 0.42304113507270813, |
| "learning_rate": 3.624420401854714e-05, |
| "loss": 7.1517, |
| "num_input_tokens_seen": 352000, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7341576506955177, |
| "grad_norm": 0.4524816572666168, |
| "learning_rate": 3.66306027820711e-05, |
| "loss": 7.1342, |
| "num_input_tokens_seen": 356032, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.7418856259659969, |
| "grad_norm": 0.46454742550849915, |
| "learning_rate": 3.7017001545595054e-05, |
| "loss": 7.1964, |
| "num_input_tokens_seen": 359552, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.749613601236476, |
| "grad_norm": 0.4011656939983368, |
| "learning_rate": 3.740340030911901e-05, |
| "loss": 6.6278, |
| "num_input_tokens_seen": 363776, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.7573415765069552, |
| "grad_norm": 0.5052220225334167, |
| "learning_rate": 3.778979907264297e-05, |
| "loss": 6.7523, |
| "num_input_tokens_seen": 367104, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7650695517774343, |
| "grad_norm": 0.5182458162307739, |
| "learning_rate": 3.8176197836166925e-05, |
| "loss": 6.7254, |
| "num_input_tokens_seen": 370688, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.7727975270479135, |
| "grad_norm": 0.4116693437099457, |
| "learning_rate": 3.856259659969088e-05, |
| "loss": 7.038, |
| "num_input_tokens_seen": 374272, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7805255023183926, |
| "grad_norm": 0.5274565815925598, |
| "learning_rate": 3.894899536321484e-05, |
| "loss": 6.8012, |
| "num_input_tokens_seen": 377792, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.7882534775888718, |
| "grad_norm": 0.5249013900756836, |
| "learning_rate": 3.9335394126738795e-05, |
| "loss": 6.4372, |
| "num_input_tokens_seen": 381312, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7959814528593508, |
| "grad_norm": 0.4378226697444916, |
| "learning_rate": 3.972179289026275e-05, |
| "loss": 6.7899, |
| "num_input_tokens_seen": 384896, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.80370942812983, |
| "grad_norm": 0.4792448878288269, |
| "learning_rate": 4.010819165378671e-05, |
| "loss": 6.7719, |
| "num_input_tokens_seen": 388672, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8114374034003091, |
| "grad_norm": 0.47215649485588074, |
| "learning_rate": 4.0494590417310666e-05, |
| "loss": 6.5768, |
| "num_input_tokens_seen": 392640, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.8191653786707882, |
| "grad_norm": 0.4052186906337738, |
| "learning_rate": 4.088098918083462e-05, |
| "loss": 6.4603, |
| "num_input_tokens_seen": 396032, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8268933539412674, |
| "grad_norm": 0.40831634402275085, |
| "learning_rate": 4.126738794435858e-05, |
| "loss": 6.9047, |
| "num_input_tokens_seen": 400384, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.8346213292117465, |
| "grad_norm": 0.41408637166023254, |
| "learning_rate": 4.1653786707882537e-05, |
| "loss": 6.929, |
| "num_input_tokens_seen": 404352, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8423493044822257, |
| "grad_norm": 0.3988915681838989, |
| "learning_rate": 4.2040185471406493e-05, |
| "loss": 6.2877, |
| "num_input_tokens_seen": 408000, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.8500772797527048, |
| "grad_norm": 0.3835439682006836, |
| "learning_rate": 4.242658423493045e-05, |
| "loss": 6.6326, |
| "num_input_tokens_seen": 412032, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8578052550231839, |
| "grad_norm": 0.42893922328948975, |
| "learning_rate": 4.281298299845441e-05, |
| "loss": 6.3484, |
| "num_input_tokens_seen": 415360, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.865533230293663, |
| "grad_norm": 0.41601845622062683, |
| "learning_rate": 4.3199381761978364e-05, |
| "loss": 6.6927, |
| "num_input_tokens_seen": 419264, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8732612055641422, |
| "grad_norm": 0.438260942697525, |
| "learning_rate": 4.358578052550232e-05, |
| "loss": 6.5952, |
| "num_input_tokens_seen": 422784, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.8809891808346213, |
| "grad_norm": 0.3734167516231537, |
| "learning_rate": 4.397217928902628e-05, |
| "loss": 6.2921, |
| "num_input_tokens_seen": 427008, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8887171561051005, |
| "grad_norm": 0.4252164661884308, |
| "learning_rate": 4.4358578052550235e-05, |
| "loss": 7.1411, |
| "num_input_tokens_seen": 431104, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.8964451313755796, |
| "grad_norm": 0.38181161880493164, |
| "learning_rate": 4.474497681607419e-05, |
| "loss": 6.791, |
| "num_input_tokens_seen": 435072, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9041731066460588, |
| "grad_norm": 0.4132893681526184, |
| "learning_rate": 4.513137557959815e-05, |
| "loss": 6.3356, |
| "num_input_tokens_seen": 439040, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.9119010819165378, |
| "grad_norm": 0.3585706353187561, |
| "learning_rate": 4.5517774343122105e-05, |
| "loss": 6.1658, |
| "num_input_tokens_seen": 442560, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.919629057187017, |
| "grad_norm": 0.3809860348701477, |
| "learning_rate": 4.590417310664606e-05, |
| "loss": 6.358, |
| "num_input_tokens_seen": 446528, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.9273570324574961, |
| "grad_norm": 0.3630789518356323, |
| "learning_rate": 4.629057187017002e-05, |
| "loss": 6.3932, |
| "num_input_tokens_seen": 450176, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9350850077279753, |
| "grad_norm": 0.3790375590324402, |
| "learning_rate": 4.6676970633693976e-05, |
| "loss": 6.4598, |
| "num_input_tokens_seen": 453952, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.9428129829984544, |
| "grad_norm": 0.36716851592063904, |
| "learning_rate": 4.706336939721793e-05, |
| "loss": 6.1113, |
| "num_input_tokens_seen": 457536, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9505409582689336, |
| "grad_norm": 0.362441748380661, |
| "learning_rate": 4.744976816074189e-05, |
| "loss": 6.3924, |
| "num_input_tokens_seen": 461120, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.9582689335394127, |
| "grad_norm": 0.3889773190021515, |
| "learning_rate": 4.7836166924265847e-05, |
| "loss": 6.4084, |
| "num_input_tokens_seen": 464960, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9659969088098919, |
| "grad_norm": 0.45412859320640564, |
| "learning_rate": 4.8222565687789803e-05, |
| "loss": 6.3847, |
| "num_input_tokens_seen": 468736, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.973724884080371, |
| "grad_norm": 0.378328800201416, |
| "learning_rate": 4.860896445131376e-05, |
| "loss": 6.0302, |
| "num_input_tokens_seen": 472768, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.98145285935085, |
| "grad_norm": 0.37784719467163086, |
| "learning_rate": 4.899536321483772e-05, |
| "loss": 6.0604, |
| "num_input_tokens_seen": 476544, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.9891808346213292, |
| "grad_norm": 0.3870941400527954, |
| "learning_rate": 4.9381761978361674e-05, |
| "loss": 5.923, |
| "num_input_tokens_seen": 480384, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.9969088098918083, |
| "grad_norm": 0.3700666129589081, |
| "learning_rate": 4.976816074188563e-05, |
| "loss": 5.955, |
| "num_input_tokens_seen": 483520, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.001545595054096, |
| "eval_loss": 6.029697418212891, |
| "eval_runtime": 9.8501, |
| "eval_samples_per_second": 58.375, |
| "eval_steps_per_second": 7.31, |
| "num_input_tokens_seen": 485616, |
| "step": 648 |
| }, |
| { |
| "epoch": 1.0046367851622875, |
| "grad_norm": 0.4179936647415161, |
| "learning_rate": 4.999998544620922e-05, |
| "loss": 6.3116, |
| "num_input_tokens_seen": 487472, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.0123647604327666, |
| "grad_norm": 0.40085089206695557, |
| "learning_rate": 4.999982171625755e-05, |
| "loss": 5.9363, |
| "num_input_tokens_seen": 491504, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.0200927357032457, |
| "grad_norm": 0.3790755271911621, |
| "learning_rate": 4.999947606531115e-05, |
| "loss": 6.368, |
| "num_input_tokens_seen": 495152, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.027820710973725, |
| "grad_norm": 0.4374313950538635, |
| "learning_rate": 4.999894849588528e-05, |
| "loss": 5.6627, |
| "num_input_tokens_seen": 498672, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.035548686244204, |
| "grad_norm": 0.3703961968421936, |
| "learning_rate": 4.9998239011819015e-05, |
| "loss": 6.3572, |
| "num_input_tokens_seen": 502640, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.0432766615146831, |
| "grad_norm": 0.4290381669998169, |
| "learning_rate": 4.999734761827518e-05, |
| "loss": 5.8942, |
| "num_input_tokens_seen": 506224, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.0510046367851622, |
| "grad_norm": 0.4251026511192322, |
| "learning_rate": 4.9996274321740366e-05, |
| "loss": 5.6945, |
| "num_input_tokens_seen": 509936, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.0587326120556415, |
| "grad_norm": 0.44041091203689575, |
| "learning_rate": 4.999501913002482e-05, |
| "loss": 6.2215, |
| "num_input_tokens_seen": 513968, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.0664605873261206, |
| "grad_norm": 0.3784466087818146, |
| "learning_rate": 4.999358205226245e-05, |
| "loss": 5.9134, |
| "num_input_tokens_seen": 517552, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.0741885625965997, |
| "grad_norm": 0.4258522391319275, |
| "learning_rate": 4.999196309891071e-05, |
| "loss": 5.528, |
| "num_input_tokens_seen": 520752, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.0819165378670788, |
| "grad_norm": 0.3941080868244171, |
| "learning_rate": 4.999016228175054e-05, |
| "loss": 5.5496, |
| "num_input_tokens_seen": 524208, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.089644513137558, |
| "grad_norm": 0.3999853730201721, |
| "learning_rate": 4.99881796138863e-05, |
| "loss": 5.775, |
| "num_input_tokens_seen": 527856, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.0973724884080371, |
| "grad_norm": 0.43056732416152954, |
| "learning_rate": 4.998601510974565e-05, |
| "loss": 5.7852, |
| "num_input_tokens_seen": 532080, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.1051004636785162, |
| "grad_norm": 0.36167922616004944, |
| "learning_rate": 4.998366878507945e-05, |
| "loss": 5.8906, |
| "num_input_tokens_seen": 535856, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.1128284389489953, |
| "grad_norm": 0.4512415826320648, |
| "learning_rate": 4.9981140656961645e-05, |
| "loss": 5.7887, |
| "num_input_tokens_seen": 539696, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.1205564142194744, |
| "grad_norm": 0.33006787300109863, |
| "learning_rate": 4.997843074378916e-05, |
| "loss": 5.9003, |
| "num_input_tokens_seen": 543408, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.1282843894899537, |
| "grad_norm": 0.3509158790111542, |
| "learning_rate": 4.9975539065281733e-05, |
| "loss": 5.78, |
| "num_input_tokens_seen": 547248, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.1360123647604328, |
| "grad_norm": 0.3338649570941925, |
| "learning_rate": 4.9972465642481796e-05, |
| "loss": 5.7212, |
| "num_input_tokens_seen": 550384, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.1437403400309119, |
| "grad_norm": 0.4186403751373291, |
| "learning_rate": 4.9969210497754314e-05, |
| "loss": 5.3607, |
| "num_input_tokens_seen": 554288, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.1514683153013912, |
| "grad_norm": 0.4077812731266022, |
| "learning_rate": 4.996577365478663e-05, |
| "loss": 5.6737, |
| "num_input_tokens_seen": 558128, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.1591962905718702, |
| "grad_norm": 0.39160779118537903, |
| "learning_rate": 4.996215513858826e-05, |
| "loss": 5.6733, |
| "num_input_tokens_seen": 561584, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.1669242658423493, |
| "grad_norm": 0.42368146777153015, |
| "learning_rate": 4.995835497549077e-05, |
| "loss": 5.4552, |
| "num_input_tokens_seen": 565296, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.1746522411128284, |
| "grad_norm": 0.35899245738983154, |
| "learning_rate": 4.995437319314753e-05, |
| "loss": 5.3818, |
| "num_input_tokens_seen": 569008, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.1823802163833075, |
| "grad_norm": 0.40432843565940857, |
| "learning_rate": 4.995020982053354e-05, |
| "loss": 5.4115, |
| "num_input_tokens_seen": 572656, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.1901081916537868, |
| "grad_norm": 0.34939906001091003, |
| "learning_rate": 4.9945864887945215e-05, |
| "loss": 5.6154, |
| "num_input_tokens_seen": 576496, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.1978361669242659, |
| "grad_norm": 0.40631914138793945, |
| "learning_rate": 4.994133842700015e-05, |
| "loss": 5.5658, |
| "num_input_tokens_seen": 580208, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.205564142194745, |
| "grad_norm": 0.38632699847221375, |
| "learning_rate": 4.993663047063692e-05, |
| "loss": 5.421, |
| "num_input_tokens_seen": 584240, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.213292117465224, |
| "grad_norm": 0.3667367994785309, |
| "learning_rate": 4.993174105311481e-05, |
| "loss": 5.2719, |
| "num_input_tokens_seen": 588400, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.2210200927357033, |
| "grad_norm": 0.3893936574459076, |
| "learning_rate": 4.992667021001357e-05, |
| "loss": 5.7396, |
| "num_input_tokens_seen": 592176, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.2287480680061824, |
| "grad_norm": 0.44007450342178345, |
| "learning_rate": 4.99214179782332e-05, |
| "loss": 5.3169, |
| "num_input_tokens_seen": 595632, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.2364760432766615, |
| "grad_norm": 0.32318371534347534, |
| "learning_rate": 4.9915984395993606e-05, |
| "loss": 5.2931, |
| "num_input_tokens_seen": 599152, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.2442040185471406, |
| "grad_norm": 0.3229334354400635, |
| "learning_rate": 4.991036950283438e-05, |
| "loss": 5.4225, |
| "num_input_tokens_seen": 602800, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.2519319938176197, |
| "grad_norm": 0.36364948749542236, |
| "learning_rate": 4.990457333961449e-05, |
| "loss": 5.3279, |
| "num_input_tokens_seen": 606128, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.259659969088099, |
| "grad_norm": 0.3748265206813812, |
| "learning_rate": 4.9898595948511984e-05, |
| "loss": 5.3406, |
| "num_input_tokens_seen": 609968, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.267387944358578, |
| "grad_norm": 0.35926467180252075, |
| "learning_rate": 4.9892437373023706e-05, |
| "loss": 5.5681, |
| "num_input_tokens_seen": 613808, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.2751159196290571, |
| "grad_norm": 0.3639248311519623, |
| "learning_rate": 4.988609765796492e-05, |
| "loss": 5.482, |
| "num_input_tokens_seen": 617456, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.2828438948995364, |
| "grad_norm": 0.4079735577106476, |
| "learning_rate": 4.9879576849469065e-05, |
| "loss": 5.103, |
| "num_input_tokens_seen": 621168, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.2905718701700155, |
| "grad_norm": 0.327030748128891, |
| "learning_rate": 4.9872874994987354e-05, |
| "loss": 4.6912, |
| "num_input_tokens_seen": 624624, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.2982998454404946, |
| "grad_norm": 0.336750328540802, |
| "learning_rate": 4.986599214328844e-05, |
| "loss": 5.288, |
| "num_input_tokens_seen": 628016, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3060278207109737, |
| "grad_norm": 0.5080354809761047, |
| "learning_rate": 4.985892834445811e-05, |
| "loss": 5.1205, |
| "num_input_tokens_seen": 631664, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.3137557959814528, |
| "grad_norm": 0.3950064778327942, |
| "learning_rate": 4.985168364989886e-05, |
| "loss": 5.1247, |
| "num_input_tokens_seen": 635824, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.321483771251932, |
| "grad_norm": 0.37459149956703186, |
| "learning_rate": 4.984425811232954e-05, |
| "loss": 5.5621, |
| "num_input_tokens_seen": 639536, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.3292117465224111, |
| "grad_norm": 0.3513096570968628, |
| "learning_rate": 4.983665178578498e-05, |
| "loss": 5.4441, |
| "num_input_tokens_seen": 642992, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.3369397217928902, |
| "grad_norm": 0.3584459125995636, |
| "learning_rate": 4.98288647256156e-05, |
| "loss": 4.4869, |
| "num_input_tokens_seen": 646576, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.3446676970633695, |
| "grad_norm": 0.33611801266670227, |
| "learning_rate": 4.9820896988487e-05, |
| "loss": 5.1894, |
| "num_input_tokens_seen": 650160, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.3523956723338486, |
| "grad_norm": 0.3570202589035034, |
| "learning_rate": 4.981274863237953e-05, |
| "loss": 4.8735, |
| "num_input_tokens_seen": 654256, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.3601236476043277, |
| "grad_norm": 0.4741190969944, |
| "learning_rate": 4.9804419716587894e-05, |
| "loss": 4.5883, |
| "num_input_tokens_seen": 658288, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.3678516228748068, |
| "grad_norm": 0.3228873312473297, |
| "learning_rate": 4.979591030172072e-05, |
| "loss": 5.5148, |
| "num_input_tokens_seen": 662192, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.3755795981452859, |
| "grad_norm": 0.3376190960407257, |
| "learning_rate": 4.978722044970009e-05, |
| "loss": 4.937, |
| "num_input_tokens_seen": 666096, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.383307573415765, |
| "grad_norm": 0.37344226241111755, |
| "learning_rate": 4.9778350223761115e-05, |
| "loss": 5.0469, |
| "num_input_tokens_seen": 670512, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.3910355486862442, |
| "grad_norm": 0.2931409478187561, |
| "learning_rate": 4.9769299688451475e-05, |
| "loss": 4.993, |
| "num_input_tokens_seen": 674480, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.3987635239567233, |
| "grad_norm": 0.43220609426498413, |
| "learning_rate": 4.976006890963093e-05, |
| "loss": 5.1053, |
| "num_input_tokens_seen": 677488, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.4064914992272024, |
| "grad_norm": 0.3765735328197479, |
| "learning_rate": 4.9750657954470864e-05, |
| "loss": 5.0701, |
| "num_input_tokens_seen": 681392, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.4142194744976817, |
| "grad_norm": 0.3108097314834595, |
| "learning_rate": 4.974106689145377e-05, |
| "loss": 5.1058, |
| "num_input_tokens_seen": 685232, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.4219474497681608, |
| "grad_norm": 0.36728906631469727, |
| "learning_rate": 4.973129579037278e-05, |
| "loss": 5.318, |
| "num_input_tokens_seen": 688944, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.4296754250386399, |
| "grad_norm": 0.35344597697257996, |
| "learning_rate": 4.972134472233113e-05, |
| "loss": 5.1818, |
| "num_input_tokens_seen": 692464, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.437403400309119, |
| "grad_norm": 0.37421491742134094, |
| "learning_rate": 4.971121375974168e-05, |
| "loss": 4.6604, |
| "num_input_tokens_seen": 696240, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.445131375579598, |
| "grad_norm": 0.3447347581386566, |
| "learning_rate": 4.970090297632633e-05, |
| "loss": 4.8528, |
| "num_input_tokens_seen": 699952, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.4528593508500773, |
| "grad_norm": 0.3421487808227539, |
| "learning_rate": 4.969041244711555e-05, |
| "loss": 4.8747, |
| "num_input_tokens_seen": 703664, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.4605873261205564, |
| "grad_norm": 0.354444295167923, |
| "learning_rate": 4.967974224844777e-05, |
| "loss": 5.0095, |
| "num_input_tokens_seen": 707696, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.4683153013910355, |
| "grad_norm": 0.36146560311317444, |
| "learning_rate": 4.966889245796888e-05, |
| "loss": 4.7609, |
| "num_input_tokens_seen": 711216, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.4760432766615148, |
| "grad_norm": 0.3156791031360626, |
| "learning_rate": 4.965786315463162e-05, |
| "loss": 4.7666, |
| "num_input_tokens_seen": 714992, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.4837712519319939, |
| "grad_norm": 0.33873096108436584, |
| "learning_rate": 4.9646654418695055e-05, |
| "loss": 4.833, |
| "num_input_tokens_seen": 718704, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.491499227202473, |
| "grad_norm": 0.4051862359046936, |
| "learning_rate": 4.963526633172392e-05, |
| "loss": 4.5196, |
| "num_input_tokens_seen": 722608, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.499227202472952, |
| "grad_norm": 0.32755422592163086, |
| "learning_rate": 4.9623698976588105e-05, |
| "loss": 4.6224, |
| "num_input_tokens_seen": 725744, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.5023183925811439, |
| "eval_loss": 4.67022705078125, |
| "eval_runtime": 9.8381, |
| "eval_samples_per_second": 58.446, |
| "eval_steps_per_second": 7.318, |
| "num_input_tokens_seen": 727280, |
| "step": 972 |
| }, |
| { |
| "epoch": 1.5069551777434311, |
| "grad_norm": 0.3122451901435852, |
| "learning_rate": 4.9611952437462e-05, |
| "loss": 4.6149, |
| "num_input_tokens_seen": 729456, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.5146831530139102, |
| "grad_norm": 0.3334190845489502, |
| "learning_rate": 4.960002679982389e-05, |
| "loss": 4.8694, |
| "num_input_tokens_seen": 733680, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.5224111282843895, |
| "grad_norm": 0.32536813616752625, |
| "learning_rate": 4.958792215045535e-05, |
| "loss": 4.904, |
| "num_input_tokens_seen": 737584, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.5301391035548686, |
| "grad_norm": 0.4649519622325897, |
| "learning_rate": 4.9575638577440606e-05, |
| "loss": 4.9842, |
| "num_input_tokens_seen": 741680, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.537867078825348, |
| "grad_norm": 0.31299111247062683, |
| "learning_rate": 4.956317617016589e-05, |
| "loss": 4.7217, |
| "num_input_tokens_seen": 745776, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.545595054095827, |
| "grad_norm": 0.3985936939716339, |
| "learning_rate": 4.955053501931878e-05, |
| "loss": 4.9462, |
| "num_input_tokens_seen": 749232, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.553323029366306, |
| "grad_norm": 0.598997175693512, |
| "learning_rate": 4.953771521688757e-05, |
| "loss": 4.4938, |
| "num_input_tokens_seen": 753008, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.5610510046367851, |
| "grad_norm": 0.7777657508850098, |
| "learning_rate": 4.952471685616058e-05, |
| "loss": 4.3919, |
| "num_input_tokens_seen": 756912, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.5687789799072642, |
| "grad_norm": 0.32821112871170044, |
| "learning_rate": 4.9511540031725454e-05, |
| "loss": 4.797, |
| "num_input_tokens_seen": 760432, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.5765069551777433, |
| "grad_norm": 0.2888381779193878, |
| "learning_rate": 4.949818483946853e-05, |
| "loss": 4.3946, |
| "num_input_tokens_seen": 764592, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.5842349304482226, |
| "grad_norm": 0.30184459686279297, |
| "learning_rate": 4.9484651376574094e-05, |
| "loss": 4.2145, |
| "num_input_tokens_seen": 767792, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.5919629057187017, |
| "grad_norm": 0.3076271414756775, |
| "learning_rate": 4.9470939741523685e-05, |
| "loss": 5.2852, |
| "num_input_tokens_seen": 772080, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.599690880989181, |
| "grad_norm": 0.31468066573143005, |
| "learning_rate": 4.9457050034095395e-05, |
| "loss": 4.7483, |
| "num_input_tokens_seen": 775216, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.60741885625966, |
| "grad_norm": 0.5978931784629822, |
| "learning_rate": 4.944298235536311e-05, |
| "loss": 4.5684, |
| "num_input_tokens_seen": 778992, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.6151468315301392, |
| "grad_norm": 0.45990973711013794, |
| "learning_rate": 4.942873680769581e-05, |
| "loss": 4.4237, |
| "num_input_tokens_seen": 782832, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.6228748068006182, |
| "grad_norm": 0.3652671277523041, |
| "learning_rate": 4.9414313494756804e-05, |
| "loss": 4.5137, |
| "num_input_tokens_seen": 786288, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.6306027820710973, |
| "grad_norm": 0.3771812915802002, |
| "learning_rate": 4.9399712521502966e-05, |
| "loss": 4.4615, |
| "num_input_tokens_seen": 789552, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.6383307573415764, |
| "grad_norm": 0.40835604071617126, |
| "learning_rate": 4.9384933994184016e-05, |
| "loss": 4.4301, |
| "num_input_tokens_seen": 793712, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.6460587326120555, |
| "grad_norm": 0.6361907720565796, |
| "learning_rate": 4.9369978020341676e-05, |
| "loss": 4.5741, |
| "num_input_tokens_seen": 797040, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.6537867078825348, |
| "grad_norm": 0.2855110466480255, |
| "learning_rate": 4.9354844708808965e-05, |
| "loss": 4.3495, |
| "num_input_tokens_seen": 800688, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.6615146831530139, |
| "grad_norm": 0.4478287100791931, |
| "learning_rate": 4.933953416970935e-05, |
| "loss": 4.6778, |
| "num_input_tokens_seen": 804656, |
| "step": 1075 |
| }, |
| { |
| "epoch": 1.6692426584234932, |
| "grad_norm": 0.3282228410243988, |
| "learning_rate": 4.932404651445596e-05, |
| "loss": 4.492, |
| "num_input_tokens_seen": 808752, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.6769706336939723, |
| "grad_norm": 0.38058677315711975, |
| "learning_rate": 4.930838185575077e-05, |
| "loss": 4.4508, |
| "num_input_tokens_seen": 812592, |
| "step": 1085 |
| }, |
| { |
| "epoch": 1.6846986089644513, |
| "grad_norm": 0.3633776307106018, |
| "learning_rate": 4.929254030758383e-05, |
| "loss": 4.5136, |
| "num_input_tokens_seen": 816688, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.6924265842349304, |
| "grad_norm": 0.4143412411212921, |
| "learning_rate": 4.927652198523237e-05, |
| "loss": 4.5167, |
| "num_input_tokens_seen": 820592, |
| "step": 1095 |
| }, |
| { |
| "epoch": 1.7001545595054095, |
| "grad_norm": 0.2974889874458313, |
| "learning_rate": 4.926032700525997e-05, |
| "loss": 4.2801, |
| "num_input_tokens_seen": 823984, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.7078825347758886, |
| "grad_norm": 0.31788986921310425, |
| "learning_rate": 4.924395548551575e-05, |
| "loss": 4.7661, |
| "num_input_tokens_seen": 827760, |
| "step": 1105 |
| }, |
| { |
| "epoch": 1.7156105100463679, |
| "grad_norm": 0.3491007685661316, |
| "learning_rate": 4.9227407545133486e-05, |
| "loss": 3.9265, |
| "num_input_tokens_seen": 831408, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.723338485316847, |
| "grad_norm": 0.2992270886898041, |
| "learning_rate": 4.921068330453075e-05, |
| "loss": 4.0714, |
| "num_input_tokens_seen": 835120, |
| "step": 1115 |
| }, |
| { |
| "epoch": 1.7310664605873263, |
| "grad_norm": 0.31046733260154724, |
| "learning_rate": 4.9193782885408026e-05, |
| "loss": 4.1982, |
| "num_input_tokens_seen": 839216, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.7387944358578054, |
| "grad_norm": 0.3238506615161896, |
| "learning_rate": 4.917670641074784e-05, |
| "loss": 4.4634, |
| "num_input_tokens_seen": 842352, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.7465224111282844, |
| "grad_norm": 0.34013789892196655, |
| "learning_rate": 4.9159454004813854e-05, |
| "loss": 3.7904, |
| "num_input_tokens_seen": 845936, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.7542503863987635, |
| "grad_norm": 0.5322805643081665, |
| "learning_rate": 4.9142025793149935e-05, |
| "loss": 4.3029, |
| "num_input_tokens_seen": 849456, |
| "step": 1135 |
| }, |
| { |
| "epoch": 1.7619783616692426, |
| "grad_norm": 0.489900678396225, |
| "learning_rate": 4.912442190257931e-05, |
| "loss": 3.8957, |
| "num_input_tokens_seen": 852912, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.7697063369397217, |
| "grad_norm": 0.316989004611969, |
| "learning_rate": 4.9106642461203575e-05, |
| "loss": 4.4156, |
| "num_input_tokens_seen": 857648, |
| "step": 1145 |
| }, |
| { |
| "epoch": 1.7774343122102008, |
| "grad_norm": 0.31556737422943115, |
| "learning_rate": 4.908868759840181e-05, |
| "loss": 4.1275, |
| "num_input_tokens_seen": 861680, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.78516228748068, |
| "grad_norm": 0.28157737851142883, |
| "learning_rate": 4.907055744482959e-05, |
| "loss": 4.3739, |
| "num_input_tokens_seen": 865456, |
| "step": 1155 |
| }, |
| { |
| "epoch": 1.7928902627511591, |
| "grad_norm": 0.30491557717323303, |
| "learning_rate": 4.905225213241809e-05, |
| "loss": 4.083, |
| "num_input_tokens_seen": 868784, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.8006182380216385, |
| "grad_norm": 0.35626837611198425, |
| "learning_rate": 4.9033771794373084e-05, |
| "loss": 4.2187, |
| "num_input_tokens_seen": 872560, |
| "step": 1165 |
| }, |
| { |
| "epoch": 1.8083462132921175, |
| "grad_norm": 0.3904974162578583, |
| "learning_rate": 4.901511656517399e-05, |
| "loss": 3.9821, |
| "num_input_tokens_seen": 876528, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.8160741885625966, |
| "grad_norm": 0.6061235666275024, |
| "learning_rate": 4.8996286580572895e-05, |
| "loss": 4.3058, |
| "num_input_tokens_seen": 880624, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.8238021638330757, |
| "grad_norm": 0.30002129077911377, |
| "learning_rate": 4.8977281977593546e-05, |
| "loss": 4.2416, |
| "num_input_tokens_seen": 884528, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.8315301391035548, |
| "grad_norm": 0.29267552495002747, |
| "learning_rate": 4.8958102894530395e-05, |
| "loss": 4.0598, |
| "num_input_tokens_seen": 888176, |
| "step": 1185 |
| }, |
| { |
| "epoch": 1.8392581143740339, |
| "grad_norm": 0.33920446038246155, |
| "learning_rate": 4.8938749470947534e-05, |
| "loss": 4.432, |
| "num_input_tokens_seen": 892336, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.8469860896445132, |
| "grad_norm": 0.3370172679424286, |
| "learning_rate": 4.8919221847677744e-05, |
| "loss": 4.141, |
| "num_input_tokens_seen": 896496, |
| "step": 1195 |
| }, |
| { |
| "epoch": 1.8547140649149922, |
| "grad_norm": 0.2805825471878052, |
| "learning_rate": 4.889952016682142e-05, |
| "loss": 4.4681, |
| "num_input_tokens_seen": 900208, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.8624420401854715, |
| "grad_norm": 0.3391216993331909, |
| "learning_rate": 4.8879644571745565e-05, |
| "loss": 4.492, |
| "num_input_tokens_seen": 903536, |
| "step": 1205 |
| }, |
| { |
| "epoch": 1.8701700154559506, |
| "grad_norm": 0.7657363414764404, |
| "learning_rate": 4.885959520708272e-05, |
| "loss": 3.8387, |
| "num_input_tokens_seen": 907120, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.8778979907264297, |
| "grad_norm": 0.3277113139629364, |
| "learning_rate": 4.883937221872995e-05, |
| "loss": 4.0781, |
| "num_input_tokens_seen": 910576, |
| "step": 1215 |
| }, |
| { |
| "epoch": 1.8856259659969088, |
| "grad_norm": 0.3017078638076782, |
| "learning_rate": 4.881897575384774e-05, |
| "loss": 3.5843, |
| "num_input_tokens_seen": 914416, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.8933539412673879, |
| "grad_norm": 0.32566162943840027, |
| "learning_rate": 4.879840596085897e-05, |
| "loss": 3.9687, |
| "num_input_tokens_seen": 918064, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.901081916537867, |
| "grad_norm": 0.285196989774704, |
| "learning_rate": 4.877766298944779e-05, |
| "loss": 4.0323, |
| "num_input_tokens_seen": 922160, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.9088098918083463, |
| "grad_norm": 0.31941235065460205, |
| "learning_rate": 4.875674699055855e-05, |
| "loss": 4.0946, |
| "num_input_tokens_seen": 926128, |
| "step": 1235 |
| }, |
| { |
| "epoch": 1.9165378670788253, |
| "grad_norm": 0.36172521114349365, |
| "learning_rate": 4.8735658116394714e-05, |
| "loss": 4.3137, |
| "num_input_tokens_seen": 930224, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.9242658423493046, |
| "grad_norm": 0.2846207618713379, |
| "learning_rate": 4.871439652041773e-05, |
| "loss": 3.8571, |
| "num_input_tokens_seen": 934000, |
| "step": 1245 |
| }, |
| { |
| "epoch": 1.9319938176197837, |
| "grad_norm": 0.4269625246524811, |
| "learning_rate": 4.869296235734594e-05, |
| "loss": 4.116, |
| "num_input_tokens_seen": 938160, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.9397217928902628, |
| "grad_norm": 0.24182330071926117, |
| "learning_rate": 4.8671355783153415e-05, |
| "loss": 4.1353, |
| "num_input_tokens_seen": 941552, |
| "step": 1255 |
| }, |
| { |
| "epoch": 1.947449768160742, |
| "grad_norm": 0.2979944348335266, |
| "learning_rate": 4.864957695506885e-05, |
| "loss": 4.0559, |
| "num_input_tokens_seen": 945200, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.955177743431221, |
| "grad_norm": 0.2942993640899658, |
| "learning_rate": 4.862762603157445e-05, |
| "loss": 3.7291, |
| "num_input_tokens_seen": 949040, |
| "step": 1265 |
| }, |
| { |
| "epoch": 1.9629057187017, |
| "grad_norm": 0.2731165289878845, |
| "learning_rate": 4.860550317240467e-05, |
| "loss": 4.0914, |
| "num_input_tokens_seen": 952944, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.9706336939721791, |
| "grad_norm": 0.2240080088376999, |
| "learning_rate": 4.8583208538545175e-05, |
| "loss": 3.7287, |
| "num_input_tokens_seen": 956912, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.9783616692426584, |
| "grad_norm": 0.3749920427799225, |
| "learning_rate": 4.856074229223161e-05, |
| "loss": 3.8797, |
| "num_input_tokens_seen": 960752, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.9860896445131375, |
| "grad_norm": 0.24248334765434265, |
| "learning_rate": 4.85381045969484e-05, |
| "loss": 4.2639, |
| "num_input_tokens_seen": 964336, |
| "step": 1285 |
| }, |
| { |
| "epoch": 1.9938176197836168, |
| "grad_norm": 0.6645636558532715, |
| "learning_rate": 4.851529561742762e-05, |
| "loss": 3.9315, |
| "num_input_tokens_seen": 967600, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.001545595054096, |
| "grad_norm": 0.5366644263267517, |
| "learning_rate": 4.849231551964771e-05, |
| "loss": 3.407, |
| "num_input_tokens_seen": 970704, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.003091190108192, |
| "eval_loss": 3.904050588607788, |
| "eval_runtime": 9.8507, |
| "eval_samples_per_second": 58.372, |
| "eval_steps_per_second": 7.309, |
| "num_input_tokens_seen": 971536, |
| "step": 1296 |
| }, |
| { |
| "epoch": 2.009273570324575, |
| "grad_norm": 0.30658474564552307, |
| "learning_rate": 4.846916447083239e-05, |
| "loss": 4.2572, |
| "num_input_tokens_seen": 974416, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.017001545595054, |
| "grad_norm": 0.2476230412721634, |
| "learning_rate": 4.8445842639449313e-05, |
| "loss": 3.871, |
| "num_input_tokens_seen": 978704, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.024729520865533, |
| "grad_norm": 0.2552611529827118, |
| "learning_rate": 4.842235019520893e-05, |
| "loss": 4.0268, |
| "num_input_tokens_seen": 982480, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.0324574961360122, |
| "grad_norm": 0.243674173951149, |
| "learning_rate": 4.8398687309063206e-05, |
| "loss": 3.9918, |
| "num_input_tokens_seen": 986384, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.0401854714064913, |
| "grad_norm": 0.34793391823768616, |
| "learning_rate": 4.8374854153204405e-05, |
| "loss": 4.1587, |
| "num_input_tokens_seen": 990032, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.047913446676971, |
| "grad_norm": 0.24952645599842072, |
| "learning_rate": 4.835085090106382e-05, |
| "loss": 3.9453, |
| "num_input_tokens_seen": 993424, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.05564142194745, |
| "grad_norm": 0.4589490294456482, |
| "learning_rate": 4.832667772731051e-05, |
| "loss": 3.7476, |
| "num_input_tokens_seen": 997648, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.063369397217929, |
| "grad_norm": 0.2599397897720337, |
| "learning_rate": 4.830233480785005e-05, |
| "loss": 3.6993, |
| "num_input_tokens_seen": 1001360, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.071097372488408, |
| "grad_norm": 0.31336286664009094, |
| "learning_rate": 4.827782231982323e-05, |
| "loss": 4.0099, |
| "num_input_tokens_seen": 1005200, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.078825347758887, |
| "grad_norm": 0.2606953978538513, |
| "learning_rate": 4.8253140441604764e-05, |
| "loss": 4.0381, |
| "num_input_tokens_seen": 1008912, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.0865533230293662, |
| "grad_norm": 0.38538479804992676, |
| "learning_rate": 4.8228289352802006e-05, |
| "loss": 3.5811, |
| "num_input_tokens_seen": 1012688, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.0942812982998453, |
| "grad_norm": 0.29820919036865234, |
| "learning_rate": 4.820326923425364e-05, |
| "loss": 3.8514, |
| "num_input_tokens_seen": 1015952, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.1020092735703244, |
| "grad_norm": 0.338681697845459, |
| "learning_rate": 4.817808026802836e-05, |
| "loss": 3.8809, |
| "num_input_tokens_seen": 1019664, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.109737248840804, |
| "grad_norm": 0.38213613629341125, |
| "learning_rate": 4.815272263742354e-05, |
| "loss": 3.7324, |
| "num_input_tokens_seen": 1023632, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.117465224111283, |
| "grad_norm": 0.24088416993618011, |
| "learning_rate": 4.812719652696392e-05, |
| "loss": 3.6557, |
| "num_input_tokens_seen": 1027728, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.125193199381762, |
| "grad_norm": 0.32198968529701233, |
| "learning_rate": 4.810150212240023e-05, |
| "loss": 3.7389, |
| "num_input_tokens_seen": 1031696, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.132921174652241, |
| "grad_norm": 0.308168888092041, |
| "learning_rate": 4.807563961070788e-05, |
| "loss": 4.219, |
| "num_input_tokens_seen": 1035472, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.1406491499227203, |
| "grad_norm": 0.2740299105644226, |
| "learning_rate": 4.804960918008557e-05, |
| "loss": 2.9497, |
| "num_input_tokens_seen": 1038672, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.1483771251931993, |
| "grad_norm": 0.3373715281486511, |
| "learning_rate": 4.802341101995389e-05, |
| "loss": 4.0791, |
| "num_input_tokens_seen": 1042192, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.1561051004636784, |
| "grad_norm": 0.28904953598976135, |
| "learning_rate": 4.7997045320954056e-05, |
| "loss": 3.8542, |
| "num_input_tokens_seen": 1045712, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.1638330757341575, |
| "grad_norm": 0.27055951952934265, |
| "learning_rate": 4.797051227494638e-05, |
| "loss": 3.7141, |
| "num_input_tokens_seen": 1049552, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.1715610510046366, |
| "grad_norm": 0.3160632252693176, |
| "learning_rate": 4.7943812075008975e-05, |
| "loss": 3.9331, |
| "num_input_tokens_seen": 1052944, |
| "step": 1405 |
| }, |
| { |
| "epoch": 2.179289026275116, |
| "grad_norm": 0.4832019805908203, |
| "learning_rate": 4.791694491543629e-05, |
| "loss": 3.5992, |
| "num_input_tokens_seen": 1056656, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.187017001545595, |
| "grad_norm": 0.5526258945465088, |
| "learning_rate": 4.788991099173775e-05, |
| "loss": 3.9143, |
| "num_input_tokens_seen": 1060560, |
| "step": 1415 |
| }, |
| { |
| "epoch": 2.1947449768160743, |
| "grad_norm": 0.32115232944488525, |
| "learning_rate": 4.786271050063629e-05, |
| "loss": 3.9486, |
| "num_input_tokens_seen": 1064528, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.2024729520865534, |
| "grad_norm": 0.25502118468284607, |
| "learning_rate": 4.783534364006692e-05, |
| "loss": 3.5801, |
| "num_input_tokens_seen": 1067856, |
| "step": 1425 |
| }, |
| { |
| "epoch": 2.2102009273570324, |
| "grad_norm": 0.38059893250465393, |
| "learning_rate": 4.780781060917533e-05, |
| "loss": 3.5739, |
| "num_input_tokens_seen": 1071248, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.2179289026275115, |
| "grad_norm": 0.3651339113712311, |
| "learning_rate": 4.778011160831641e-05, |
| "loss": 3.5081, |
| "num_input_tokens_seen": 1074832, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.2256568778979906, |
| "grad_norm": 0.24716055393218994, |
| "learning_rate": 4.7752246839052785e-05, |
| "loss": 3.8176, |
| "num_input_tokens_seen": 1079184, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.2333848531684697, |
| "grad_norm": 0.2873936891555786, |
| "learning_rate": 4.7724216504153356e-05, |
| "loss": 4.0717, |
| "num_input_tokens_seen": 1082832, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.2411128284389488, |
| "grad_norm": 0.4039398431777954, |
| "learning_rate": 4.769602080759185e-05, |
| "loss": 3.8992, |
| "num_input_tokens_seen": 1086544, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.2488408037094283, |
| "grad_norm": 0.2683480679988861, |
| "learning_rate": 4.766765995454527e-05, |
| "loss": 3.2922, |
| "num_input_tokens_seen": 1090512, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.2565687789799074, |
| "grad_norm": 0.23535872995853424, |
| "learning_rate": 4.76391341513925e-05, |
| "loss": 3.7648, |
| "num_input_tokens_seen": 1094352, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.2642967542503865, |
| "grad_norm": 0.3309881389141083, |
| "learning_rate": 4.7610443605712696e-05, |
| "loss": 3.7942, |
| "num_input_tokens_seen": 1098128, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.2720247295208655, |
| "grad_norm": 0.33581194281578064, |
| "learning_rate": 4.758158852628387e-05, |
| "loss": 3.4687, |
| "num_input_tokens_seen": 1101392, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.2797527047913446, |
| "grad_norm": 0.30702441930770874, |
| "learning_rate": 4.7552569123081305e-05, |
| "loss": 3.9292, |
| "num_input_tokens_seen": 1104848, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.2874806800618237, |
| "grad_norm": 0.26855015754699707, |
| "learning_rate": 4.752338560727604e-05, |
| "loss": 3.51, |
| "num_input_tokens_seen": 1108560, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.295208655332303, |
| "grad_norm": 0.30905085802078247, |
| "learning_rate": 4.749403819123338e-05, |
| "loss": 3.7468, |
| "num_input_tokens_seen": 1112784, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.3029366306027823, |
| "grad_norm": 0.31154030561447144, |
| "learning_rate": 4.746452708851128e-05, |
| "loss": 4.1219, |
| "num_input_tokens_seen": 1116560, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.3106646058732614, |
| "grad_norm": 0.3405595123767853, |
| "learning_rate": 4.7434852513858844e-05, |
| "loss": 3.9791, |
| "num_input_tokens_seen": 1120272, |
| "step": 1495 |
| }, |
| { |
| "epoch": 2.3183925811437405, |
| "grad_norm": 0.3933602273464203, |
| "learning_rate": 4.740501468321473e-05, |
| "loss": 3.5535, |
| "num_input_tokens_seen": 1124048, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.3261205564142196, |
| "grad_norm": 0.26671847701072693, |
| "learning_rate": 4.737501381370561e-05, |
| "loss": 3.9563, |
| "num_input_tokens_seen": 1127952, |
| "step": 1505 |
| }, |
| { |
| "epoch": 2.3338485316846986, |
| "grad_norm": 0.2816687226295471, |
| "learning_rate": 4.7344850123644555e-05, |
| "loss": 3.5441, |
| "num_input_tokens_seen": 1132240, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.3415765069551777, |
| "grad_norm": 0.4195476472377777, |
| "learning_rate": 4.7314523832529465e-05, |
| "loss": 3.9082, |
| "num_input_tokens_seen": 1135952, |
| "step": 1515 |
| }, |
| { |
| "epoch": 2.349304482225657, |
| "grad_norm": 0.3049340844154358, |
| "learning_rate": 4.728403516104149e-05, |
| "loss": 3.6012, |
| "num_input_tokens_seen": 1139664, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.357032457496136, |
| "grad_norm": 0.3365153968334198, |
| "learning_rate": 4.725338433104337e-05, |
| "loss": 3.7627, |
| "num_input_tokens_seen": 1143312, |
| "step": 1525 |
| }, |
| { |
| "epoch": 2.364760432766615, |
| "grad_norm": 0.2289888709783554, |
| "learning_rate": 4.72225715655779e-05, |
| "loss": 3.848, |
| "num_input_tokens_seen": 1147216, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.3724884080370945, |
| "grad_norm": 0.26810458302497864, |
| "learning_rate": 4.719159708886621e-05, |
| "loss": 4.0156, |
| "num_input_tokens_seen": 1150864, |
| "step": 1535 |
| }, |
| { |
| "epoch": 2.3802163833075736, |
| "grad_norm": 0.670549750328064, |
| "learning_rate": 4.716046112630623e-05, |
| "loss": 3.466, |
| "num_input_tokens_seen": 1154384, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.3879443585780527, |
| "grad_norm": 0.4632706046104431, |
| "learning_rate": 4.712916390447099e-05, |
| "loss": 3.454, |
| "num_input_tokens_seen": 1158288, |
| "step": 1545 |
| }, |
| { |
| "epoch": 2.3956723338485317, |
| "grad_norm": 0.2911776602268219, |
| "learning_rate": 4.709770565110697e-05, |
| "loss": 3.6318, |
| "num_input_tokens_seen": 1161744, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.403400309119011, |
| "grad_norm": 0.2800629734992981, |
| "learning_rate": 4.7066086595132486e-05, |
| "loss": 3.9736, |
| "num_input_tokens_seen": 1165584, |
| "step": 1555 |
| }, |
| { |
| "epoch": 2.41112828438949, |
| "grad_norm": 0.2719181776046753, |
| "learning_rate": 4.7034306966635966e-05, |
| "loss": 3.7662, |
| "num_input_tokens_seen": 1169104, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.418856259659969, |
| "grad_norm": 0.34364286065101624, |
| "learning_rate": 4.700236699687434e-05, |
| "loss": 3.5639, |
| "num_input_tokens_seen": 1173200, |
| "step": 1565 |
| }, |
| { |
| "epoch": 2.426584234930448, |
| "grad_norm": 0.25128039717674255, |
| "learning_rate": 4.697026691827129e-05, |
| "loss": 3.6556, |
| "num_input_tokens_seen": 1176848, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.434312210200927, |
| "grad_norm": 0.38562244176864624, |
| "learning_rate": 4.693800696441564e-05, |
| "loss": 3.3021, |
| "num_input_tokens_seen": 1180432, |
| "step": 1575 |
| }, |
| { |
| "epoch": 2.4420401854714067, |
| "grad_norm": 0.2515873312950134, |
| "learning_rate": 4.690558737005955e-05, |
| "loss": 3.5264, |
| "num_input_tokens_seen": 1184528, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.4497681607418857, |
| "grad_norm": 0.3244396448135376, |
| "learning_rate": 4.687300837111691e-05, |
| "loss": 3.2549, |
| "num_input_tokens_seen": 1188560, |
| "step": 1585 |
| }, |
| { |
| "epoch": 2.457496136012365, |
| "grad_norm": 0.23380200564861298, |
| "learning_rate": 4.6840270204661575e-05, |
| "loss": 3.4865, |
| "num_input_tokens_seen": 1192656, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.465224111282844, |
| "grad_norm": 0.24694781005382538, |
| "learning_rate": 4.6807373108925626e-05, |
| "loss": 3.6555, |
| "num_input_tokens_seen": 1196176, |
| "step": 1595 |
| }, |
| { |
| "epoch": 2.472952086553323, |
| "grad_norm": 0.42272141575813293, |
| "learning_rate": 4.677431732329766e-05, |
| "loss": 3.3435, |
| "num_input_tokens_seen": 1199824, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.480680061823802, |
| "grad_norm": 0.31362879276275635, |
| "learning_rate": 4.674110308832106e-05, |
| "loss": 3.6951, |
| "num_input_tokens_seen": 1204048, |
| "step": 1605 |
| }, |
| { |
| "epoch": 2.488408037094281, |
| "grad_norm": 0.22943803668022156, |
| "learning_rate": 4.670773064569221e-05, |
| "loss": 3.8555, |
| "num_input_tokens_seen": 1207632, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.4961360123647607, |
| "grad_norm": 0.26428958773612976, |
| "learning_rate": 4.667420023825876e-05, |
| "loss": 3.4376, |
| "num_input_tokens_seen": 1211216, |
| "step": 1615 |
| }, |
| { |
| "epoch": 2.5038639876352393, |
| "grad_norm": 0.39434218406677246, |
| "learning_rate": 4.664051211001786e-05, |
| "loss": 3.3945, |
| "num_input_tokens_seen": 1214864, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.5038639876352393, |
| "eval_loss": 3.5485548973083496, |
| "eval_runtime": 9.8824, |
| "eval_samples_per_second": 58.185, |
| "eval_steps_per_second": 7.286, |
| "num_input_tokens_seen": 1214864, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.511591962905719, |
| "grad_norm": 0.49116188287734985, |
| "learning_rate": 4.660666650611436e-05, |
| "loss": 3.3948, |
| "num_input_tokens_seen": 1218960, |
| "step": 1625 |
| }, |
| { |
| "epoch": 2.519319938176198, |
| "grad_norm": 0.5065126419067383, |
| "learning_rate": 4.657266367283906e-05, |
| "loss": 3.1531, |
| "num_input_tokens_seen": 1223120, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.527047913446677, |
| "grad_norm": 0.395476758480072, |
| "learning_rate": 4.653850385762689e-05, |
| "loss": 3.2479, |
| "num_input_tokens_seen": 1226768, |
| "step": 1635 |
| }, |
| { |
| "epoch": 2.534775888717156, |
| "grad_norm": 0.36967065930366516, |
| "learning_rate": 4.6504187309055135e-05, |
| "loss": 3.4955, |
| "num_input_tokens_seen": 1230352, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.542503863987635, |
| "grad_norm": 0.25611770153045654, |
| "learning_rate": 4.646971427684159e-05, |
| "loss": 3.8143, |
| "num_input_tokens_seen": 1233936, |
| "step": 1645 |
| }, |
| { |
| "epoch": 2.5502318392581143, |
| "grad_norm": 0.4444847106933594, |
| "learning_rate": 4.6435085011842785e-05, |
| "loss": 3.7283, |
| "num_input_tokens_seen": 1237392, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.5579598145285933, |
| "grad_norm": 0.5292262434959412, |
| "learning_rate": 4.6400299766052126e-05, |
| "loss": 3.6426, |
| "num_input_tokens_seen": 1241552, |
| "step": 1655 |
| }, |
| { |
| "epoch": 2.565687789799073, |
| "grad_norm": 0.24306781589984894, |
| "learning_rate": 4.636535879259808e-05, |
| "loss": 3.837, |
| "num_input_tokens_seen": 1245008, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.573415765069552, |
| "grad_norm": 0.2738051414489746, |
| "learning_rate": 4.633026234574232e-05, |
| "loss": 3.4779, |
| "num_input_tokens_seen": 1248592, |
| "step": 1665 |
| }, |
| { |
| "epoch": 2.581143740340031, |
| "grad_norm": 0.26396363973617554, |
| "learning_rate": 4.62950106808779e-05, |
| "loss": 3.3651, |
| "num_input_tokens_seen": 1252496, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.58887171561051, |
| "grad_norm": 0.3096264898777008, |
| "learning_rate": 4.6259604054527364e-05, |
| "loss": 3.7189, |
| "num_input_tokens_seen": 1255824, |
| "step": 1675 |
| }, |
| { |
| "epoch": 2.596599690880989, |
| "grad_norm": 0.37881341576576233, |
| "learning_rate": 4.622404272434089e-05, |
| "loss": 3.8145, |
| "num_input_tokens_seen": 1259728, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.6043276661514683, |
| "grad_norm": 0.5067923665046692, |
| "learning_rate": 4.6188326949094425e-05, |
| "loss": 3.2801, |
| "num_input_tokens_seen": 1263376, |
| "step": 1685 |
| }, |
| { |
| "epoch": 2.6120556414219473, |
| "grad_norm": 0.30508747696876526, |
| "learning_rate": 4.615245698868781e-05, |
| "loss": 3.7173, |
| "num_input_tokens_seen": 1267472, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.6197836166924264, |
| "grad_norm": 0.2755107879638672, |
| "learning_rate": 4.6116433104142845e-05, |
| "loss": 3.4027, |
| "num_input_tokens_seen": 1271312, |
| "step": 1695 |
| }, |
| { |
| "epoch": 2.6275115919629055, |
| "grad_norm": 0.4445127248764038, |
| "learning_rate": 4.608025555760145e-05, |
| "loss": 3.373, |
| "num_input_tokens_seen": 1274960, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.635239567233385, |
| "grad_norm": 0.2652303874492645, |
| "learning_rate": 4.604392461232371e-05, |
| "loss": 3.491, |
| "num_input_tokens_seen": 1278160, |
| "step": 1705 |
| }, |
| { |
| "epoch": 2.642967542503864, |
| "grad_norm": 0.27256447076797485, |
| "learning_rate": 4.600744053268596e-05, |
| "loss": 3.7528, |
| "num_input_tokens_seen": 1282128, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.650695517774343, |
| "grad_norm": 0.30931556224823, |
| "learning_rate": 4.597080358417893e-05, |
| "loss": 3.4361, |
| "num_input_tokens_seen": 1285840, |
| "step": 1715 |
| }, |
| { |
| "epoch": 2.6584234930448223, |
| "grad_norm": 0.2930947244167328, |
| "learning_rate": 4.5934014033405695e-05, |
| "loss": 3.1586, |
| "num_input_tokens_seen": 1289744, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.6661514683153014, |
| "grad_norm": 0.22680403292179108, |
| "learning_rate": 4.5897072148079846e-05, |
| "loss": 3.3894, |
| "num_input_tokens_seen": 1293840, |
| "step": 1725 |
| }, |
| { |
| "epoch": 2.6738794435857804, |
| "grad_norm": 0.25522181391716003, |
| "learning_rate": 4.585997819702348e-05, |
| "loss": 3.6516, |
| "num_input_tokens_seen": 1297872, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.6816074188562595, |
| "grad_norm": 0.3003213405609131, |
| "learning_rate": 4.5822732450165253e-05, |
| "loss": 3.3977, |
| "num_input_tokens_seen": 1301712, |
| "step": 1735 |
| }, |
| { |
| "epoch": 2.689335394126739, |
| "grad_norm": 0.287356972694397, |
| "learning_rate": 4.5785335178538444e-05, |
| "loss": 3.229, |
| "num_input_tokens_seen": 1305040, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.6970633693972177, |
| "grad_norm": 0.23212386667728424, |
| "learning_rate": 4.5747786654278936e-05, |
| "loss": 3.4395, |
| "num_input_tokens_seen": 1308880, |
| "step": 1745 |
| }, |
| { |
| "epoch": 2.704791344667697, |
| "grad_norm": 0.4021557569503784, |
| "learning_rate": 4.5710087150623274e-05, |
| "loss": 3.5296, |
| "num_input_tokens_seen": 1312464, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.7125193199381763, |
| "grad_norm": 0.27974724769592285, |
| "learning_rate": 4.567223694190667e-05, |
| "loss": 3.5769, |
| "num_input_tokens_seen": 1316240, |
| "step": 1755 |
| }, |
| { |
| "epoch": 2.7202472952086554, |
| "grad_norm": 0.29455214738845825, |
| "learning_rate": 4.563423630356099e-05, |
| "loss": 3.1926, |
| "num_input_tokens_seen": 1320080, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.7279752704791345, |
| "grad_norm": 0.6454901695251465, |
| "learning_rate": 4.559608551211276e-05, |
| "loss": 3.4921, |
| "num_input_tokens_seen": 1323408, |
| "step": 1765 |
| }, |
| { |
| "epoch": 2.7357032457496135, |
| "grad_norm": 0.41331547498703003, |
| "learning_rate": 4.555778484518116e-05, |
| "loss": 3.5073, |
| "num_input_tokens_seen": 1327376, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.7434312210200926, |
| "grad_norm": 0.26140904426574707, |
| "learning_rate": 4.551933458147599e-05, |
| "loss": 3.6708, |
| "num_input_tokens_seen": 1331152, |
| "step": 1775 |
| }, |
| { |
| "epoch": 2.7511591962905717, |
| "grad_norm": 0.2431512176990509, |
| "learning_rate": 4.548073500079566e-05, |
| "loss": 3.2524, |
| "num_input_tokens_seen": 1335120, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.7588871715610512, |
| "grad_norm": 0.28115934133529663, |
| "learning_rate": 4.544198638402514e-05, |
| "loss": 3.2425, |
| "num_input_tokens_seen": 1338576, |
| "step": 1785 |
| }, |
| { |
| "epoch": 2.76661514683153, |
| "grad_norm": 0.31076323986053467, |
| "learning_rate": 4.5403089013133905e-05, |
| "loss": 3.5338, |
| "num_input_tokens_seen": 1342288, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.7743431221020094, |
| "grad_norm": 0.3837130069732666, |
| "learning_rate": 4.536404317117392e-05, |
| "loss": 3.5508, |
| "num_input_tokens_seen": 1346192, |
| "step": 1795 |
| }, |
| { |
| "epoch": 2.7820710973724885, |
| "grad_norm": 0.4085627794265747, |
| "learning_rate": 4.5324849142277545e-05, |
| "loss": 3.3496, |
| "num_input_tokens_seen": 1350160, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.7897990726429676, |
| "grad_norm": 0.393399715423584, |
| "learning_rate": 4.5285507211655486e-05, |
| "loss": 3.7701, |
| "num_input_tokens_seen": 1354320, |
| "step": 1805 |
| }, |
| { |
| "epoch": 2.7975270479134466, |
| "grad_norm": 0.32475441694259644, |
| "learning_rate": 4.52460176655947e-05, |
| "loss": 3.3192, |
| "num_input_tokens_seen": 1358096, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.8052550231839257, |
| "grad_norm": 0.38561108708381653, |
| "learning_rate": 4.520638079145635e-05, |
| "loss": 3.4763, |
| "num_input_tokens_seen": 1361552, |
| "step": 1815 |
| }, |
| { |
| "epoch": 2.812982998454405, |
| "grad_norm": 0.2731021046638489, |
| "learning_rate": 4.516659687767367e-05, |
| "loss": 3.6312, |
| "num_input_tokens_seen": 1365776, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.820710973724884, |
| "grad_norm": 0.2799111604690552, |
| "learning_rate": 4.512666621374989e-05, |
| "loss": 3.4312, |
| "num_input_tokens_seen": 1369296, |
| "step": 1825 |
| }, |
| { |
| "epoch": 2.8284389489953634, |
| "grad_norm": 0.40603402256965637, |
| "learning_rate": 4.5086589090256124e-05, |
| "loss": 3.6473, |
| "num_input_tokens_seen": 1372752, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.8361669242658425, |
| "grad_norm": 0.31322500109672546, |
| "learning_rate": 4.5046365798829265e-05, |
| "loss": 3.5422, |
| "num_input_tokens_seen": 1376336, |
| "step": 1835 |
| }, |
| { |
| "epoch": 2.8438948995363216, |
| "grad_norm": 0.3008683919906616, |
| "learning_rate": 4.5005996632169845e-05, |
| "loss": 3.3106, |
| "num_input_tokens_seen": 1379664, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.8516228748068007, |
| "grad_norm": 0.2807864844799042, |
| "learning_rate": 4.4965481884039915e-05, |
| "loss": 3.4939, |
| "num_input_tokens_seen": 1383632, |
| "step": 1845 |
| }, |
| { |
| "epoch": 2.8593508500772797, |
| "grad_norm": 0.31224992871284485, |
| "learning_rate": 4.492482184926091e-05, |
| "loss": 3.5265, |
| "num_input_tokens_seen": 1387856, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.867078825347759, |
| "grad_norm": 0.367418497800827, |
| "learning_rate": 4.48840168237115e-05, |
| "loss": 3.6019, |
| "num_input_tokens_seen": 1391888, |
| "step": 1855 |
| }, |
| { |
| "epoch": 2.874806800618238, |
| "grad_norm": 0.23605087399482727, |
| "learning_rate": 4.484306710432544e-05, |
| "loss": 3.2355, |
| "num_input_tokens_seen": 1395344, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.8825347758887174, |
| "grad_norm": 0.42899090051651, |
| "learning_rate": 4.480197298908939e-05, |
| "loss": 3.3917, |
| "num_input_tokens_seen": 1398928, |
| "step": 1865 |
| }, |
| { |
| "epoch": 2.890262751159196, |
| "grad_norm": 0.23555681109428406, |
| "learning_rate": 4.4760734777040785e-05, |
| "loss": 3.5563, |
| "num_input_tokens_seen": 1402512, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.8979907264296756, |
| "grad_norm": 0.3788621127605438, |
| "learning_rate": 4.471935276826563e-05, |
| "loss": 3.4171, |
| "num_input_tokens_seen": 1406544, |
| "step": 1875 |
| }, |
| { |
| "epoch": 2.9057187017001547, |
| "grad_norm": 0.2907779812812805, |
| "learning_rate": 4.4677827263896315e-05, |
| "loss": 3.5528, |
| "num_input_tokens_seen": 1410064, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.9134466769706338, |
| "grad_norm": 0.29947736859321594, |
| "learning_rate": 4.463615856610943e-05, |
| "loss": 3.3987, |
| "num_input_tokens_seen": 1413648, |
| "step": 1885 |
| }, |
| { |
| "epoch": 2.921174652241113, |
| "grad_norm": 0.35539594292640686, |
| "learning_rate": 4.4594346978123595e-05, |
| "loss": 3.3475, |
| "num_input_tokens_seen": 1417232, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.928902627511592, |
| "grad_norm": 0.24370482563972473, |
| "learning_rate": 4.45523928041972e-05, |
| "loss": 3.565, |
| "num_input_tokens_seen": 1420560, |
| "step": 1895 |
| }, |
| { |
| "epoch": 2.936630602782071, |
| "grad_norm": 0.3878605365753174, |
| "learning_rate": 4.45102963496262e-05, |
| "loss": 3.2705, |
| "num_input_tokens_seen": 1424656, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.94435857805255, |
| "grad_norm": 0.24530839920043945, |
| "learning_rate": 4.4468057920741976e-05, |
| "loss": 3.6164, |
| "num_input_tokens_seen": 1428688, |
| "step": 1905 |
| }, |
| { |
| "epoch": 2.9520865533230296, |
| "grad_norm": 0.2778976857662201, |
| "learning_rate": 4.442567782490897e-05, |
| "loss": 3.4781, |
| "num_input_tokens_seen": 1432144, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.9598145285935082, |
| "grad_norm": 0.39094430208206177, |
| "learning_rate": 4.4383156370522554e-05, |
| "loss": 3.5724, |
| "num_input_tokens_seen": 1435792, |
| "step": 1915 |
| }, |
| { |
| "epoch": 2.9675425038639878, |
| "grad_norm": 0.22923092544078827, |
| "learning_rate": 4.434049386700676e-05, |
| "loss": 3.2843, |
| "num_input_tokens_seen": 1439120, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.975270479134467, |
| "grad_norm": 0.2819403409957886, |
| "learning_rate": 4.4297690624811984e-05, |
| "loss": 3.4764, |
| "num_input_tokens_seen": 1442896, |
| "step": 1925 |
| }, |
| { |
| "epoch": 2.982998454404946, |
| "grad_norm": 0.466340571641922, |
| "learning_rate": 4.42547469554128e-05, |
| "loss": 3.0741, |
| "num_input_tokens_seen": 1446352, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.990726429675425, |
| "grad_norm": 0.2733190655708313, |
| "learning_rate": 4.421166317130563e-05, |
| "loss": 3.2974, |
| "num_input_tokens_seen": 1450256, |
| "step": 1935 |
| }, |
| { |
| "epoch": 2.998454404945904, |
| "grad_norm": 0.2440556436777115, |
| "learning_rate": 4.4168439586006506e-05, |
| "loss": 3.0494, |
| "num_input_tokens_seen": 1454096, |
| "step": 1940 |
| }, |
| { |
| "epoch": 3.0046367851622873, |
| "eval_loss": 3.3124587535858154, |
| "eval_runtime": 9.8622, |
| "eval_samples_per_second": 58.303, |
| "eval_steps_per_second": 7.301, |
| "num_input_tokens_seen": 1456656, |
| "step": 1944 |
| }, |
| { |
| "epoch": 3.006182380216383, |
| "grad_norm": 0.2453344613313675, |
| "learning_rate": 4.412507651404878e-05, |
| "loss": 3.2227, |
| "num_input_tokens_seen": 1457360, |
| "step": 1945 |
| }, |
| { |
| "epoch": 3.0139103554868623, |
| "grad_norm": 0.32557570934295654, |
| "learning_rate": 4.408157427098083e-05, |
| "loss": 3.336, |
| "num_input_tokens_seen": 1460752, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.021638330757342, |
| "grad_norm": 0.2702576518058777, |
| "learning_rate": 4.4037933173363756e-05, |
| "loss": 3.3529, |
| "num_input_tokens_seen": 1464208, |
| "step": 1955 |
| }, |
| { |
| "epoch": 3.029366306027821, |
| "grad_norm": 0.389478474855423, |
| "learning_rate": 4.3994153538769114e-05, |
| "loss": 3.1414, |
| "num_input_tokens_seen": 1467792, |
| "step": 1960 |
| }, |
| { |
| "epoch": 3.0370942812983, |
| "grad_norm": 0.2336999922990799, |
| "learning_rate": 4.395023568577655e-05, |
| "loss": 3.4423, |
| "num_input_tokens_seen": 1471504, |
| "step": 1965 |
| }, |
| { |
| "epoch": 3.044822256568779, |
| "grad_norm": 0.2960321605205536, |
| "learning_rate": 4.390617993397153e-05, |
| "loss": 3.3133, |
| "num_input_tokens_seen": 1475216, |
| "step": 1970 |
| }, |
| { |
| "epoch": 3.052550231839258, |
| "grad_norm": 0.39140525460243225, |
| "learning_rate": 4.3861986603942985e-05, |
| "loss": 3.4031, |
| "num_input_tokens_seen": 1478672, |
| "step": 1975 |
| }, |
| { |
| "epoch": 3.060278207109737, |
| "grad_norm": 0.40083011984825134, |
| "learning_rate": 4.3817656017280995e-05, |
| "loss": 3.3519, |
| "num_input_tokens_seen": 1482704, |
| "step": 1980 |
| }, |
| { |
| "epoch": 3.0680061823802163, |
| "grad_norm": 0.27878859639167786, |
| "learning_rate": 4.3773188496574424e-05, |
| "loss": 3.6379, |
| "num_input_tokens_seen": 1486160, |
| "step": 1985 |
| }, |
| { |
| "epoch": 3.0757341576506954, |
| "grad_norm": 0.24547715485095978, |
| "learning_rate": 4.372858436540863e-05, |
| "loss": 3.0755, |
| "num_input_tokens_seen": 1489360, |
| "step": 1990 |
| }, |
| { |
| "epoch": 3.0834621329211744, |
| "grad_norm": 0.3561779260635376, |
| "learning_rate": 4.368384394836301e-05, |
| "loss": 3.2519, |
| "num_input_tokens_seen": 1493136, |
| "step": 1995 |
| }, |
| { |
| "epoch": 3.091190108191654, |
| "grad_norm": 0.561246931552887, |
| "learning_rate": 4.363896757100876e-05, |
| "loss": 3.0584, |
| "num_input_tokens_seen": 1497552, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.098918083462133, |
| "grad_norm": 0.3138968348503113, |
| "learning_rate": 4.359395555990641e-05, |
| "loss": 3.6345, |
| "num_input_tokens_seen": 1501200, |
| "step": 2005 |
| }, |
| { |
| "epoch": 3.106646058732612, |
| "grad_norm": 0.2394760251045227, |
| "learning_rate": 4.3548808242603484e-05, |
| "loss": 3.3659, |
| "num_input_tokens_seen": 1505296, |
| "step": 2010 |
| }, |
| { |
| "epoch": 3.114374034003091, |
| "grad_norm": 0.25303345918655396, |
| "learning_rate": 4.3503525947632126e-05, |
| "loss": 3.5564, |
| "num_input_tokens_seen": 1509456, |
| "step": 2015 |
| }, |
| { |
| "epoch": 3.1221020092735703, |
| "grad_norm": 0.5716120004653931, |
| "learning_rate": 4.3458109004506684e-05, |
| "loss": 2.9278, |
| "num_input_tokens_seen": 1512976, |
| "step": 2020 |
| }, |
| { |
| "epoch": 3.1298299845440494, |
| "grad_norm": 0.3777564764022827, |
| "learning_rate": 4.3412557743721336e-05, |
| "loss": 3.3736, |
| "num_input_tokens_seen": 1516752, |
| "step": 2025 |
| }, |
| { |
| "epoch": 3.1375579598145285, |
| "grad_norm": 0.5030646920204163, |
| "learning_rate": 4.336687249674768e-05, |
| "loss": 3.0751, |
| "num_input_tokens_seen": 1520400, |
| "step": 2030 |
| }, |
| { |
| "epoch": 3.1452859350850075, |
| "grad_norm": 0.40548887848854065, |
| "learning_rate": 4.33210535960323e-05, |
| "loss": 3.464, |
| "num_input_tokens_seen": 1524048, |
| "step": 2035 |
| }, |
| { |
| "epoch": 3.153013910355487, |
| "grad_norm": 0.287009596824646, |
| "learning_rate": 4.3275101374994386e-05, |
| "loss": 2.9975, |
| "num_input_tokens_seen": 1527440, |
| "step": 2040 |
| }, |
| { |
| "epoch": 3.160741885625966, |
| "grad_norm": 0.2755314111709595, |
| "learning_rate": 4.322901616802326e-05, |
| "loss": 3.1306, |
| "num_input_tokens_seen": 1531088, |
| "step": 2045 |
| }, |
| { |
| "epoch": 3.1684698608964452, |
| "grad_norm": 0.2994464337825775, |
| "learning_rate": 4.3182798310475994e-05, |
| "loss": 3.3007, |
| "num_input_tokens_seen": 1535568, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.1761978361669243, |
| "grad_norm": 0.42739635705947876, |
| "learning_rate": 4.313644813867491e-05, |
| "loss": 3.4714, |
| "num_input_tokens_seen": 1539408, |
| "step": 2055 |
| }, |
| { |
| "epoch": 3.1839258114374034, |
| "grad_norm": 0.33637383580207825, |
| "learning_rate": 4.308996598990521e-05, |
| "loss": 3.2876, |
| "num_input_tokens_seen": 1543376, |
| "step": 2060 |
| }, |
| { |
| "epoch": 3.1916537867078825, |
| "grad_norm": 0.317121297121048, |
| "learning_rate": 4.3043352202412445e-05, |
| "loss": 3.3758, |
| "num_input_tokens_seen": 1547216, |
| "step": 2065 |
| }, |
| { |
| "epoch": 3.1993817619783615, |
| "grad_norm": 0.471192330121994, |
| "learning_rate": 4.29966071154001e-05, |
| "loss": 3.3303, |
| "num_input_tokens_seen": 1550800, |
| "step": 2070 |
| }, |
| { |
| "epoch": 3.2071097372488406, |
| "grad_norm": 0.37737250328063965, |
| "learning_rate": 4.294973106902711e-05, |
| "loss": 3.0919, |
| "num_input_tokens_seen": 1554384, |
| "step": 2075 |
| }, |
| { |
| "epoch": 3.21483771251932, |
| "grad_norm": 0.3162528872489929, |
| "learning_rate": 4.2902724404405395e-05, |
| "loss": 3.7993, |
| "num_input_tokens_seen": 1557968, |
| "step": 2080 |
| }, |
| { |
| "epoch": 3.2225656877897992, |
| "grad_norm": 0.3474006652832031, |
| "learning_rate": 4.285558746359735e-05, |
| "loss": 3.3829, |
| "num_input_tokens_seen": 1562128, |
| "step": 2085 |
| }, |
| { |
| "epoch": 3.2302936630602783, |
| "grad_norm": 0.29031333327293396, |
| "learning_rate": 4.280832058961338e-05, |
| "loss": 3.2496, |
| "num_input_tokens_seen": 1566096, |
| "step": 2090 |
| }, |
| { |
| "epoch": 3.2380216383307574, |
| "grad_norm": 0.2682136595249176, |
| "learning_rate": 4.2760924126409427e-05, |
| "loss": 3.2577, |
| "num_input_tokens_seen": 1570064, |
| "step": 2095 |
| }, |
| { |
| "epoch": 3.2457496136012365, |
| "grad_norm": 0.2770131230354309, |
| "learning_rate": 4.271339841888441e-05, |
| "loss": 2.9641, |
| "num_input_tokens_seen": 1574032, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.2534775888717156, |
| "grad_norm": 0.3436896502971649, |
| "learning_rate": 4.266574381287776e-05, |
| "loss": 3.1281, |
| "num_input_tokens_seen": 1577488, |
| "step": 2105 |
| }, |
| { |
| "epoch": 3.2612055641421946, |
| "grad_norm": 0.32517606019973755, |
| "learning_rate": 4.261796065516688e-05, |
| "loss": 3.448, |
| "num_input_tokens_seen": 1581200, |
| "step": 2110 |
| }, |
| { |
| "epoch": 3.2689335394126737, |
| "grad_norm": 0.3597550392150879, |
| "learning_rate": 4.257004929346462e-05, |
| "loss": 3.2547, |
| "num_input_tokens_seen": 1585040, |
| "step": 2115 |
| }, |
| { |
| "epoch": 3.276661514683153, |
| "grad_norm": 0.2619961202144623, |
| "learning_rate": 4.252201007641679e-05, |
| "loss": 3.3299, |
| "num_input_tokens_seen": 1588624, |
| "step": 2120 |
| }, |
| { |
| "epoch": 3.2843894899536323, |
| "grad_norm": 0.2485017478466034, |
| "learning_rate": 4.247384335359956e-05, |
| "loss": 3.4412, |
| "num_input_tokens_seen": 1592784, |
| "step": 2125 |
| }, |
| { |
| "epoch": 3.2921174652241114, |
| "grad_norm": 0.5881773233413696, |
| "learning_rate": 4.2425549475516954e-05, |
| "loss": 2.9302, |
| "num_input_tokens_seen": 1596432, |
| "step": 2130 |
| }, |
| { |
| "epoch": 3.2998454404945905, |
| "grad_norm": 0.32718685269355774, |
| "learning_rate": 4.2377128793598295e-05, |
| "loss": 3.3778, |
| "num_input_tokens_seen": 1600016, |
| "step": 2135 |
| }, |
| { |
| "epoch": 3.3075734157650696, |
| "grad_norm": 0.26880738139152527, |
| "learning_rate": 4.232858166019564e-05, |
| "loss": 3.155, |
| "num_input_tokens_seen": 1604048, |
| "step": 2140 |
| }, |
| { |
| "epoch": 3.3153013910355487, |
| "grad_norm": 0.40621763467788696, |
| "learning_rate": 4.227990842858122e-05, |
| "loss": 3.0137, |
| "num_input_tokens_seen": 1607952, |
| "step": 2145 |
| }, |
| { |
| "epoch": 3.3230293663060277, |
| "grad_norm": 0.27599066495895386, |
| "learning_rate": 4.223110945294486e-05, |
| "loss": 3.4187, |
| "num_input_tokens_seen": 1611600, |
| "step": 2150 |
| }, |
| { |
| "epoch": 3.330757341576507, |
| "grad_norm": 0.32147035002708435, |
| "learning_rate": 4.2182185088391435e-05, |
| "loss": 3.236, |
| "num_input_tokens_seen": 1615184, |
| "step": 2155 |
| }, |
| { |
| "epoch": 3.338485316846986, |
| "grad_norm": 0.2467658519744873, |
| "learning_rate": 4.213313569093824e-05, |
| "loss": 3.2334, |
| "num_input_tokens_seen": 1619152, |
| "step": 2160 |
| }, |
| { |
| "epoch": 3.346213292117465, |
| "grad_norm": 0.31439894437789917, |
| "learning_rate": 4.208396161751243e-05, |
| "loss": 3.0429, |
| "num_input_tokens_seen": 1622736, |
| "step": 2165 |
| }, |
| { |
| "epoch": 3.3539412673879445, |
| "grad_norm": 0.27845317125320435, |
| "learning_rate": 4.20346632259484e-05, |
| "loss": 3.3637, |
| "num_input_tokens_seen": 1626256, |
| "step": 2170 |
| }, |
| { |
| "epoch": 3.3616692426584236, |
| "grad_norm": 0.34065747261047363, |
| "learning_rate": 4.198524087498522e-05, |
| "loss": 3.3511, |
| "num_input_tokens_seen": 1629584, |
| "step": 2175 |
| }, |
| { |
| "epoch": 3.3693972179289027, |
| "grad_norm": 0.3487941324710846, |
| "learning_rate": 4.193569492426398e-05, |
| "loss": 3.021, |
| "num_input_tokens_seen": 1633552, |
| "step": 2180 |
| }, |
| { |
| "epoch": 3.3771251931993818, |
| "grad_norm": 0.33543628454208374, |
| "learning_rate": 4.188602573432519e-05, |
| "loss": 3.215, |
| "num_input_tokens_seen": 1637584, |
| "step": 2185 |
| }, |
| { |
| "epoch": 3.384853168469861, |
| "grad_norm": 0.5411263704299927, |
| "learning_rate": 4.1836233666606176e-05, |
| "loss": 3.2398, |
| "num_input_tokens_seen": 1641744, |
| "step": 2190 |
| }, |
| { |
| "epoch": 3.39258114374034, |
| "grad_norm": 0.26962918043136597, |
| "learning_rate": 4.1786319083438406e-05, |
| "loss": 3.1149, |
| "num_input_tokens_seen": 1645520, |
| "step": 2195 |
| }, |
| { |
| "epoch": 3.400309119010819, |
| "grad_norm": 0.24972794950008392, |
| "learning_rate": 4.1736282348044916e-05, |
| "loss": 3.1159, |
| "num_input_tokens_seen": 1648912, |
| "step": 2200 |
| }, |
| { |
| "epoch": 3.4080370942812985, |
| "grad_norm": 0.35576534271240234, |
| "learning_rate": 4.168612382453759e-05, |
| "loss": 3.1835, |
| "num_input_tokens_seen": 1653072, |
| "step": 2205 |
| }, |
| { |
| "epoch": 3.4157650695517776, |
| "grad_norm": 0.3997161090373993, |
| "learning_rate": 4.163584387791458e-05, |
| "loss": 3.3053, |
| "num_input_tokens_seen": 1656912, |
| "step": 2210 |
| }, |
| { |
| "epoch": 3.4234930448222567, |
| "grad_norm": 0.47984302043914795, |
| "learning_rate": 4.158544287405762e-05, |
| "loss": 3.1529, |
| "num_input_tokens_seen": 1660560, |
| "step": 2215 |
| }, |
| { |
| "epoch": 3.4312210200927358, |
| "grad_norm": 0.24802884459495544, |
| "learning_rate": 4.153492117972934e-05, |
| "loss": 3.3344, |
| "num_input_tokens_seen": 1664528, |
| "step": 2220 |
| }, |
| { |
| "epoch": 3.438948995363215, |
| "grad_norm": 0.29277801513671875, |
| "learning_rate": 4.148427916257064e-05, |
| "loss": 3.0215, |
| "num_input_tokens_seen": 1668688, |
| "step": 2225 |
| }, |
| { |
| "epoch": 3.446676970633694, |
| "grad_norm": 0.3837185502052307, |
| "learning_rate": 4.1433517191098e-05, |
| "loss": 3.3542, |
| "num_input_tokens_seen": 1672528, |
| "step": 2230 |
| }, |
| { |
| "epoch": 3.454404945904173, |
| "grad_norm": 0.2690471112728119, |
| "learning_rate": 4.138263563470078e-05, |
| "loss": 3.1406, |
| "num_input_tokens_seen": 1675984, |
| "step": 2235 |
| }, |
| { |
| "epoch": 3.462132921174652, |
| "grad_norm": 0.28649571537971497, |
| "learning_rate": 4.133163486363857e-05, |
| "loss": 2.9112, |
| "num_input_tokens_seen": 1680080, |
| "step": 2240 |
| }, |
| { |
| "epoch": 3.469860896445131, |
| "grad_norm": 0.2906050980091095, |
| "learning_rate": 4.128051524903844e-05, |
| "loss": 3.0799, |
| "num_input_tokens_seen": 1683856, |
| "step": 2245 |
| }, |
| { |
| "epoch": 3.4775888717156107, |
| "grad_norm": 0.2971077561378479, |
| "learning_rate": 4.12292771628923e-05, |
| "loss": 3.4529, |
| "num_input_tokens_seen": 1688016, |
| "step": 2250 |
| }, |
| { |
| "epoch": 3.48531684698609, |
| "grad_norm": 0.279525488615036, |
| "learning_rate": 4.1177920978054144e-05, |
| "loss": 3.2334, |
| "num_input_tokens_seen": 1691664, |
| "step": 2255 |
| }, |
| { |
| "epoch": 3.493044822256569, |
| "grad_norm": 0.35451480746269226, |
| "learning_rate": 4.1126447068237376e-05, |
| "loss": 3.2177, |
| "num_input_tokens_seen": 1695312, |
| "step": 2260 |
| }, |
| { |
| "epoch": 3.500772797527048, |
| "grad_norm": 0.42419734597206116, |
| "learning_rate": 4.107485580801205e-05, |
| "loss": 3.1003, |
| "num_input_tokens_seen": 1699344, |
| "step": 2265 |
| }, |
| { |
| "epoch": 3.5054095826893352, |
| "eval_loss": 3.1060409545898438, |
| "eval_runtime": 9.8552, |
| "eval_samples_per_second": 58.345, |
| "eval_steps_per_second": 7.306, |
| "num_input_tokens_seen": 1701712, |
| "step": 2268 |
| }, |
| { |
| "epoch": 3.508500772797527, |
| "grad_norm": 0.3014146685600281, |
| "learning_rate": 4.102314757280219e-05, |
| "loss": 3.3953, |
| "num_input_tokens_seen": 1703312, |
| "step": 2270 |
| }, |
| { |
| "epoch": 3.516228748068006, |
| "grad_norm": 0.3373369574546814, |
| "learning_rate": 4.0971322738883014e-05, |
| "loss": 3.0952, |
| "num_input_tokens_seen": 1707088, |
| "step": 2275 |
| }, |
| { |
| "epoch": 3.523956723338485, |
| "grad_norm": 0.29237452149391174, |
| "learning_rate": 4.091938168337822e-05, |
| "loss": 3.1188, |
| "num_input_tokens_seen": 1710544, |
| "step": 2280 |
| }, |
| { |
| "epoch": 3.5316846986089647, |
| "grad_norm": 0.44121846556663513, |
| "learning_rate": 4.086732478425726e-05, |
| "loss": 2.8938, |
| "num_input_tokens_seen": 1713936, |
| "step": 2285 |
| }, |
| { |
| "epoch": 3.5394126738794434, |
| "grad_norm": 0.3628321886062622, |
| "learning_rate": 4.081515242033254e-05, |
| "loss": 3.0559, |
| "num_input_tokens_seen": 1717776, |
| "step": 2290 |
| }, |
| { |
| "epoch": 3.547140649149923, |
| "grad_norm": 0.3319717049598694, |
| "learning_rate": 4.076286497125671e-05, |
| "loss": 3.1759, |
| "num_input_tokens_seen": 1721616, |
| "step": 2295 |
| }, |
| { |
| "epoch": 3.554868624420402, |
| "grad_norm": 0.328025758266449, |
| "learning_rate": 4.071046281751986e-05, |
| "loss": 3.0737, |
| "num_input_tokens_seen": 1725520, |
| "step": 2300 |
| }, |
| { |
| "epoch": 3.562596599690881, |
| "grad_norm": 0.3481411635875702, |
| "learning_rate": 4.065794634044679e-05, |
| "loss": 2.8744, |
| "num_input_tokens_seen": 1729232, |
| "step": 2305 |
| }, |
| { |
| "epoch": 3.57032457496136, |
| "grad_norm": 0.2612632215023041, |
| "learning_rate": 4.060531592219422e-05, |
| "loss": 3.0024, |
| "num_input_tokens_seen": 1732752, |
| "step": 2310 |
| }, |
| { |
| "epoch": 3.578052550231839, |
| "grad_norm": 0.2584324777126312, |
| "learning_rate": 4.0552571945748e-05, |
| "loss": 2.8444, |
| "num_input_tokens_seen": 1736528, |
| "step": 2315 |
| }, |
| { |
| "epoch": 3.5857805255023183, |
| "grad_norm": 0.3369653820991516, |
| "learning_rate": 4.049971479492034e-05, |
| "loss": 3.0028, |
| "num_input_tokens_seen": 1740048, |
| "step": 2320 |
| }, |
| { |
| "epoch": 3.5935085007727974, |
| "grad_norm": 0.35002318024635315, |
| "learning_rate": 4.044674485434699e-05, |
| "loss": 3.2903, |
| "num_input_tokens_seen": 1743824, |
| "step": 2325 |
| }, |
| { |
| "epoch": 3.601236476043277, |
| "grad_norm": 0.29385900497436523, |
| "learning_rate": 4.039366250948448e-05, |
| "loss": 3.095, |
| "num_input_tokens_seen": 1747920, |
| "step": 2330 |
| }, |
| { |
| "epoch": 3.6089644513137555, |
| "grad_norm": 0.5506426095962524, |
| "learning_rate": 4.034046814660728e-05, |
| "loss": 3.2893, |
| "num_input_tokens_seen": 1751632, |
| "step": 2335 |
| }, |
| { |
| "epoch": 3.616692426584235, |
| "grad_norm": 0.24844296276569366, |
| "learning_rate": 4.0287162152805e-05, |
| "loss": 3.3977, |
| "num_input_tokens_seen": 1755664, |
| "step": 2340 |
| }, |
| { |
| "epoch": 3.624420401854714, |
| "grad_norm": 0.38642531633377075, |
| "learning_rate": 4.0233744915979594e-05, |
| "loss": 3.0687, |
| "num_input_tokens_seen": 1758800, |
| "step": 2345 |
| }, |
| { |
| "epoch": 3.6321483771251932, |
| "grad_norm": 0.49976831674575806, |
| "learning_rate": 4.01802168248425e-05, |
| "loss": 3.1762, |
| "num_input_tokens_seen": 1762832, |
| "step": 2350 |
| }, |
| { |
| "epoch": 3.6398763523956723, |
| "grad_norm": 0.3384222984313965, |
| "learning_rate": 4.012657826891185e-05, |
| "loss": 2.937, |
| "num_input_tokens_seen": 1766672, |
| "step": 2355 |
| }, |
| { |
| "epoch": 3.6476043276661514, |
| "grad_norm": 0.36615273356437683, |
| "learning_rate": 4.00728296385096e-05, |
| "loss": 3.0035, |
| "num_input_tokens_seen": 1770256, |
| "step": 2360 |
| }, |
| { |
| "epoch": 3.6553323029366305, |
| "grad_norm": 0.48593974113464355, |
| "learning_rate": 4.0018971324758705e-05, |
| "loss": 2.7236, |
| "num_input_tokens_seen": 1774224, |
| "step": 2365 |
| }, |
| { |
| "epoch": 3.6630602782071096, |
| "grad_norm": 0.3047850728034973, |
| "learning_rate": 3.996500371958028e-05, |
| "loss": 3.2167, |
| "num_input_tokens_seen": 1777616, |
| "step": 2370 |
| }, |
| { |
| "epoch": 3.670788253477589, |
| "grad_norm": 0.43760672211647034, |
| "learning_rate": 3.991092721569075e-05, |
| "loss": 2.9141, |
| "num_input_tokens_seen": 1781648, |
| "step": 2375 |
| }, |
| { |
| "epoch": 3.678516228748068, |
| "grad_norm": 0.289705753326416, |
| "learning_rate": 3.985674220659898e-05, |
| "loss": 3.1258, |
| "num_input_tokens_seen": 1785488, |
| "step": 2380 |
| }, |
| { |
| "epoch": 3.6862442040185472, |
| "grad_norm": 0.2379242330789566, |
| "learning_rate": 3.980244908660341e-05, |
| "loss": 2.8883, |
| "num_input_tokens_seen": 1789456, |
| "step": 2385 |
| }, |
| { |
| "epoch": 3.6939721792890263, |
| "grad_norm": 0.34431400895118713, |
| "learning_rate": 3.974804825078918e-05, |
| "loss": 3.2766, |
| "num_input_tokens_seen": 1793040, |
| "step": 2390 |
| }, |
| { |
| "epoch": 3.7017001545595054, |
| "grad_norm": 0.8246620297431946, |
| "learning_rate": 3.96935400950253e-05, |
| "loss": 2.8954, |
| "num_input_tokens_seen": 1796560, |
| "step": 2395 |
| }, |
| { |
| "epoch": 3.7094281298299845, |
| "grad_norm": 0.32684797048568726, |
| "learning_rate": 3.963892501596169e-05, |
| "loss": 2.9391, |
| "num_input_tokens_seen": 1800400, |
| "step": 2400 |
| }, |
| { |
| "epoch": 3.7171561051004636, |
| "grad_norm": 0.4404638409614563, |
| "learning_rate": 3.958420341102639e-05, |
| "loss": 3.003, |
| "num_input_tokens_seen": 1804048, |
| "step": 2405 |
| }, |
| { |
| "epoch": 3.7248840803709427, |
| "grad_norm": 0.5552765130996704, |
| "learning_rate": 3.9529375678422575e-05, |
| "loss": 3.0192, |
| "num_input_tokens_seen": 1807568, |
| "step": 2410 |
| }, |
| { |
| "epoch": 3.7326120556414217, |
| "grad_norm": 0.39639389514923096, |
| "learning_rate": 3.9474442217125726e-05, |
| "loss": 2.9807, |
| "num_input_tokens_seen": 1811280, |
| "step": 2415 |
| }, |
| { |
| "epoch": 3.7403400309119013, |
| "grad_norm": 0.280390202999115, |
| "learning_rate": 3.9419403426880684e-05, |
| "loss": 3.2511, |
| "num_input_tokens_seen": 1814992, |
| "step": 2420 |
| }, |
| { |
| "epoch": 3.7480680061823803, |
| "grad_norm": 0.4016093313694, |
| "learning_rate": 3.936425970819877e-05, |
| "loss": 2.9766, |
| "num_input_tokens_seen": 1818832, |
| "step": 2425 |
| }, |
| { |
| "epoch": 3.7557959814528594, |
| "grad_norm": 0.520076334476471, |
| "learning_rate": 3.930901146235485e-05, |
| "loss": 2.9149, |
| "num_input_tokens_seen": 1822288, |
| "step": 2430 |
| }, |
| { |
| "epoch": 3.7635239567233385, |
| "grad_norm": 0.3622855246067047, |
| "learning_rate": 3.925365909138443e-05, |
| "loss": 2.9592, |
| "num_input_tokens_seen": 1826192, |
| "step": 2435 |
| }, |
| { |
| "epoch": 3.7712519319938176, |
| "grad_norm": 0.3980333209037781, |
| "learning_rate": 3.91982029980807e-05, |
| "loss": 2.8414, |
| "num_input_tokens_seen": 1829968, |
| "step": 2440 |
| }, |
| { |
| "epoch": 3.7789799072642967, |
| "grad_norm": 0.3101566433906555, |
| "learning_rate": 3.9142643585991655e-05, |
| "loss": 2.7509, |
| "num_input_tokens_seen": 1834000, |
| "step": 2445 |
| }, |
| { |
| "epoch": 3.7867078825347757, |
| "grad_norm": 0.6647924780845642, |
| "learning_rate": 3.908698125941713e-05, |
| "loss": 2.9496, |
| "num_input_tokens_seen": 1838160, |
| "step": 2450 |
| }, |
| { |
| "epoch": 3.7944358578052553, |
| "grad_norm": 0.2631145417690277, |
| "learning_rate": 3.903121642340583e-05, |
| "loss": 2.9745, |
| "num_input_tokens_seen": 1841872, |
| "step": 2455 |
| }, |
| { |
| "epoch": 3.802163833075734, |
| "grad_norm": 0.352568119764328, |
| "learning_rate": 3.8975349483752436e-05, |
| "loss": 2.9618, |
| "num_input_tokens_seen": 1845776, |
| "step": 2460 |
| }, |
| { |
| "epoch": 3.8098918083462134, |
| "grad_norm": 0.39804670214653015, |
| "learning_rate": 3.8919380846994605e-05, |
| "loss": 3.1174, |
| "num_input_tokens_seen": 1849552, |
| "step": 2465 |
| }, |
| { |
| "epoch": 3.8176197836166925, |
| "grad_norm": 0.4087231755256653, |
| "learning_rate": 3.8863310920410055e-05, |
| "loss": 2.437, |
| "num_input_tokens_seen": 1853008, |
| "step": 2470 |
| }, |
| { |
| "epoch": 3.8253477588871716, |
| "grad_norm": 0.506405234336853, |
| "learning_rate": 3.8807140112013574e-05, |
| "loss": 3.0171, |
| "num_input_tokens_seen": 1856272, |
| "step": 2475 |
| }, |
| { |
| "epoch": 3.8330757341576507, |
| "grad_norm": 0.41620808839797974, |
| "learning_rate": 3.875086883055403e-05, |
| "loss": 3.0946, |
| "num_input_tokens_seen": 1859920, |
| "step": 2480 |
| }, |
| { |
| "epoch": 3.8408037094281298, |
| "grad_norm": 0.3406372368335724, |
| "learning_rate": 3.869449748551146e-05, |
| "loss": 3.0194, |
| "num_input_tokens_seen": 1863632, |
| "step": 2485 |
| }, |
| { |
| "epoch": 3.848531684698609, |
| "grad_norm": 0.4365371763706207, |
| "learning_rate": 3.863802648709404e-05, |
| "loss": 2.8842, |
| "num_input_tokens_seen": 1867088, |
| "step": 2490 |
| }, |
| { |
| "epoch": 3.856259659969088, |
| "grad_norm": 0.5339605808258057, |
| "learning_rate": 3.858145624623509e-05, |
| "loss": 2.9492, |
| "num_input_tokens_seen": 1870672, |
| "step": 2495 |
| }, |
| { |
| "epoch": 3.8639876352395675, |
| "grad_norm": 0.2724621295928955, |
| "learning_rate": 3.852478717459014e-05, |
| "loss": 2.9462, |
| "num_input_tokens_seen": 1874384, |
| "step": 2500 |
| }, |
| { |
| "epoch": 3.871715610510046, |
| "grad_norm": 0.36727994680404663, |
| "learning_rate": 3.8468019684533875e-05, |
| "loss": 2.8771, |
| "num_input_tokens_seen": 1878096, |
| "step": 2505 |
| }, |
| { |
| "epoch": 3.8794435857805256, |
| "grad_norm": 0.2990911900997162, |
| "learning_rate": 3.8411154189157185e-05, |
| "loss": 3.0987, |
| "num_input_tokens_seen": 1881936, |
| "step": 2510 |
| }, |
| { |
| "epoch": 3.8871715610510047, |
| "grad_norm": 0.32814911007881165, |
| "learning_rate": 3.8354191102264105e-05, |
| "loss": 3.1755, |
| "num_input_tokens_seen": 1885520, |
| "step": 2515 |
| }, |
| { |
| "epoch": 3.894899536321484, |
| "grad_norm": 0.47634440660476685, |
| "learning_rate": 3.829713083836886e-05, |
| "loss": 3.073, |
| "num_input_tokens_seen": 1889488, |
| "step": 2520 |
| }, |
| { |
| "epoch": 3.902627511591963, |
| "grad_norm": 0.39127570390701294, |
| "learning_rate": 3.82399738126928e-05, |
| "loss": 3.1232, |
| "num_input_tokens_seen": 1893072, |
| "step": 2525 |
| }, |
| { |
| "epoch": 3.910355486862442, |
| "grad_norm": 0.366414338350296, |
| "learning_rate": 3.818272044116142e-05, |
| "loss": 3.0161, |
| "num_input_tokens_seen": 1896912, |
| "step": 2530 |
| }, |
| { |
| "epoch": 3.918083462132921, |
| "grad_norm": 0.5952576994895935, |
| "learning_rate": 3.812537114040131e-05, |
| "loss": 2.7545, |
| "num_input_tokens_seen": 1900432, |
| "step": 2535 |
| }, |
| { |
| "epoch": 3.9258114374034, |
| "grad_norm": 0.3024425208568573, |
| "learning_rate": 3.806792632773709e-05, |
| "loss": 2.9895, |
| "num_input_tokens_seen": 1904656, |
| "step": 2540 |
| }, |
| { |
| "epoch": 3.9335394126738796, |
| "grad_norm": 0.3352251648902893, |
| "learning_rate": 3.801038642118847e-05, |
| "loss": 2.9207, |
| "num_input_tokens_seen": 1908176, |
| "step": 2545 |
| }, |
| { |
| "epoch": 3.9412673879443587, |
| "grad_norm": 0.37967225909233093, |
| "learning_rate": 3.7952751839467106e-05, |
| "loss": 2.8346, |
| "num_input_tokens_seen": 1911568, |
| "step": 2550 |
| }, |
| { |
| "epoch": 3.948995363214838, |
| "grad_norm": 0.3634495735168457, |
| "learning_rate": 3.78950230019736e-05, |
| "loss": 2.6807, |
| "num_input_tokens_seen": 1915408, |
| "step": 2555 |
| }, |
| { |
| "epoch": 3.956723338485317, |
| "grad_norm": 0.5400171279907227, |
| "learning_rate": 3.783720032879445e-05, |
| "loss": 2.5507, |
| "num_input_tokens_seen": 1919376, |
| "step": 2560 |
| }, |
| { |
| "epoch": 3.964451313755796, |
| "grad_norm": 0.6465855836868286, |
| "learning_rate": 3.7779284240699003e-05, |
| "loss": 2.9413, |
| "num_input_tokens_seen": 1922960, |
| "step": 2565 |
| }, |
| { |
| "epoch": 3.972179289026275, |
| "grad_norm": 0.34273695945739746, |
| "learning_rate": 3.772127515913634e-05, |
| "loss": 2.7105, |
| "num_input_tokens_seen": 1926544, |
| "step": 2570 |
| }, |
| { |
| "epoch": 3.979907264296754, |
| "grad_norm": 0.37591809034347534, |
| "learning_rate": 3.766317350623227e-05, |
| "loss": 2.7055, |
| "num_input_tokens_seen": 1929744, |
| "step": 2575 |
| }, |
| { |
| "epoch": 3.9876352395672336, |
| "grad_norm": 0.39454731345176697, |
| "learning_rate": 3.760497970478624e-05, |
| "loss": 2.7457, |
| "num_input_tokens_seen": 1933520, |
| "step": 2580 |
| }, |
| { |
| "epoch": 3.9953632148377123, |
| "grad_norm": 0.453294038772583, |
| "learning_rate": 3.7546694178268215e-05, |
| "loss": 3.2118, |
| "num_input_tokens_seen": 1937360, |
| "step": 2585 |
| }, |
| { |
| "epoch": 4.003091190108192, |
| "grad_norm": 0.38035985827445984, |
| "learning_rate": 3.7488317350815674e-05, |
| "loss": 2.7815, |
| "num_input_tokens_seen": 1941552, |
| "step": 2590 |
| }, |
| { |
| "epoch": 4.006182380216384, |
| "eval_loss": 2.9035558700561523, |
| "eval_runtime": 9.8504, |
| "eval_samples_per_second": 58.373, |
| "eval_steps_per_second": 7.309, |
| "num_input_tokens_seen": 1942960, |
| "step": 2592 |
| }, |
| { |
| "epoch": 4.0108191653786704, |
| "grad_norm": 0.3568049371242523, |
| "learning_rate": 3.742984964723047e-05, |
| "loss": 3.0167, |
| "num_input_tokens_seen": 1945200, |
| "step": 2595 |
| }, |
| { |
| "epoch": 4.01854714064915, |
| "grad_norm": 0.3261895477771759, |
| "learning_rate": 3.737129149297574e-05, |
| "loss": 2.8763, |
| "num_input_tokens_seen": 1949168, |
| "step": 2600 |
| }, |
| { |
| "epoch": 4.0262751159196295, |
| "grad_norm": 0.6526939868927002, |
| "learning_rate": 3.731264331417284e-05, |
| "loss": 2.7011, |
| "num_input_tokens_seen": 1952624, |
| "step": 2605 |
| }, |
| { |
| "epoch": 4.034003091190108, |
| "grad_norm": 0.3340992331504822, |
| "learning_rate": 3.72539055375982e-05, |
| "loss": 2.7641, |
| "num_input_tokens_seen": 1956912, |
| "step": 2610 |
| }, |
| { |
| "epoch": 4.041731066460588, |
| "grad_norm": 0.2900647521018982, |
| "learning_rate": 3.7195078590680275e-05, |
| "loss": 2.9773, |
| "num_input_tokens_seen": 1960688, |
| "step": 2615 |
| }, |
| { |
| "epoch": 4.049459041731066, |
| "grad_norm": 0.45488232374191284, |
| "learning_rate": 3.713616290149636e-05, |
| "loss": 2.8954, |
| "num_input_tokens_seen": 1964272, |
| "step": 2620 |
| }, |
| { |
| "epoch": 4.057187017001546, |
| "grad_norm": 0.3596618175506592, |
| "learning_rate": 3.7077158898769574e-05, |
| "loss": 2.9347, |
| "num_input_tokens_seen": 1967792, |
| "step": 2625 |
| }, |
| { |
| "epoch": 4.0649149922720245, |
| "grad_norm": 0.3053226172924042, |
| "learning_rate": 3.701806701186563e-05, |
| "loss": 2.6642, |
| "num_input_tokens_seen": 1971312, |
| "step": 2630 |
| }, |
| { |
| "epoch": 4.072642967542504, |
| "grad_norm": 0.32128965854644775, |
| "learning_rate": 3.695888767078981e-05, |
| "loss": 3.0455, |
| "num_input_tokens_seen": 1975152, |
| "step": 2635 |
| }, |
| { |
| "epoch": 4.080370942812983, |
| "grad_norm": 0.3554864525794983, |
| "learning_rate": 3.6899621306183754e-05, |
| "loss": 3.1219, |
| "num_input_tokens_seen": 1978928, |
| "step": 2640 |
| }, |
| { |
| "epoch": 4.088098918083462, |
| "grad_norm": 0.2976425588130951, |
| "learning_rate": 3.684026834932238e-05, |
| "loss": 2.6695, |
| "num_input_tokens_seen": 1982256, |
| "step": 2645 |
| }, |
| { |
| "epoch": 4.095826893353942, |
| "grad_norm": 0.42317289113998413, |
| "learning_rate": 3.678082923211072e-05, |
| "loss": 2.8639, |
| "num_input_tokens_seen": 1986224, |
| "step": 2650 |
| }, |
| { |
| "epoch": 4.10355486862442, |
| "grad_norm": 0.440965861082077, |
| "learning_rate": 3.6721304387080804e-05, |
| "loss": 2.8947, |
| "num_input_tokens_seen": 1990064, |
| "step": 2655 |
| }, |
| { |
| "epoch": 4.1112828438949, |
| "grad_norm": 0.7069739699363708, |
| "learning_rate": 3.666169424738848e-05, |
| "loss": 2.8494, |
| "num_input_tokens_seen": 1993968, |
| "step": 2660 |
| }, |
| { |
| "epoch": 4.1190108191653785, |
| "grad_norm": 0.4947892725467682, |
| "learning_rate": 3.660199924681027e-05, |
| "loss": 3.0113, |
| "num_input_tokens_seen": 1997808, |
| "step": 2665 |
| }, |
| { |
| "epoch": 4.126738794435858, |
| "grad_norm": 0.4745485186576843, |
| "learning_rate": 3.6542219819740234e-05, |
| "loss": 3.0485, |
| "num_input_tokens_seen": 2001520, |
| "step": 2670 |
| }, |
| { |
| "epoch": 4.134466769706337, |
| "grad_norm": 0.4884566068649292, |
| "learning_rate": 3.648235640118678e-05, |
| "loss": 2.9497, |
| "num_input_tokens_seen": 2005232, |
| "step": 2675 |
| }, |
| { |
| "epoch": 4.142194744976816, |
| "grad_norm": 0.38576316833496094, |
| "learning_rate": 3.642240942676953e-05, |
| "loss": 2.7742, |
| "num_input_tokens_seen": 2009328, |
| "step": 2680 |
| }, |
| { |
| "epoch": 4.149922720247295, |
| "grad_norm": 0.39867785573005676, |
| "learning_rate": 3.6362379332716126e-05, |
| "loss": 2.7951, |
| "num_input_tokens_seen": 2012912, |
| "step": 2685 |
| }, |
| { |
| "epoch": 4.157650695517774, |
| "grad_norm": 0.36217308044433594, |
| "learning_rate": 3.630226655585904e-05, |
| "loss": 3.0995, |
| "num_input_tokens_seen": 2016944, |
| "step": 2690 |
| }, |
| { |
| "epoch": 4.165378670788254, |
| "grad_norm": 0.4282553791999817, |
| "learning_rate": 3.624207153363246e-05, |
| "loss": 2.8722, |
| "num_input_tokens_seen": 2020912, |
| "step": 2695 |
| }, |
| { |
| "epoch": 4.1731066460587325, |
| "grad_norm": 0.44885340332984924, |
| "learning_rate": 3.6181794704069036e-05, |
| "loss": 2.7546, |
| "num_input_tokens_seen": 2025136, |
| "step": 2700 |
| }, |
| { |
| "epoch": 4.180834621329212, |
| "grad_norm": 0.4299675226211548, |
| "learning_rate": 3.612143650579673e-05, |
| "loss": 2.968, |
| "num_input_tokens_seen": 2029424, |
| "step": 2705 |
| }, |
| { |
| "epoch": 4.188562596599691, |
| "grad_norm": 0.31228068470954895, |
| "learning_rate": 3.606099737803559e-05, |
| "loss": 2.9026, |
| "num_input_tokens_seen": 2033328, |
| "step": 2710 |
| }, |
| { |
| "epoch": 4.19629057187017, |
| "grad_norm": 0.630832850933075, |
| "learning_rate": 3.600047776059464e-05, |
| "loss": 3.1547, |
| "num_input_tokens_seen": 2037040, |
| "step": 2715 |
| }, |
| { |
| "epoch": 4.204018547140649, |
| "grad_norm": 0.4609726369380951, |
| "learning_rate": 3.593987809386855e-05, |
| "loss": 2.8728, |
| "num_input_tokens_seen": 2041008, |
| "step": 2720 |
| }, |
| { |
| "epoch": 4.211746522411128, |
| "grad_norm": 0.4508463442325592, |
| "learning_rate": 3.5879198818834544e-05, |
| "loss": 3.0045, |
| "num_input_tokens_seen": 2044912, |
| "step": 2725 |
| }, |
| { |
| "epoch": 4.219474497681608, |
| "grad_norm": 0.3503614366054535, |
| "learning_rate": 3.581844037704914e-05, |
| "loss": 3.0032, |
| "num_input_tokens_seen": 2048048, |
| "step": 2730 |
| }, |
| { |
| "epoch": 4.2272024729520865, |
| "grad_norm": 0.5175710916519165, |
| "learning_rate": 3.575760321064492e-05, |
| "loss": 3.0595, |
| "num_input_tokens_seen": 2051952, |
| "step": 2735 |
| }, |
| { |
| "epoch": 4.234930448222566, |
| "grad_norm": 0.5993461012840271, |
| "learning_rate": 3.569668776232737e-05, |
| "loss": 2.912, |
| "num_input_tokens_seen": 2055344, |
| "step": 2740 |
| }, |
| { |
| "epoch": 4.242658423493045, |
| "grad_norm": 0.5014939904212952, |
| "learning_rate": 3.563569447537161e-05, |
| "loss": 2.5098, |
| "num_input_tokens_seen": 2058864, |
| "step": 2745 |
| }, |
| { |
| "epoch": 4.250386398763524, |
| "grad_norm": 0.35828927159309387, |
| "learning_rate": 3.5574623793619164e-05, |
| "loss": 2.7504, |
| "num_input_tokens_seen": 2062512, |
| "step": 2750 |
| }, |
| { |
| "epoch": 4.258114374034003, |
| "grad_norm": 0.3931995928287506, |
| "learning_rate": 3.551347616147479e-05, |
| "loss": 2.7396, |
| "num_input_tokens_seen": 2065904, |
| "step": 2755 |
| }, |
| { |
| "epoch": 4.265842349304482, |
| "grad_norm": 0.5853766798973083, |
| "learning_rate": 3.5452252023903176e-05, |
| "loss": 2.6511, |
| "num_input_tokens_seen": 2069744, |
| "step": 2760 |
| }, |
| { |
| "epoch": 4.273570324574961, |
| "grad_norm": 0.47158560156822205, |
| "learning_rate": 3.539095182642573e-05, |
| "loss": 2.2498, |
| "num_input_tokens_seen": 2073584, |
| "step": 2765 |
| }, |
| { |
| "epoch": 4.2812982998454405, |
| "grad_norm": 0.6278407573699951, |
| "learning_rate": 3.532957601511736e-05, |
| "loss": 3.012, |
| "num_input_tokens_seen": 2077296, |
| "step": 2770 |
| }, |
| { |
| "epoch": 4.289026275115919, |
| "grad_norm": 0.4914918541908264, |
| "learning_rate": 3.52681250366032e-05, |
| "loss": 3.0169, |
| "num_input_tokens_seen": 2081264, |
| "step": 2775 |
| }, |
| { |
| "epoch": 4.296754250386399, |
| "grad_norm": 0.3581917881965637, |
| "learning_rate": 3.520659933805535e-05, |
| "loss": 2.8308, |
| "num_input_tokens_seen": 2085296, |
| "step": 2780 |
| }, |
| { |
| "epoch": 4.304482225656878, |
| "grad_norm": 0.4669550359249115, |
| "learning_rate": 3.514499936718966e-05, |
| "loss": 2.9323, |
| "num_input_tokens_seen": 2089456, |
| "step": 2785 |
| }, |
| { |
| "epoch": 4.312210200927357, |
| "grad_norm": 0.619138777256012, |
| "learning_rate": 3.508332557226246e-05, |
| "loss": 2.3913, |
| "num_input_tokens_seen": 2093104, |
| "step": 2790 |
| }, |
| { |
| "epoch": 4.319938176197836, |
| "grad_norm": 0.44549477100372314, |
| "learning_rate": 3.502157840206725e-05, |
| "loss": 2.9869, |
| "num_input_tokens_seen": 2096496, |
| "step": 2795 |
| }, |
| { |
| "epoch": 4.327666151468315, |
| "grad_norm": 0.5536242127418518, |
| "learning_rate": 3.4959758305931525e-05, |
| "loss": 2.8984, |
| "num_input_tokens_seen": 2100272, |
| "step": 2800 |
| }, |
| { |
| "epoch": 4.3353941267387945, |
| "grad_norm": 0.34818941354751587, |
| "learning_rate": 3.489786573371341e-05, |
| "loss": 2.9021, |
| "num_input_tokens_seen": 2104048, |
| "step": 2805 |
| }, |
| { |
| "epoch": 4.343122102009273, |
| "grad_norm": 0.4407016336917877, |
| "learning_rate": 3.4835901135798456e-05, |
| "loss": 2.8693, |
| "num_input_tokens_seen": 2107760, |
| "step": 2810 |
| }, |
| { |
| "epoch": 4.350850077279753, |
| "grad_norm": 0.38318338990211487, |
| "learning_rate": 3.4773864963096326e-05, |
| "loss": 2.4316, |
| "num_input_tokens_seen": 2111216, |
| "step": 2815 |
| }, |
| { |
| "epoch": 4.358578052550232, |
| "grad_norm": 0.39572352170944214, |
| "learning_rate": 3.4711757667037536e-05, |
| "loss": 2.7168, |
| "num_input_tokens_seen": 2115312, |
| "step": 2820 |
| }, |
| { |
| "epoch": 4.366306027820711, |
| "grad_norm": 0.5528718829154968, |
| "learning_rate": 3.464957969957015e-05, |
| "loss": 2.8171, |
| "num_input_tokens_seen": 2118640, |
| "step": 2825 |
| }, |
| { |
| "epoch": 4.37403400309119, |
| "grad_norm": 0.44296079874038696, |
| "learning_rate": 3.45873315131565e-05, |
| "loss": 2.8152, |
| "num_input_tokens_seen": 2122416, |
| "step": 2830 |
| }, |
| { |
| "epoch": 4.381761978361669, |
| "grad_norm": 0.3478895425796509, |
| "learning_rate": 3.45250135607699e-05, |
| "loss": 2.5164, |
| "num_input_tokens_seen": 2125936, |
| "step": 2835 |
| }, |
| { |
| "epoch": 4.3894899536321486, |
| "grad_norm": 0.4040631055831909, |
| "learning_rate": 3.4462626295891325e-05, |
| "loss": 2.7903, |
| "num_input_tokens_seen": 2129520, |
| "step": 2840 |
| }, |
| { |
| "epoch": 4.397217928902627, |
| "grad_norm": 0.33225977420806885, |
| "learning_rate": 3.440017017250616e-05, |
| "loss": 2.8935, |
| "num_input_tokens_seen": 2134256, |
| "step": 2845 |
| }, |
| { |
| "epoch": 4.404945904173107, |
| "grad_norm": 0.4490237236022949, |
| "learning_rate": 3.433764564510085e-05, |
| "loss": 2.819, |
| "num_input_tokens_seen": 2138096, |
| "step": 2850 |
| }, |
| { |
| "epoch": 4.412673879443586, |
| "grad_norm": 0.40184807777404785, |
| "learning_rate": 3.427505316865961e-05, |
| "loss": 2.8849, |
| "num_input_tokens_seen": 2142128, |
| "step": 2855 |
| }, |
| { |
| "epoch": 4.420401854714065, |
| "grad_norm": 0.38390636444091797, |
| "learning_rate": 3.4212393198661094e-05, |
| "loss": 2.6071, |
| "num_input_tokens_seen": 2145904, |
| "step": 2860 |
| }, |
| { |
| "epoch": 4.428129829984544, |
| "grad_norm": 0.42832306027412415, |
| "learning_rate": 3.414966619107514e-05, |
| "loss": 2.3911, |
| "num_input_tokens_seen": 2149680, |
| "step": 2865 |
| }, |
| { |
| "epoch": 4.435857805255023, |
| "grad_norm": 0.6270868182182312, |
| "learning_rate": 3.408687260235935e-05, |
| "loss": 2.644, |
| "num_input_tokens_seen": 2153392, |
| "step": 2870 |
| }, |
| { |
| "epoch": 4.443585780525503, |
| "grad_norm": 0.41972818970680237, |
| "learning_rate": 3.402401288945591e-05, |
| "loss": 2.5729, |
| "num_input_tokens_seen": 2157360, |
| "step": 2875 |
| }, |
| { |
| "epoch": 4.451313755795981, |
| "grad_norm": 0.5714272260665894, |
| "learning_rate": 3.396108750978813e-05, |
| "loss": 2.942, |
| "num_input_tokens_seen": 2161584, |
| "step": 2880 |
| }, |
| { |
| "epoch": 4.459041731066461, |
| "grad_norm": 0.36096087098121643, |
| "learning_rate": 3.389809692125717e-05, |
| "loss": 2.6037, |
| "num_input_tokens_seen": 2165808, |
| "step": 2885 |
| }, |
| { |
| "epoch": 4.466769706336939, |
| "grad_norm": 0.49124354124069214, |
| "learning_rate": 3.3835041582238734e-05, |
| "loss": 2.7337, |
| "num_input_tokens_seen": 2169840, |
| "step": 2890 |
| }, |
| { |
| "epoch": 4.474497681607419, |
| "grad_norm": 0.4930627942085266, |
| "learning_rate": 3.377192195157968e-05, |
| "loss": 2.5801, |
| "num_input_tokens_seen": 2173488, |
| "step": 2895 |
| }, |
| { |
| "epoch": 4.4822256568778975, |
| "grad_norm": 0.6838205456733704, |
| "learning_rate": 3.370873848859473e-05, |
| "loss": 2.5378, |
| "num_input_tokens_seen": 2177008, |
| "step": 2900 |
| }, |
| { |
| "epoch": 4.489953632148377, |
| "grad_norm": 0.48879119753837585, |
| "learning_rate": 3.36454916530631e-05, |
| "loss": 2.8039, |
| "num_input_tokens_seen": 2180976, |
| "step": 2905 |
| }, |
| { |
| "epoch": 4.497681607418857, |
| "grad_norm": 0.5275672674179077, |
| "learning_rate": 3.358218190522516e-05, |
| "loss": 2.6789, |
| "num_input_tokens_seen": 2184432, |
| "step": 2910 |
| }, |
| { |
| "epoch": 4.505409582689335, |
| "grad_norm": 0.5114855170249939, |
| "learning_rate": 3.35188097057791e-05, |
| "loss": 2.7445, |
| "num_input_tokens_seen": 2188656, |
| "step": 2915 |
| }, |
| { |
| "epoch": 4.506955177743431, |
| "eval_loss": 2.695441246032715, |
| "eval_runtime": 9.8594, |
| "eval_samples_per_second": 58.32, |
| "eval_steps_per_second": 7.303, |
| "num_input_tokens_seen": 2189232, |
| "step": 2916 |
| }, |
| { |
| "epoch": 4.513137557959815, |
| "grad_norm": 0.38441339135169983, |
| "learning_rate": 3.345537551587753e-05, |
| "loss": 2.413, |
| "num_input_tokens_seen": 2191984, |
| "step": 2920 |
| }, |
| { |
| "epoch": 4.520865533230293, |
| "grad_norm": 0.3344990909099579, |
| "learning_rate": 3.33918797971242e-05, |
| "loss": 2.7369, |
| "num_input_tokens_seen": 2195632, |
| "step": 2925 |
| }, |
| { |
| "epoch": 4.528593508500773, |
| "grad_norm": 0.35520127415657043, |
| "learning_rate": 3.332832301157056e-05, |
| "loss": 2.7584, |
| "num_input_tokens_seen": 2199344, |
| "step": 2930 |
| }, |
| { |
| "epoch": 4.5363214837712516, |
| "grad_norm": 0.6611173152923584, |
| "learning_rate": 3.326470562171246e-05, |
| "loss": 2.681, |
| "num_input_tokens_seen": 2203248, |
| "step": 2935 |
| }, |
| { |
| "epoch": 4.544049459041731, |
| "grad_norm": 0.7467407584190369, |
| "learning_rate": 3.320102809048676e-05, |
| "loss": 2.5338, |
| "num_input_tokens_seen": 2207024, |
| "step": 2940 |
| }, |
| { |
| "epoch": 4.551777434312211, |
| "grad_norm": 0.3933999240398407, |
| "learning_rate": 3.313729088126796e-05, |
| "loss": 2.5062, |
| "num_input_tokens_seen": 2211056, |
| "step": 2945 |
| }, |
| { |
| "epoch": 4.559505409582689, |
| "grad_norm": 0.5209619998931885, |
| "learning_rate": 3.307349445786481e-05, |
| "loss": 2.5035, |
| "num_input_tokens_seen": 2214512, |
| "step": 2950 |
| }, |
| { |
| "epoch": 4.567233384853169, |
| "grad_norm": 0.4052325189113617, |
| "learning_rate": 3.300963928451699e-05, |
| "loss": 2.6405, |
| "num_input_tokens_seen": 2217968, |
| "step": 2955 |
| }, |
| { |
| "epoch": 4.574961360123647, |
| "grad_norm": 0.37696200609207153, |
| "learning_rate": 3.2945725825891676e-05, |
| "loss": 2.5377, |
| "num_input_tokens_seen": 2222192, |
| "step": 2960 |
| }, |
| { |
| "epoch": 4.582689335394127, |
| "grad_norm": 0.5208265781402588, |
| "learning_rate": 3.288175454708017e-05, |
| "loss": 2.6624, |
| "num_input_tokens_seen": 2226096, |
| "step": 2965 |
| }, |
| { |
| "epoch": 4.590417310664606, |
| "grad_norm": 0.5913110375404358, |
| "learning_rate": 3.281772591359457e-05, |
| "loss": 2.7047, |
| "num_input_tokens_seen": 2230128, |
| "step": 2970 |
| }, |
| { |
| "epoch": 4.598145285935085, |
| "grad_norm": 0.5141699314117432, |
| "learning_rate": 3.2753640391364276e-05, |
| "loss": 2.8835, |
| "num_input_tokens_seen": 2234672, |
| "step": 2975 |
| }, |
| { |
| "epoch": 4.605873261205565, |
| "grad_norm": 0.5090105533599854, |
| "learning_rate": 3.2689498446732705e-05, |
| "loss": 2.6953, |
| "num_input_tokens_seen": 2238256, |
| "step": 2980 |
| }, |
| { |
| "epoch": 4.613601236476043, |
| "grad_norm": 0.4541560709476471, |
| "learning_rate": 3.262530054645384e-05, |
| "loss": 2.7004, |
| "num_input_tokens_seen": 2242032, |
| "step": 2985 |
| }, |
| { |
| "epoch": 4.621329211746523, |
| "grad_norm": 0.49343761801719666, |
| "learning_rate": 3.256104715768885e-05, |
| "loss": 2.6817, |
| "num_input_tokens_seen": 2245488, |
| "step": 2990 |
| }, |
| { |
| "epoch": 4.629057187017001, |
| "grad_norm": 0.7500348687171936, |
| "learning_rate": 3.249673874800267e-05, |
| "loss": 2.2831, |
| "num_input_tokens_seen": 2249520, |
| "step": 2995 |
| }, |
| { |
| "epoch": 4.636785162287481, |
| "grad_norm": 0.41946667432785034, |
| "learning_rate": 3.2432375785360644e-05, |
| "loss": 2.6983, |
| "num_input_tokens_seen": 2253168, |
| "step": 3000 |
| }, |
| { |
| "epoch": 4.64451313755796, |
| "grad_norm": 0.4098086655139923, |
| "learning_rate": 3.236795873812509e-05, |
| "loss": 2.4214, |
| "num_input_tokens_seen": 2257328, |
| "step": 3005 |
| }, |
| { |
| "epoch": 4.652241112828439, |
| "grad_norm": 0.3378872275352478, |
| "learning_rate": 3.230348807505186e-05, |
| "loss": 2.7132, |
| "num_input_tokens_seen": 2260528, |
| "step": 3010 |
| }, |
| { |
| "epoch": 4.659969088098918, |
| "grad_norm": 0.4678763449192047, |
| "learning_rate": 3.223896426528701e-05, |
| "loss": 2.5183, |
| "num_input_tokens_seen": 2263920, |
| "step": 3015 |
| }, |
| { |
| "epoch": 4.667697063369397, |
| "grad_norm": 0.3910799026489258, |
| "learning_rate": 3.217438777836329e-05, |
| "loss": 2.4753, |
| "num_input_tokens_seen": 2267824, |
| "step": 3020 |
| }, |
| { |
| "epoch": 4.675425038639876, |
| "grad_norm": 0.3688734471797943, |
| "learning_rate": 3.210975908419682e-05, |
| "loss": 2.2819, |
| "num_input_tokens_seen": 2271408, |
| "step": 3025 |
| }, |
| { |
| "epoch": 4.683153013910355, |
| "grad_norm": 0.4956994950771332, |
| "learning_rate": 3.2045078653083594e-05, |
| "loss": 2.5683, |
| "num_input_tokens_seen": 2275120, |
| "step": 3030 |
| }, |
| { |
| "epoch": 4.690880989180835, |
| "grad_norm": 0.4585415720939636, |
| "learning_rate": 3.1980346955696116e-05, |
| "loss": 2.5434, |
| "num_input_tokens_seen": 2278576, |
| "step": 3035 |
| }, |
| { |
| "epoch": 4.698608964451314, |
| "grad_norm": 0.4156535863876343, |
| "learning_rate": 3.191556446307993e-05, |
| "loss": 2.5429, |
| "num_input_tokens_seen": 2281712, |
| "step": 3040 |
| }, |
| { |
| "epoch": 4.706336939721793, |
| "grad_norm": 0.40493646264076233, |
| "learning_rate": 3.1850731646650215e-05, |
| "loss": 2.4746, |
| "num_input_tokens_seen": 2285296, |
| "step": 3045 |
| }, |
| { |
| "epoch": 4.714064914992272, |
| "grad_norm": 0.3802744448184967, |
| "learning_rate": 3.178584897818836e-05, |
| "loss": 2.8662, |
| "num_input_tokens_seen": 2288688, |
| "step": 3050 |
| }, |
| { |
| "epoch": 4.721792890262751, |
| "grad_norm": 0.40369701385498047, |
| "learning_rate": 3.172091692983851e-05, |
| "loss": 2.721, |
| "num_input_tokens_seen": 2292144, |
| "step": 3055 |
| }, |
| { |
| "epoch": 4.72952086553323, |
| "grad_norm": 0.5234618782997131, |
| "learning_rate": 3.165593597410414e-05, |
| "loss": 2.5668, |
| "num_input_tokens_seen": 2295792, |
| "step": 3060 |
| }, |
| { |
| "epoch": 4.7372488408037094, |
| "grad_norm": 0.418813556432724, |
| "learning_rate": 3.1590906583844644e-05, |
| "loss": 2.8402, |
| "num_input_tokens_seen": 2299440, |
| "step": 3065 |
| }, |
| { |
| "epoch": 4.744976816074189, |
| "grad_norm": 0.4814930558204651, |
| "learning_rate": 3.1525829232271845e-05, |
| "loss": 2.8945, |
| "num_input_tokens_seen": 2303280, |
| "step": 3070 |
| }, |
| { |
| "epoch": 4.752704791344668, |
| "grad_norm": 0.38368573784828186, |
| "learning_rate": 3.146070439294657e-05, |
| "loss": 2.4513, |
| "num_input_tokens_seen": 2307056, |
| "step": 3075 |
| }, |
| { |
| "epoch": 4.760432766615147, |
| "grad_norm": 0.49482211470603943, |
| "learning_rate": 3.1395532539775244e-05, |
| "loss": 2.5108, |
| "num_input_tokens_seen": 2309872, |
| "step": 3080 |
| }, |
| { |
| "epoch": 4.768160741885626, |
| "grad_norm": 0.629030704498291, |
| "learning_rate": 3.1330314147006355e-05, |
| "loss": 2.5163, |
| "num_input_tokens_seen": 2313392, |
| "step": 3085 |
| }, |
| { |
| "epoch": 4.775888717156105, |
| "grad_norm": 0.3420495092868805, |
| "learning_rate": 3.126504968922711e-05, |
| "loss": 2.6241, |
| "num_input_tokens_seen": 2316976, |
| "step": 3090 |
| }, |
| { |
| "epoch": 4.783616692426584, |
| "grad_norm": 0.39348718523979187, |
| "learning_rate": 3.119973964135987e-05, |
| "loss": 2.6212, |
| "num_input_tokens_seen": 2320816, |
| "step": 3095 |
| }, |
| { |
| "epoch": 4.7913446676970635, |
| "grad_norm": 0.3372386395931244, |
| "learning_rate": 3.113438447865881e-05, |
| "loss": 2.3769, |
| "num_input_tokens_seen": 2324400, |
| "step": 3100 |
| }, |
| { |
| "epoch": 4.799072642967543, |
| "grad_norm": 0.4806166887283325, |
| "learning_rate": 3.1068984676706344e-05, |
| "loss": 2.6456, |
| "num_input_tokens_seen": 2327856, |
| "step": 3105 |
| }, |
| { |
| "epoch": 4.806800618238022, |
| "grad_norm": 0.5349667072296143, |
| "learning_rate": 3.100354071140977e-05, |
| "loss": 2.6822, |
| "num_input_tokens_seen": 2331824, |
| "step": 3110 |
| }, |
| { |
| "epoch": 4.814528593508501, |
| "grad_norm": 0.5503065586090088, |
| "learning_rate": 3.093805305899772e-05, |
| "loss": 2.5127, |
| "num_input_tokens_seen": 2335408, |
| "step": 3115 |
| }, |
| { |
| "epoch": 4.82225656877898, |
| "grad_norm": 0.4024689197540283, |
| "learning_rate": 3.0872522196016746e-05, |
| "loss": 2.6049, |
| "num_input_tokens_seen": 2339312, |
| "step": 3120 |
| }, |
| { |
| "epoch": 4.829984544049459, |
| "grad_norm": 0.462722510099411, |
| "learning_rate": 3.080694859932785e-05, |
| "loss": 2.5243, |
| "num_input_tokens_seen": 2342768, |
| "step": 3125 |
| }, |
| { |
| "epoch": 4.837712519319938, |
| "grad_norm": 0.43024778366088867, |
| "learning_rate": 3.074133274610297e-05, |
| "loss": 2.6725, |
| "num_input_tokens_seen": 2346544, |
| "step": 3130 |
| }, |
| { |
| "epoch": 4.8454404945904175, |
| "grad_norm": 0.5202332139015198, |
| "learning_rate": 3.06756751138216e-05, |
| "loss": 2.3939, |
| "num_input_tokens_seen": 2350640, |
| "step": 3135 |
| }, |
| { |
| "epoch": 4.853168469860896, |
| "grad_norm": 0.3638491928577423, |
| "learning_rate": 3.0609976180267186e-05, |
| "loss": 2.4518, |
| "num_input_tokens_seen": 2354160, |
| "step": 3140 |
| }, |
| { |
| "epoch": 4.860896445131376, |
| "grad_norm": 0.4176388680934906, |
| "learning_rate": 3.054423642352376e-05, |
| "loss": 2.5428, |
| "num_input_tokens_seen": 2357872, |
| "step": 3145 |
| }, |
| { |
| "epoch": 4.868624420401854, |
| "grad_norm": 0.4720919728279114, |
| "learning_rate": 3.0478456321972422e-05, |
| "loss": 2.4582, |
| "num_input_tokens_seen": 2361840, |
| "step": 3150 |
| }, |
| { |
| "epoch": 4.876352395672334, |
| "grad_norm": 0.47086870670318604, |
| "learning_rate": 3.0412636354287826e-05, |
| "loss": 2.437, |
| "num_input_tokens_seen": 2365488, |
| "step": 3155 |
| }, |
| { |
| "epoch": 4.884080370942813, |
| "grad_norm": 0.42605626583099365, |
| "learning_rate": 3.0346776999434774e-05, |
| "loss": 2.4077, |
| "num_input_tokens_seen": 2369200, |
| "step": 3160 |
| }, |
| { |
| "epoch": 4.891808346213292, |
| "grad_norm": 0.5135416388511658, |
| "learning_rate": 3.0280878736664632e-05, |
| "loss": 2.6054, |
| "num_input_tokens_seen": 2372976, |
| "step": 3165 |
| }, |
| { |
| "epoch": 4.8995363214837715, |
| "grad_norm": 0.519250214099884, |
| "learning_rate": 3.0214942045511933e-05, |
| "loss": 2.564, |
| "num_input_tokens_seen": 2377008, |
| "step": 3170 |
| }, |
| { |
| "epoch": 4.90726429675425, |
| "grad_norm": 0.33954399824142456, |
| "learning_rate": 3.014896740579084e-05, |
| "loss": 2.4686, |
| "num_input_tokens_seen": 2380464, |
| "step": 3175 |
| }, |
| { |
| "epoch": 4.91499227202473, |
| "grad_norm": 0.5407111048698425, |
| "learning_rate": 3.0082955297591646e-05, |
| "loss": 2.7348, |
| "num_input_tokens_seen": 2384432, |
| "step": 3180 |
| }, |
| { |
| "epoch": 4.922720247295208, |
| "grad_norm": 0.4390753209590912, |
| "learning_rate": 3.001690620127733e-05, |
| "loss": 2.4345, |
| "num_input_tokens_seen": 2388592, |
| "step": 3185 |
| }, |
| { |
| "epoch": 4.930448222565688, |
| "grad_norm": 0.5103745460510254, |
| "learning_rate": 2.9950820597479988e-05, |
| "loss": 2.4188, |
| "num_input_tokens_seen": 2392624, |
| "step": 3190 |
| }, |
| { |
| "epoch": 4.938176197836167, |
| "grad_norm": 0.4389425218105316, |
| "learning_rate": 2.9884698967097425e-05, |
| "loss": 2.2987, |
| "num_input_tokens_seen": 2396208, |
| "step": 3195 |
| }, |
| { |
| "epoch": 4.945904173106646, |
| "grad_norm": 0.7027727961540222, |
| "learning_rate": 2.9818541791289568e-05, |
| "loss": 2.4588, |
| "num_input_tokens_seen": 2399792, |
| "step": 3200 |
| }, |
| { |
| "epoch": 4.9536321483771255, |
| "grad_norm": 0.43571075797080994, |
| "learning_rate": 2.9752349551475028e-05, |
| "loss": 2.5056, |
| "num_input_tokens_seen": 2403952, |
| "step": 3205 |
| }, |
| { |
| "epoch": 4.961360123647604, |
| "grad_norm": 0.39576250314712524, |
| "learning_rate": 2.9686122729327565e-05, |
| "loss": 2.489, |
| "num_input_tokens_seen": 2407472, |
| "step": 3210 |
| }, |
| { |
| "epoch": 4.969088098918084, |
| "grad_norm": 0.5138210654258728, |
| "learning_rate": 2.961986180677258e-05, |
| "loss": 2.3268, |
| "num_input_tokens_seen": 2411248, |
| "step": 3215 |
| }, |
| { |
| "epoch": 4.976816074188562, |
| "grad_norm": 0.6068966388702393, |
| "learning_rate": 2.9553567265983634e-05, |
| "loss": 2.4227, |
| "num_input_tokens_seen": 2414960, |
| "step": 3220 |
| }, |
| { |
| "epoch": 4.984544049459042, |
| "grad_norm": 0.5577759742736816, |
| "learning_rate": 2.9487239589378923e-05, |
| "loss": 2.4349, |
| "num_input_tokens_seen": 2418800, |
| "step": 3225 |
| }, |
| { |
| "epoch": 4.992272024729521, |
| "grad_norm": 0.5744683146476746, |
| "learning_rate": 2.942087925961776e-05, |
| "loss": 2.2231, |
| "num_input_tokens_seen": 2422512, |
| "step": 3230 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.447038471698761, |
| "learning_rate": 2.9354486759597087e-05, |
| "loss": 2.5454, |
| "num_input_tokens_seen": 2426048, |
| "step": 3235 |
| }, |
| { |
| "epoch": 5.0077279752704795, |
| "grad_norm": 0.4673357903957367, |
| "learning_rate": 2.9288062572447926e-05, |
| "loss": 2.3198, |
| "num_input_tokens_seen": 2429824, |
| "step": 3240 |
| }, |
| { |
| "epoch": 5.0077279752704795, |
| "eval_loss": 2.511051893234253, |
| "eval_runtime": 9.8438, |
| "eval_samples_per_second": 58.412, |
| "eval_steps_per_second": 7.314, |
| "num_input_tokens_seen": 2429824, |
| "step": 3240 |
| }, |
| { |
| "epoch": 5.015455950540958, |
| "grad_norm": 0.4172385334968567, |
| "learning_rate": 2.9221607181531897e-05, |
| "loss": 2.5621, |
| "num_input_tokens_seen": 2433728, |
| "step": 3245 |
| }, |
| { |
| "epoch": 5.023183925811438, |
| "grad_norm": 0.40235933661460876, |
| "learning_rate": 2.915512107043767e-05, |
| "loss": 2.7036, |
| "num_input_tokens_seen": 2438208, |
| "step": 3250 |
| }, |
| { |
| "epoch": 5.030911901081916, |
| "grad_norm": 0.4346725046634674, |
| "learning_rate": 2.9088604722977487e-05, |
| "loss": 2.5252, |
| "num_input_tokens_seen": 2442048, |
| "step": 3255 |
| }, |
| { |
| "epoch": 5.038639876352396, |
| "grad_norm": 0.5125299096107483, |
| "learning_rate": 2.9022058623183603e-05, |
| "loss": 2.5612, |
| "num_input_tokens_seen": 2445760, |
| "step": 3260 |
| }, |
| { |
| "epoch": 5.0463678516228745, |
| "grad_norm": 0.4407518208026886, |
| "learning_rate": 2.895548325530477e-05, |
| "loss": 2.3247, |
| "num_input_tokens_seen": 2449280, |
| "step": 3265 |
| }, |
| { |
| "epoch": 5.054095826893354, |
| "grad_norm": 0.4180731475353241, |
| "learning_rate": 2.8888879103802735e-05, |
| "loss": 2.7282, |
| "num_input_tokens_seen": 2452736, |
| "step": 3270 |
| }, |
| { |
| "epoch": 5.061823802163833, |
| "grad_norm": 0.4604838788509369, |
| "learning_rate": 2.882224665334869e-05, |
| "loss": 2.4472, |
| "num_input_tokens_seen": 2456320, |
| "step": 3275 |
| }, |
| { |
| "epoch": 5.069551777434312, |
| "grad_norm": 0.4161360561847687, |
| "learning_rate": 2.8755586388819766e-05, |
| "loss": 2.4265, |
| "num_input_tokens_seen": 2460352, |
| "step": 3280 |
| }, |
| { |
| "epoch": 5.077279752704792, |
| "grad_norm": 0.3719094395637512, |
| "learning_rate": 2.8688898795295477e-05, |
| "loss": 2.3331, |
| "num_input_tokens_seen": 2463808, |
| "step": 3285 |
| }, |
| { |
| "epoch": 5.08500772797527, |
| "grad_norm": 0.5433674454689026, |
| "learning_rate": 2.8622184358054228e-05, |
| "loss": 2.4872, |
| "num_input_tokens_seen": 2467584, |
| "step": 3290 |
| }, |
| { |
| "epoch": 5.09273570324575, |
| "grad_norm": 0.4143747091293335, |
| "learning_rate": 2.855544356256975e-05, |
| "loss": 2.1259, |
| "num_input_tokens_seen": 2471552, |
| "step": 3295 |
| }, |
| { |
| "epoch": 5.1004636785162285, |
| "grad_norm": 0.5412753820419312, |
| "learning_rate": 2.8488676894507577e-05, |
| "loss": 2.3704, |
| "num_input_tokens_seen": 2475008, |
| "step": 3300 |
| }, |
| { |
| "epoch": 5.108191653786708, |
| "grad_norm": 0.4551786184310913, |
| "learning_rate": 2.842188483972153e-05, |
| "loss": 2.3857, |
| "num_input_tokens_seen": 2478272, |
| "step": 3305 |
| }, |
| { |
| "epoch": 5.115919629057187, |
| "grad_norm": 0.4686163365840912, |
| "learning_rate": 2.8355067884250147e-05, |
| "loss": 2.4618, |
| "num_input_tokens_seen": 2481984, |
| "step": 3310 |
| }, |
| { |
| "epoch": 5.123647604327666, |
| "grad_norm": 0.7603648900985718, |
| "learning_rate": 2.8288226514313177e-05, |
| "loss": 2.4902, |
| "num_input_tokens_seen": 2485696, |
| "step": 3315 |
| }, |
| { |
| "epoch": 5.131375579598146, |
| "grad_norm": 0.5762485265731812, |
| "learning_rate": 2.822136121630804e-05, |
| "loss": 2.2111, |
| "num_input_tokens_seen": 2489344, |
| "step": 3320 |
| }, |
| { |
| "epoch": 5.139103554868624, |
| "grad_norm": 0.5430300831794739, |
| "learning_rate": 2.815447247680626e-05, |
| "loss": 2.3911, |
| "num_input_tokens_seen": 2492864, |
| "step": 3325 |
| }, |
| { |
| "epoch": 5.146831530139104, |
| "grad_norm": 0.6499159336090088, |
| "learning_rate": 2.8087560782549944e-05, |
| "loss": 2.5143, |
| "num_input_tokens_seen": 2496448, |
| "step": 3330 |
| }, |
| { |
| "epoch": 5.1545595054095825, |
| "grad_norm": 0.5955728888511658, |
| "learning_rate": 2.8020626620448248e-05, |
| "loss": 2.6991, |
| "num_input_tokens_seen": 2500224, |
| "step": 3335 |
| }, |
| { |
| "epoch": 5.162287480680062, |
| "grad_norm": 0.3927296996116638, |
| "learning_rate": 2.7953670477573823e-05, |
| "loss": 2.5315, |
| "num_input_tokens_seen": 2503936, |
| "step": 3340 |
| }, |
| { |
| "epoch": 5.170015455950541, |
| "grad_norm": 0.4476173222064972, |
| "learning_rate": 2.788669284115926e-05, |
| "loss": 2.5084, |
| "num_input_tokens_seen": 2508096, |
| "step": 3345 |
| }, |
| { |
| "epoch": 5.17774343122102, |
| "grad_norm": 0.594795823097229, |
| "learning_rate": 2.7819694198593567e-05, |
| "loss": 2.523, |
| "num_input_tokens_seen": 2512000, |
| "step": 3350 |
| }, |
| { |
| "epoch": 5.185471406491499, |
| "grad_norm": 0.49172693490982056, |
| "learning_rate": 2.775267503741862e-05, |
| "loss": 2.702, |
| "num_input_tokens_seen": 2515968, |
| "step": 3355 |
| }, |
| { |
| "epoch": 5.193199381761978, |
| "grad_norm": 0.634707510471344, |
| "learning_rate": 2.768563584532558e-05, |
| "loss": 2.133, |
| "num_input_tokens_seen": 2519424, |
| "step": 3360 |
| }, |
| { |
| "epoch": 5.200927357032458, |
| "grad_norm": 0.37945079803466797, |
| "learning_rate": 2.7618577110151394e-05, |
| "loss": 2.2683, |
| "num_input_tokens_seen": 2523136, |
| "step": 3365 |
| }, |
| { |
| "epoch": 5.2086553323029365, |
| "grad_norm": 0.4269804060459137, |
| "learning_rate": 2.7551499319875212e-05, |
| "loss": 2.202, |
| "num_input_tokens_seen": 2526848, |
| "step": 3370 |
| }, |
| { |
| "epoch": 5.216383307573416, |
| "grad_norm": 0.4263918399810791, |
| "learning_rate": 2.748440296261485e-05, |
| "loss": 2.5293, |
| "num_input_tokens_seen": 2531072, |
| "step": 3375 |
| }, |
| { |
| "epoch": 5.224111282843895, |
| "grad_norm": 0.43470582365989685, |
| "learning_rate": 2.741728852662323e-05, |
| "loss": 2.3115, |
| "num_input_tokens_seen": 2534720, |
| "step": 3380 |
| }, |
| { |
| "epoch": 5.231839258114374, |
| "grad_norm": 0.39447730779647827, |
| "learning_rate": 2.735015650028484e-05, |
| "loss": 2.6058, |
| "num_input_tokens_seen": 2539264, |
| "step": 3385 |
| }, |
| { |
| "epoch": 5.239567233384853, |
| "grad_norm": 0.5679147839546204, |
| "learning_rate": 2.728300737211215e-05, |
| "loss": 2.3294, |
| "num_input_tokens_seen": 2542912, |
| "step": 3390 |
| }, |
| { |
| "epoch": 5.247295208655332, |
| "grad_norm": 0.39888083934783936, |
| "learning_rate": 2.7215841630742112e-05, |
| "loss": 2.2617, |
| "num_input_tokens_seen": 2546944, |
| "step": 3395 |
| }, |
| { |
| "epoch": 5.255023183925811, |
| "grad_norm": 0.41213181614875793, |
| "learning_rate": 2.714865976493253e-05, |
| "loss": 2.1931, |
| "num_input_tokens_seen": 2550528, |
| "step": 3400 |
| }, |
| { |
| "epoch": 5.2627511591962906, |
| "grad_norm": 0.4185974597930908, |
| "learning_rate": 2.708146226355858e-05, |
| "loss": 2.2597, |
| "num_input_tokens_seen": 2553920, |
| "step": 3405 |
| }, |
| { |
| "epoch": 5.27047913446677, |
| "grad_norm": 0.5793778300285339, |
| "learning_rate": 2.7014249615609194e-05, |
| "loss": 2.2198, |
| "num_input_tokens_seen": 2557696, |
| "step": 3410 |
| }, |
| { |
| "epoch": 5.278207109737249, |
| "grad_norm": 0.4878511130809784, |
| "learning_rate": 2.6947022310183528e-05, |
| "loss": 2.2857, |
| "num_input_tokens_seen": 2561792, |
| "step": 3415 |
| }, |
| { |
| "epoch": 5.285935085007728, |
| "grad_norm": 0.6548928022384644, |
| "learning_rate": 2.6879780836487412e-05, |
| "loss": 2.2111, |
| "num_input_tokens_seen": 2565440, |
| "step": 3420 |
| }, |
| { |
| "epoch": 5.293663060278207, |
| "grad_norm": 0.5079330801963806, |
| "learning_rate": 2.681252568382976e-05, |
| "loss": 2.5244, |
| "num_input_tokens_seen": 2569088, |
| "step": 3425 |
| }, |
| { |
| "epoch": 5.301391035548686, |
| "grad_norm": 0.4245012700557709, |
| "learning_rate": 2.6745257341619035e-05, |
| "loss": 2.1497, |
| "num_input_tokens_seen": 2572736, |
| "step": 3430 |
| }, |
| { |
| "epoch": 5.309119010819165, |
| "grad_norm": 0.570668637752533, |
| "learning_rate": 2.667797629935967e-05, |
| "loss": 2.5076, |
| "num_input_tokens_seen": 2576512, |
| "step": 3435 |
| }, |
| { |
| "epoch": 5.316846986089645, |
| "grad_norm": 0.4023292660713196, |
| "learning_rate": 2.6610683046648533e-05, |
| "loss": 2.2498, |
| "num_input_tokens_seen": 2579968, |
| "step": 3440 |
| }, |
| { |
| "epoch": 5.324574961360124, |
| "grad_norm": 0.5683576464653015, |
| "learning_rate": 2.654337807317132e-05, |
| "loss": 2.5712, |
| "num_input_tokens_seen": 2583936, |
| "step": 3445 |
| }, |
| { |
| "epoch": 5.332302936630603, |
| "grad_norm": 0.4126036465167999, |
| "learning_rate": 2.647606186869905e-05, |
| "loss": 2.4584, |
| "num_input_tokens_seen": 2587392, |
| "step": 3450 |
| }, |
| { |
| "epoch": 5.340030911901082, |
| "grad_norm": 0.42776116728782654, |
| "learning_rate": 2.6408734923084444e-05, |
| "loss": 2.3173, |
| "num_input_tokens_seen": 2591040, |
| "step": 3455 |
| }, |
| { |
| "epoch": 5.347758887171561, |
| "grad_norm": 0.4142238199710846, |
| "learning_rate": 2.6341397726258392e-05, |
| "loss": 2.2085, |
| "num_input_tokens_seen": 2594368, |
| "step": 3460 |
| }, |
| { |
| "epoch": 5.35548686244204, |
| "grad_norm": 0.4580320715904236, |
| "learning_rate": 2.6274050768226384e-05, |
| "loss": 2.3484, |
| "num_input_tokens_seen": 2598592, |
| "step": 3465 |
| }, |
| { |
| "epoch": 5.363214837712519, |
| "grad_norm": 0.37727484107017517, |
| "learning_rate": 2.620669453906493e-05, |
| "loss": 2.6041, |
| "num_input_tokens_seen": 2602496, |
| "step": 3470 |
| }, |
| { |
| "epoch": 5.370942812982999, |
| "grad_norm": 0.3904915452003479, |
| "learning_rate": 2.6139329528918016e-05, |
| "loss": 2.3611, |
| "num_input_tokens_seen": 2606016, |
| "step": 3475 |
| }, |
| { |
| "epoch": 5.378670788253477, |
| "grad_norm": 0.4388081729412079, |
| "learning_rate": 2.6071956227993538e-05, |
| "loss": 2.1859, |
| "num_input_tokens_seen": 2609600, |
| "step": 3480 |
| }, |
| { |
| "epoch": 5.386398763523957, |
| "grad_norm": 0.39128202199935913, |
| "learning_rate": 2.60045751265597e-05, |
| "loss": 2.2651, |
| "num_input_tokens_seen": 2613760, |
| "step": 3485 |
| }, |
| { |
| "epoch": 5.394126738794436, |
| "grad_norm": 0.4795575737953186, |
| "learning_rate": 2.5937186714941474e-05, |
| "loss": 2.4204, |
| "num_input_tokens_seen": 2617536, |
| "step": 3490 |
| }, |
| { |
| "epoch": 5.401854714064915, |
| "grad_norm": 0.4449489712715149, |
| "learning_rate": 2.586979148351704e-05, |
| "loss": 2.2625, |
| "num_input_tokens_seen": 2621440, |
| "step": 3495 |
| }, |
| { |
| "epoch": 5.409582689335394, |
| "grad_norm": 0.5936563611030579, |
| "learning_rate": 2.5802389922714195e-05, |
| "loss": 2.3463, |
| "num_input_tokens_seen": 2625280, |
| "step": 3500 |
| }, |
| { |
| "epoch": 5.417310664605873, |
| "grad_norm": 0.44013652205467224, |
| "learning_rate": 2.5734982523006786e-05, |
| "loss": 2.5644, |
| "num_input_tokens_seen": 2628736, |
| "step": 3505 |
| }, |
| { |
| "epoch": 5.425038639876353, |
| "grad_norm": 0.5395840406417847, |
| "learning_rate": 2.5667569774911175e-05, |
| "loss": 2.343, |
| "num_input_tokens_seen": 2632320, |
| "step": 3510 |
| }, |
| { |
| "epoch": 5.432766615146831, |
| "grad_norm": 0.3468901813030243, |
| "learning_rate": 2.560015216898262e-05, |
| "loss": 2.1964, |
| "num_input_tokens_seen": 2635968, |
| "step": 3515 |
| }, |
| { |
| "epoch": 5.440494590417311, |
| "grad_norm": 0.4727214276790619, |
| "learning_rate": 2.553273019581174e-05, |
| "loss": 2.2021, |
| "num_input_tokens_seen": 2639872, |
| "step": 3520 |
| }, |
| { |
| "epoch": 5.448222565687789, |
| "grad_norm": 0.5066657066345215, |
| "learning_rate": 2.5465304346020924e-05, |
| "loss": 2.111, |
| "num_input_tokens_seen": 2644224, |
| "step": 3525 |
| }, |
| { |
| "epoch": 5.455950540958269, |
| "grad_norm": 0.4220898449420929, |
| "learning_rate": 2.5397875110260784e-05, |
| "loss": 2.5323, |
| "num_input_tokens_seen": 2648128, |
| "step": 3530 |
| }, |
| { |
| "epoch": 5.4636785162287484, |
| "grad_norm": 0.3824640214443207, |
| "learning_rate": 2.5330442979206566e-05, |
| "loss": 2.428, |
| "num_input_tokens_seen": 2651712, |
| "step": 3535 |
| }, |
| { |
| "epoch": 5.471406491499227, |
| "grad_norm": 0.4880902171134949, |
| "learning_rate": 2.526300844355457e-05, |
| "loss": 2.4646, |
| "num_input_tokens_seen": 2655424, |
| "step": 3540 |
| }, |
| { |
| "epoch": 5.479134466769707, |
| "grad_norm": 0.4218856394290924, |
| "learning_rate": 2.519557199401863e-05, |
| "loss": 2.4592, |
| "num_input_tokens_seen": 2659200, |
| "step": 3545 |
| }, |
| { |
| "epoch": 5.486862442040185, |
| "grad_norm": 0.39387452602386475, |
| "learning_rate": 2.512813412132647e-05, |
| "loss": 2.4111, |
| "num_input_tokens_seen": 2662912, |
| "step": 3550 |
| }, |
| { |
| "epoch": 5.494590417310665, |
| "grad_norm": 0.5984361171722412, |
| "learning_rate": 2.5060695316216188e-05, |
| "loss": 2.3876, |
| "num_input_tokens_seen": 2666944, |
| "step": 3555 |
| }, |
| { |
| "epoch": 5.502318392581143, |
| "grad_norm": 0.46139827370643616, |
| "learning_rate": 2.4993256069432666e-05, |
| "loss": 2.4696, |
| "num_input_tokens_seen": 2670592, |
| "step": 3560 |
| }, |
| { |
| "epoch": 5.508500772797527, |
| "eval_loss": 2.352015256881714, |
| "eval_runtime": 9.8446, |
| "eval_samples_per_second": 58.408, |
| "eval_steps_per_second": 7.314, |
| "num_input_tokens_seen": 2673664, |
| "step": 3564 |
| }, |
| { |
| "epoch": 5.510046367851623, |
| "grad_norm": 0.42209917306900024, |
| "learning_rate": 2.4925816871723997e-05, |
| "loss": 2.4421, |
| "num_input_tokens_seen": 2674304, |
| "step": 3565 |
| }, |
| { |
| "epoch": 5.5177743431221025, |
| "grad_norm": 0.4543595612049103, |
| "learning_rate": 2.4858378213837908e-05, |
| "loss": 2.3185, |
| "num_input_tokens_seen": 2678528, |
| "step": 3570 |
| }, |
| { |
| "epoch": 5.525502318392581, |
| "grad_norm": 0.34188148379325867, |
| "learning_rate": 2.479094058651823e-05, |
| "loss": 2.2586, |
| "num_input_tokens_seen": 2682496, |
| "step": 3575 |
| }, |
| { |
| "epoch": 5.533230293663061, |
| "grad_norm": 0.36892926692962646, |
| "learning_rate": 2.4723504480501248e-05, |
| "loss": 2.5486, |
| "num_input_tokens_seen": 2686592, |
| "step": 3580 |
| }, |
| { |
| "epoch": 5.540958268933539, |
| "grad_norm": 0.8839777708053589, |
| "learning_rate": 2.4656070386512224e-05, |
| "loss": 2.1179, |
| "num_input_tokens_seen": 2690304, |
| "step": 3585 |
| }, |
| { |
| "epoch": 5.548686244204019, |
| "grad_norm": 0.39712879061698914, |
| "learning_rate": 2.4588638795261732e-05, |
| "loss": 2.3346, |
| "num_input_tokens_seen": 2694016, |
| "step": 3590 |
| }, |
| { |
| "epoch": 5.556414219474497, |
| "grad_norm": 0.5067217946052551, |
| "learning_rate": 2.4521210197442176e-05, |
| "loss": 1.9961, |
| "num_input_tokens_seen": 2697728, |
| "step": 3595 |
| }, |
| { |
| "epoch": 5.564142194744977, |
| "grad_norm": 0.6092778444290161, |
| "learning_rate": 2.4453785083724147e-05, |
| "loss": 2.0222, |
| "num_input_tokens_seen": 2700992, |
| "step": 3600 |
| }, |
| { |
| "epoch": 5.571870170015456, |
| "grad_norm": 0.5331489443778992, |
| "learning_rate": 2.438636394475291e-05, |
| "loss": 2.2948, |
| "num_input_tokens_seen": 2704704, |
| "step": 3605 |
| }, |
| { |
| "epoch": 5.579598145285935, |
| "grad_norm": 0.4057201147079468, |
| "learning_rate": 2.4318947271144768e-05, |
| "loss": 2.1596, |
| "num_input_tokens_seen": 2707904, |
| "step": 3610 |
| }, |
| { |
| "epoch": 5.587326120556414, |
| "grad_norm": 0.5431353449821472, |
| "learning_rate": 2.4251535553483575e-05, |
| "loss": 2.1688, |
| "num_input_tokens_seen": 2710784, |
| "step": 3615 |
| }, |
| { |
| "epoch": 5.595054095826893, |
| "grad_norm": 0.5729431509971619, |
| "learning_rate": 2.418412928231708e-05, |
| "loss": 2.3437, |
| "num_input_tokens_seen": 2714560, |
| "step": 3620 |
| }, |
| { |
| "epoch": 5.602782071097373, |
| "grad_norm": 0.39663970470428467, |
| "learning_rate": 2.4116728948153427e-05, |
| "loss": 2.4228, |
| "num_input_tokens_seen": 2718656, |
| "step": 3625 |
| }, |
| { |
| "epoch": 5.6105100463678514, |
| "grad_norm": 0.6071697473526001, |
| "learning_rate": 2.404933504145755e-05, |
| "loss": 2.1259, |
| "num_input_tokens_seen": 2721856, |
| "step": 3630 |
| }, |
| { |
| "epoch": 5.618238021638331, |
| "grad_norm": 0.4467135965824127, |
| "learning_rate": 2.39819480526476e-05, |
| "loss": 2.1758, |
| "num_input_tokens_seen": 2725440, |
| "step": 3635 |
| }, |
| { |
| "epoch": 5.62596599690881, |
| "grad_norm": 0.40764495730400085, |
| "learning_rate": 2.3914568472091393e-05, |
| "loss": 1.9979, |
| "num_input_tokens_seen": 2729088, |
| "step": 3640 |
| }, |
| { |
| "epoch": 5.633693972179289, |
| "grad_norm": 0.457857221364975, |
| "learning_rate": 2.3847196790102853e-05, |
| "loss": 2.2177, |
| "num_input_tokens_seen": 2732736, |
| "step": 3645 |
| }, |
| { |
| "epoch": 5.641421947449768, |
| "grad_norm": 0.42833542823791504, |
| "learning_rate": 2.37798334969384e-05, |
| "loss": 2.2948, |
| "num_input_tokens_seen": 2737024, |
| "step": 3650 |
| }, |
| { |
| "epoch": 5.649149922720247, |
| "grad_norm": 0.47789230942726135, |
| "learning_rate": 2.371247908279343e-05, |
| "loss": 2.4004, |
| "num_input_tokens_seen": 2740928, |
| "step": 3655 |
| }, |
| { |
| "epoch": 5.656877897990727, |
| "grad_norm": 0.5181143283843994, |
| "learning_rate": 2.3645134037798704e-05, |
| "loss": 2.3255, |
| "num_input_tokens_seen": 2744960, |
| "step": 3660 |
| }, |
| { |
| "epoch": 5.6646058732612055, |
| "grad_norm": 0.4948902428150177, |
| "learning_rate": 2.357779885201684e-05, |
| "loss": 2.0075, |
| "num_input_tokens_seen": 2748096, |
| "step": 3665 |
| }, |
| { |
| "epoch": 5.672333848531685, |
| "grad_norm": 0.4389133155345917, |
| "learning_rate": 2.3510474015438673e-05, |
| "loss": 1.9645, |
| "num_input_tokens_seen": 2751616, |
| "step": 3670 |
| }, |
| { |
| "epoch": 5.680061823802164, |
| "grad_norm": 0.4908589720726013, |
| "learning_rate": 2.344316001797977e-05, |
| "loss": 1.9947, |
| "num_input_tokens_seen": 2755328, |
| "step": 3675 |
| }, |
| { |
| "epoch": 5.687789799072643, |
| "grad_norm": 0.4054723083972931, |
| "learning_rate": 2.3375857349476768e-05, |
| "loss": 2.1622, |
| "num_input_tokens_seen": 2758656, |
| "step": 3680 |
| }, |
| { |
| "epoch": 5.695517774343122, |
| "grad_norm": 0.39807790517807007, |
| "learning_rate": 2.3308566499683922e-05, |
| "loss": 2.3103, |
| "num_input_tokens_seen": 2762432, |
| "step": 3685 |
| }, |
| { |
| "epoch": 5.703245749613601, |
| "grad_norm": 0.4937276244163513, |
| "learning_rate": 2.3241287958269442e-05, |
| "loss": 2.0892, |
| "num_input_tokens_seen": 2766656, |
| "step": 3690 |
| }, |
| { |
| "epoch": 5.710973724884081, |
| "grad_norm": 0.432624876499176, |
| "learning_rate": 2.3174022214811993e-05, |
| "loss": 2.2162, |
| "num_input_tokens_seen": 2770496, |
| "step": 3695 |
| }, |
| { |
| "epoch": 5.7187017001545595, |
| "grad_norm": 0.42396533489227295, |
| "learning_rate": 2.31067697587971e-05, |
| "loss": 2.1959, |
| "num_input_tokens_seen": 2774144, |
| "step": 3700 |
| }, |
| { |
| "epoch": 5.726429675425039, |
| "grad_norm": 0.5416736006736755, |
| "learning_rate": 2.3039531079613613e-05, |
| "loss": 1.7434, |
| "num_input_tokens_seen": 2777344, |
| "step": 3705 |
| }, |
| { |
| "epoch": 5.734157650695518, |
| "grad_norm": 0.6681102514266968, |
| "learning_rate": 2.2972306666550098e-05, |
| "loss": 2.2059, |
| "num_input_tokens_seen": 2780928, |
| "step": 3710 |
| }, |
| { |
| "epoch": 5.741885625965997, |
| "grad_norm": 0.5061545372009277, |
| "learning_rate": 2.290509700879135e-05, |
| "loss": 2.3656, |
| "num_input_tokens_seen": 2784448, |
| "step": 3715 |
| }, |
| { |
| "epoch": 5.749613601236476, |
| "grad_norm": 0.41529789566993713, |
| "learning_rate": 2.283790259541474e-05, |
| "loss": 2.1918, |
| "num_input_tokens_seen": 2787840, |
| "step": 3720 |
| }, |
| { |
| "epoch": 5.757341576506955, |
| "grad_norm": 0.4075319468975067, |
| "learning_rate": 2.277072391538676e-05, |
| "loss": 2.1543, |
| "num_input_tokens_seen": 2791680, |
| "step": 3725 |
| }, |
| { |
| "epoch": 5.765069551777434, |
| "grad_norm": 0.37512338161468506, |
| "learning_rate": 2.2703561457559376e-05, |
| "loss": 1.9263, |
| "num_input_tokens_seen": 2794944, |
| "step": 3730 |
| }, |
| { |
| "epoch": 5.7727975270479135, |
| "grad_norm": 0.7959144711494446, |
| "learning_rate": 2.263641571066653e-05, |
| "loss": 2.3819, |
| "num_input_tokens_seen": 2799296, |
| "step": 3735 |
| }, |
| { |
| "epoch": 5.780525502318392, |
| "grad_norm": 0.6679904460906982, |
| "learning_rate": 2.2569287163320534e-05, |
| "loss": 2.3377, |
| "num_input_tokens_seen": 2803264, |
| "step": 3740 |
| }, |
| { |
| "epoch": 5.788253477588872, |
| "grad_norm": 0.4480963945388794, |
| "learning_rate": 2.2502176304008575e-05, |
| "loss": 1.9277, |
| "num_input_tokens_seen": 2807616, |
| "step": 3745 |
| }, |
| { |
| "epoch": 5.795981452859351, |
| "grad_norm": 0.4576186239719391, |
| "learning_rate": 2.2435083621089085e-05, |
| "loss": 2.4183, |
| "num_input_tokens_seen": 2811456, |
| "step": 3750 |
| }, |
| { |
| "epoch": 5.80370942812983, |
| "grad_norm": 0.5255048871040344, |
| "learning_rate": 2.2368009602788264e-05, |
| "loss": 2.2337, |
| "num_input_tokens_seen": 2815360, |
| "step": 3755 |
| }, |
| { |
| "epoch": 5.811437403400309, |
| "grad_norm": 0.5405028462409973, |
| "learning_rate": 2.230095473719647e-05, |
| "loss": 2.6278, |
| "num_input_tokens_seen": 2819264, |
| "step": 3760 |
| }, |
| { |
| "epoch": 5.819165378670788, |
| "grad_norm": 0.43693581223487854, |
| "learning_rate": 2.2233919512264713e-05, |
| "loss": 1.9339, |
| "num_input_tokens_seen": 2823296, |
| "step": 3765 |
| }, |
| { |
| "epoch": 5.8268933539412675, |
| "grad_norm": 0.40162092447280884, |
| "learning_rate": 2.216690441580104e-05, |
| "loss": 2.4475, |
| "num_input_tokens_seen": 2827328, |
| "step": 3770 |
| }, |
| { |
| "epoch": 5.834621329211746, |
| "grad_norm": 0.4420531392097473, |
| "learning_rate": 2.2099909935467076e-05, |
| "loss": 2.0878, |
| "num_input_tokens_seen": 2831040, |
| "step": 3775 |
| }, |
| { |
| "epoch": 5.842349304482226, |
| "grad_norm": 0.5679764151573181, |
| "learning_rate": 2.203293655877437e-05, |
| "loss": 2.1213, |
| "num_input_tokens_seen": 2834752, |
| "step": 3780 |
| }, |
| { |
| "epoch": 5.850077279752705, |
| "grad_norm": 0.665807843208313, |
| "learning_rate": 2.196598477308095e-05, |
| "loss": 2.3224, |
| "num_input_tokens_seen": 2839104, |
| "step": 3785 |
| }, |
| { |
| "epoch": 5.857805255023184, |
| "grad_norm": 0.44736775755882263, |
| "learning_rate": 2.1899055065587698e-05, |
| "loss": 2.235, |
| "num_input_tokens_seen": 2842880, |
| "step": 3790 |
| }, |
| { |
| "epoch": 5.865533230293663, |
| "grad_norm": 0.5406590104103088, |
| "learning_rate": 2.1832147923334853e-05, |
| "loss": 2.0263, |
| "num_input_tokens_seen": 2846528, |
| "step": 3795 |
| }, |
| { |
| "epoch": 5.873261205564142, |
| "grad_norm": 0.4958436191082001, |
| "learning_rate": 2.1765263833198435e-05, |
| "loss": 2.2885, |
| "num_input_tokens_seen": 2850688, |
| "step": 3800 |
| }, |
| { |
| "epoch": 5.8809891808346215, |
| "grad_norm": 0.5033271908760071, |
| "learning_rate": 2.1698403281886734e-05, |
| "loss": 2.003, |
| "num_input_tokens_seen": 2854272, |
| "step": 3805 |
| }, |
| { |
| "epoch": 5.8887171561051, |
| "grad_norm": 0.4177800118923187, |
| "learning_rate": 2.163156675593672e-05, |
| "loss": 2.3459, |
| "num_input_tokens_seen": 2858240, |
| "step": 3810 |
| }, |
| { |
| "epoch": 5.89644513137558, |
| "grad_norm": 0.5443251729011536, |
| "learning_rate": 2.1564754741710578e-05, |
| "loss": 2.5596, |
| "num_input_tokens_seen": 2862208, |
| "step": 3815 |
| }, |
| { |
| "epoch": 5.904173106646059, |
| "grad_norm": 0.410028338432312, |
| "learning_rate": 2.149796772539208e-05, |
| "loss": 2.3332, |
| "num_input_tokens_seen": 2865472, |
| "step": 3820 |
| }, |
| { |
| "epoch": 5.911901081916538, |
| "grad_norm": 0.6708984971046448, |
| "learning_rate": 2.1431206192983117e-05, |
| "loss": 2.7066, |
| "num_input_tokens_seen": 2869568, |
| "step": 3825 |
| }, |
| { |
| "epoch": 5.919629057187017, |
| "grad_norm": 0.5683284401893616, |
| "learning_rate": 2.136447063030012e-05, |
| "loss": 2.1768, |
| "num_input_tokens_seen": 2873344, |
| "step": 3830 |
| }, |
| { |
| "epoch": 5.927357032457496, |
| "grad_norm": 0.49835681915283203, |
| "learning_rate": 2.129776152297057e-05, |
| "loss": 2.2855, |
| "num_input_tokens_seen": 2877376, |
| "step": 3835 |
| }, |
| { |
| "epoch": 5.9350850077279755, |
| "grad_norm": 0.39036598801612854, |
| "learning_rate": 2.1231079356429394e-05, |
| "loss": 2.3365, |
| "num_input_tokens_seen": 2881472, |
| "step": 3840 |
| }, |
| { |
| "epoch": 5.942812982998454, |
| "grad_norm": 0.4243427515029907, |
| "learning_rate": 2.1164424615915514e-05, |
| "loss": 1.8171, |
| "num_input_tokens_seen": 2885504, |
| "step": 3845 |
| }, |
| { |
| "epoch": 5.950540958268934, |
| "grad_norm": 0.42286694049835205, |
| "learning_rate": 2.1097797786468236e-05, |
| "loss": 2.2061, |
| "num_input_tokens_seen": 2889408, |
| "step": 3850 |
| }, |
| { |
| "epoch": 5.958268933539412, |
| "grad_norm": 0.6979780793190002, |
| "learning_rate": 2.10311993529238e-05, |
| "loss": 2.2993, |
| "num_input_tokens_seen": 2893312, |
| "step": 3855 |
| }, |
| { |
| "epoch": 5.965996908809892, |
| "grad_norm": 0.48626625537872314, |
| "learning_rate": 2.0964629799911778e-05, |
| "loss": 2.3468, |
| "num_input_tokens_seen": 2897088, |
| "step": 3860 |
| }, |
| { |
| "epoch": 5.9737248840803705, |
| "grad_norm": 0.5056301951408386, |
| "learning_rate": 2.0898089611851612e-05, |
| "loss": 2.1338, |
| "num_input_tokens_seen": 2901248, |
| "step": 3865 |
| }, |
| { |
| "epoch": 5.98145285935085, |
| "grad_norm": 0.4730468988418579, |
| "learning_rate": 2.0831579272949027e-05, |
| "loss": 2.1404, |
| "num_input_tokens_seen": 2905280, |
| "step": 3870 |
| }, |
| { |
| "epoch": 5.9891808346213296, |
| "grad_norm": 0.40042728185653687, |
| "learning_rate": 2.0765099267192575e-05, |
| "loss": 2.4586, |
| "num_input_tokens_seen": 2908736, |
| "step": 3875 |
| }, |
| { |
| "epoch": 5.996908809891808, |
| "grad_norm": 0.5426847338676453, |
| "learning_rate": 2.069865007835003e-05, |
| "loss": 2.4524, |
| "num_input_tokens_seen": 2912512, |
| "step": 3880 |
| }, |
| { |
| "epoch": 6.004636785162288, |
| "grad_norm": 0.4294570982456207, |
| "learning_rate": 2.0632232189964966e-05, |
| "loss": 2.2091, |
| "num_input_tokens_seen": 2915504, |
| "step": 3885 |
| }, |
| { |
| "epoch": 6.0092735703245745, |
| "eval_loss": 2.2262656688690186, |
| "eval_runtime": 9.8518, |
| "eval_samples_per_second": 58.365, |
| "eval_steps_per_second": 7.308, |
| "num_input_tokens_seen": 2917488, |
| "step": 3888 |
| }, |
| { |
| "epoch": 6.012364760432766, |
| "grad_norm": 0.4641660451889038, |
| "learning_rate": 2.0565846085353147e-05, |
| "loss": 2.0986, |
| "num_input_tokens_seen": 2918832, |
| "step": 3890 |
| }, |
| { |
| "epoch": 6.020092735703246, |
| "grad_norm": 0.39218223094940186, |
| "learning_rate": 2.0499492247599085e-05, |
| "loss": 2.167, |
| "num_input_tokens_seen": 2922544, |
| "step": 3895 |
| }, |
| { |
| "epoch": 6.0278207109737245, |
| "grad_norm": 0.5193983912467957, |
| "learning_rate": 2.0433171159552442e-05, |
| "loss": 2.2209, |
| "num_input_tokens_seen": 2926256, |
| "step": 3900 |
| }, |
| { |
| "epoch": 6.035548686244204, |
| "grad_norm": 0.5520421266555786, |
| "learning_rate": 2.036688330382462e-05, |
| "loss": 2.4272, |
| "num_input_tokens_seen": 2930096, |
| "step": 3905 |
| }, |
| { |
| "epoch": 6.043276661514684, |
| "grad_norm": 0.5342735648155212, |
| "learning_rate": 2.030062916278514e-05, |
| "loss": 2.1048, |
| "num_input_tokens_seen": 2933872, |
| "step": 3910 |
| }, |
| { |
| "epoch": 6.051004636785162, |
| "grad_norm": 0.6556943655014038, |
| "learning_rate": 2.0234409218558226e-05, |
| "loss": 1.9015, |
| "num_input_tokens_seen": 2937328, |
| "step": 3915 |
| }, |
| { |
| "epoch": 6.058732612055642, |
| "grad_norm": 0.4831690192222595, |
| "learning_rate": 2.0168223953019233e-05, |
| "loss": 2.0656, |
| "num_input_tokens_seen": 2940848, |
| "step": 3920 |
| }, |
| { |
| "epoch": 6.06646058732612, |
| "grad_norm": 0.561551034450531, |
| "learning_rate": 2.0102073847791182e-05, |
| "loss": 2.1184, |
| "num_input_tokens_seen": 2944432, |
| "step": 3925 |
| }, |
| { |
| "epoch": 6.0741885625966, |
| "grad_norm": 0.4626164138317108, |
| "learning_rate": 2.0035959384241203e-05, |
| "loss": 2.0853, |
| "num_input_tokens_seen": 2948080, |
| "step": 3930 |
| }, |
| { |
| "epoch": 6.0819165378670785, |
| "grad_norm": 0.5871320962905884, |
| "learning_rate": 1.9969881043477105e-05, |
| "loss": 2.0832, |
| "num_input_tokens_seen": 2951920, |
| "step": 3935 |
| }, |
| { |
| "epoch": 6.089644513137558, |
| "grad_norm": 0.4352818727493286, |
| "learning_rate": 1.9903839306343798e-05, |
| "loss": 2.2511, |
| "num_input_tokens_seen": 2955184, |
| "step": 3940 |
| }, |
| { |
| "epoch": 6.097372488408037, |
| "grad_norm": 0.44709163904190063, |
| "learning_rate": 1.9837834653419862e-05, |
| "loss": 2.2942, |
| "num_input_tokens_seen": 2959088, |
| "step": 3945 |
| }, |
| { |
| "epoch": 6.105100463678516, |
| "grad_norm": 0.518781304359436, |
| "learning_rate": 1.9771867565014008e-05, |
| "loss": 2.0329, |
| "num_input_tokens_seen": 2962800, |
| "step": 3950 |
| }, |
| { |
| "epoch": 6.112828438948996, |
| "grad_norm": 0.6104958057403564, |
| "learning_rate": 1.970593852116159e-05, |
| "loss": 1.9864, |
| "num_input_tokens_seen": 2966384, |
| "step": 3955 |
| }, |
| { |
| "epoch": 6.120556414219474, |
| "grad_norm": 0.39966410398483276, |
| "learning_rate": 1.964004800162111e-05, |
| "loss": 2.1951, |
| "num_input_tokens_seen": 2969712, |
| "step": 3960 |
| }, |
| { |
| "epoch": 6.128284389489954, |
| "grad_norm": 0.44955769181251526, |
| "learning_rate": 1.957419648587076e-05, |
| "loss": 2.1336, |
| "num_input_tokens_seen": 2973616, |
| "step": 3965 |
| }, |
| { |
| "epoch": 6.1360123647604325, |
| "grad_norm": 0.485278844833374, |
| "learning_rate": 1.9508384453104867e-05, |
| "loss": 2.1819, |
| "num_input_tokens_seen": 2977328, |
| "step": 3970 |
| }, |
| { |
| "epoch": 6.143740340030912, |
| "grad_norm": 0.5128074884414673, |
| "learning_rate": 1.9442612382230484e-05, |
| "loss": 2.1988, |
| "num_input_tokens_seen": 2981104, |
| "step": 3975 |
| }, |
| { |
| "epoch": 6.151468315301391, |
| "grad_norm": 0.489914208650589, |
| "learning_rate": 1.9376880751863828e-05, |
| "loss": 1.9642, |
| "num_input_tokens_seen": 2985008, |
| "step": 3980 |
| }, |
| { |
| "epoch": 6.15919629057187, |
| "grad_norm": 0.5936963558197021, |
| "learning_rate": 1.931119004032687e-05, |
| "loss": 2.2304, |
| "num_input_tokens_seen": 2989104, |
| "step": 3985 |
| }, |
| { |
| "epoch": 6.166924265842349, |
| "grad_norm": 0.48407799005508423, |
| "learning_rate": 1.9245540725643788e-05, |
| "loss": 2.3725, |
| "num_input_tokens_seen": 2992624, |
| "step": 3990 |
| }, |
| { |
| "epoch": 6.174652241112828, |
| "grad_norm": 0.46309277415275574, |
| "learning_rate": 1.9179933285537554e-05, |
| "loss": 2.2003, |
| "num_input_tokens_seen": 2996208, |
| "step": 3995 |
| }, |
| { |
| "epoch": 6.182380216383308, |
| "grad_norm": 0.5524800419807434, |
| "learning_rate": 1.911436819742638e-05, |
| "loss": 2.0188, |
| "num_input_tokens_seen": 3000816, |
| "step": 4000 |
| }, |
| { |
| "epoch": 6.190108191653787, |
| "grad_norm": 0.4268532395362854, |
| "learning_rate": 1.9048845938420327e-05, |
| "loss": 2.0816, |
| "num_input_tokens_seen": 3004528, |
| "step": 4005 |
| }, |
| { |
| "epoch": 6.197836166924266, |
| "grad_norm": 0.6886558532714844, |
| "learning_rate": 1.8983366985317763e-05, |
| "loss": 2.2171, |
| "num_input_tokens_seen": 3008112, |
| "step": 4010 |
| }, |
| { |
| "epoch": 6.205564142194745, |
| "grad_norm": 0.4119880497455597, |
| "learning_rate": 1.8917931814601952e-05, |
| "loss": 1.9501, |
| "num_input_tokens_seen": 3012208, |
| "step": 4015 |
| }, |
| { |
| "epoch": 6.213292117465224, |
| "grad_norm": 0.4245622456073761, |
| "learning_rate": 1.885254090243753e-05, |
| "loss": 2.1936, |
| "num_input_tokens_seen": 3015792, |
| "step": 4020 |
| }, |
| { |
| "epoch": 6.221020092735703, |
| "grad_norm": 0.46879667043685913, |
| "learning_rate": 1.8787194724667094e-05, |
| "loss": 1.9108, |
| "num_input_tokens_seen": 3019184, |
| "step": 4025 |
| }, |
| { |
| "epoch": 6.228748068006182, |
| "grad_norm": 0.4324242174625397, |
| "learning_rate": 1.8721893756807694e-05, |
| "loss": 2.1635, |
| "num_input_tokens_seen": 3022832, |
| "step": 4030 |
| }, |
| { |
| "epoch": 6.236476043276662, |
| "grad_norm": 0.3699527978897095, |
| "learning_rate": 1.8656638474047404e-05, |
| "loss": 1.8437, |
| "num_input_tokens_seen": 3026544, |
| "step": 4035 |
| }, |
| { |
| "epoch": 6.244204018547141, |
| "grad_norm": 0.4022957980632782, |
| "learning_rate": 1.859142935124184e-05, |
| "loss": 2.342, |
| "num_input_tokens_seen": 3030256, |
| "step": 4040 |
| }, |
| { |
| "epoch": 6.25193199381762, |
| "grad_norm": 0.3864610195159912, |
| "learning_rate": 1.8526266862910742e-05, |
| "loss": 2.2219, |
| "num_input_tokens_seen": 3034224, |
| "step": 4045 |
| }, |
| { |
| "epoch": 6.259659969088099, |
| "grad_norm": 0.49996402859687805, |
| "learning_rate": 1.8461151483234456e-05, |
| "loss": 2.3413, |
| "num_input_tokens_seen": 3037808, |
| "step": 4050 |
| }, |
| { |
| "epoch": 6.267387944358578, |
| "grad_norm": 0.5802761316299438, |
| "learning_rate": 1.8396083686050573e-05, |
| "loss": 2.06, |
| "num_input_tokens_seen": 3041520, |
| "step": 4055 |
| }, |
| { |
| "epoch": 6.275115919629057, |
| "grad_norm": 0.7262587547302246, |
| "learning_rate": 1.833106394485038e-05, |
| "loss": 2.1752, |
| "num_input_tokens_seen": 3045296, |
| "step": 4060 |
| }, |
| { |
| "epoch": 6.282843894899536, |
| "grad_norm": 0.7141749858856201, |
| "learning_rate": 1.8266092732775514e-05, |
| "loss": 2.0812, |
| "num_input_tokens_seen": 3049008, |
| "step": 4065 |
| }, |
| { |
| "epoch": 6.290571870170015, |
| "grad_norm": 0.3760020434856415, |
| "learning_rate": 1.8201170522614428e-05, |
| "loss": 1.9564, |
| "num_input_tokens_seen": 3052656, |
| "step": 4070 |
| }, |
| { |
| "epoch": 6.298299845440495, |
| "grad_norm": 0.5244548320770264, |
| "learning_rate": 1.8136297786799025e-05, |
| "loss": 1.9544, |
| "num_input_tokens_seen": 3056432, |
| "step": 4075 |
| }, |
| { |
| "epoch": 6.306027820710974, |
| "grad_norm": 0.3853306770324707, |
| "learning_rate": 1.807147499740117e-05, |
| "loss": 1.7615, |
| "num_input_tokens_seen": 3060464, |
| "step": 4080 |
| }, |
| { |
| "epoch": 6.313755795981453, |
| "grad_norm": 0.490071177482605, |
| "learning_rate": 1.8006702626129293e-05, |
| "loss": 2.2103, |
| "num_input_tokens_seen": 3064688, |
| "step": 4085 |
| }, |
| { |
| "epoch": 6.321483771251932, |
| "grad_norm": 0.4956612288951874, |
| "learning_rate": 1.7941981144324904e-05, |
| "loss": 2.0875, |
| "num_input_tokens_seen": 3068144, |
| "step": 4090 |
| }, |
| { |
| "epoch": 6.329211746522411, |
| "grad_norm": 0.4515208899974823, |
| "learning_rate": 1.787731102295924e-05, |
| "loss": 2.2417, |
| "num_input_tokens_seen": 3072048, |
| "step": 4095 |
| }, |
| { |
| "epoch": 6.3369397217928904, |
| "grad_norm": 0.4446680545806885, |
| "learning_rate": 1.7812692732629744e-05, |
| "loss": 1.949, |
| "num_input_tokens_seen": 3075824, |
| "step": 4100 |
| }, |
| { |
| "epoch": 6.344667697063369, |
| "grad_norm": 0.4429253041744232, |
| "learning_rate": 1.7748126743556727e-05, |
| "loss": 2.2903, |
| "num_input_tokens_seen": 3079280, |
| "step": 4105 |
| }, |
| { |
| "epoch": 6.352395672333849, |
| "grad_norm": 0.4691123068332672, |
| "learning_rate": 1.76836135255799e-05, |
| "loss": 2.2928, |
| "num_input_tokens_seen": 3083376, |
| "step": 4110 |
| }, |
| { |
| "epoch": 6.360123647604327, |
| "grad_norm": 0.49845725297927856, |
| "learning_rate": 1.7619153548154967e-05, |
| "loss": 2.0365, |
| "num_input_tokens_seen": 3086832, |
| "step": 4115 |
| }, |
| { |
| "epoch": 6.367851622874807, |
| "grad_norm": 0.4709586501121521, |
| "learning_rate": 1.7554747280350184e-05, |
| "loss": 1.8176, |
| "num_input_tokens_seen": 3090224, |
| "step": 4120 |
| }, |
| { |
| "epoch": 6.375579598145286, |
| "grad_norm": 0.4777339696884155, |
| "learning_rate": 1.7490395190843005e-05, |
| "loss": 2.4652, |
| "num_input_tokens_seen": 3094064, |
| "step": 4125 |
| }, |
| { |
| "epoch": 6.383307573415765, |
| "grad_norm": 0.4250446856021881, |
| "learning_rate": 1.7426097747916602e-05, |
| "loss": 1.8572, |
| "num_input_tokens_seen": 3098160, |
| "step": 4130 |
| }, |
| { |
| "epoch": 6.3910355486862445, |
| "grad_norm": 0.4578975737094879, |
| "learning_rate": 1.7361855419456507e-05, |
| "loss": 2.122, |
| "num_input_tokens_seen": 3101104, |
| "step": 4135 |
| }, |
| { |
| "epoch": 6.398763523956723, |
| "grad_norm": 0.5164878368377686, |
| "learning_rate": 1.729766867294719e-05, |
| "loss": 2.3874, |
| "num_input_tokens_seen": 3104752, |
| "step": 4140 |
| }, |
| { |
| "epoch": 6.406491499227203, |
| "grad_norm": 0.49974367022514343, |
| "learning_rate": 1.7233537975468646e-05, |
| "loss": 1.9827, |
| "num_input_tokens_seen": 3108592, |
| "step": 4145 |
| }, |
| { |
| "epoch": 6.414219474497681, |
| "grad_norm": 0.517787516117096, |
| "learning_rate": 1.7169463793693014e-05, |
| "loss": 2.1285, |
| "num_input_tokens_seen": 3112816, |
| "step": 4150 |
| }, |
| { |
| "epoch": 6.421947449768161, |
| "grad_norm": 0.5714470744132996, |
| "learning_rate": 1.7105446593881186e-05, |
| "loss": 2.2337, |
| "num_input_tokens_seen": 3116656, |
| "step": 4155 |
| }, |
| { |
| "epoch": 6.42967542503864, |
| "grad_norm": 0.44557783007621765, |
| "learning_rate": 1.704148684187937e-05, |
| "loss": 2.2485, |
| "num_input_tokens_seen": 3120496, |
| "step": 4160 |
| }, |
| { |
| "epoch": 6.437403400309119, |
| "grad_norm": 0.38803085684776306, |
| "learning_rate": 1.6977585003115777e-05, |
| "loss": 2.363, |
| "num_input_tokens_seen": 3124272, |
| "step": 4165 |
| }, |
| { |
| "epoch": 6.4451313755795985, |
| "grad_norm": 0.5083007216453552, |
| "learning_rate": 1.6913741542597145e-05, |
| "loss": 2.4348, |
| "num_input_tokens_seen": 3128048, |
| "step": 4170 |
| }, |
| { |
| "epoch": 6.452859350850077, |
| "grad_norm": 0.4518604576587677, |
| "learning_rate": 1.6849956924905435e-05, |
| "loss": 1.8019, |
| "num_input_tokens_seen": 3131376, |
| "step": 4175 |
| }, |
| { |
| "epoch": 6.460587326120557, |
| "grad_norm": 0.4489939510822296, |
| "learning_rate": 1.678623161419439e-05, |
| "loss": 2.3105, |
| "num_input_tokens_seen": 3135216, |
| "step": 4180 |
| }, |
| { |
| "epoch": 6.468315301391035, |
| "grad_norm": 0.4724332094192505, |
| "learning_rate": 1.6722566074186214e-05, |
| "loss": 2.2169, |
| "num_input_tokens_seen": 3138928, |
| "step": 4185 |
| }, |
| { |
| "epoch": 6.476043276661515, |
| "grad_norm": 0.4101565480232239, |
| "learning_rate": 1.665896076816812e-05, |
| "loss": 1.9876, |
| "num_input_tokens_seen": 3142768, |
| "step": 4190 |
| }, |
| { |
| "epoch": 6.483771251931993, |
| "grad_norm": 0.49106982350349426, |
| "learning_rate": 1.659541615898905e-05, |
| "loss": 1.9611, |
| "num_input_tokens_seen": 3146416, |
| "step": 4195 |
| }, |
| { |
| "epoch": 6.491499227202473, |
| "grad_norm": 0.49804726243019104, |
| "learning_rate": 1.6531932709056228e-05, |
| "loss": 1.9822, |
| "num_input_tokens_seen": 3149552, |
| "step": 4200 |
| }, |
| { |
| "epoch": 6.4992272024729525, |
| "grad_norm": 0.38408616185188293, |
| "learning_rate": 1.646851088033185e-05, |
| "loss": 1.9707, |
| "num_input_tokens_seen": 3153648, |
| "step": 4205 |
| }, |
| { |
| "epoch": 6.506955177743431, |
| "grad_norm": 0.387453556060791, |
| "learning_rate": 1.6405151134329687e-05, |
| "loss": 1.8969, |
| "num_input_tokens_seen": 3157744, |
| "step": 4210 |
| }, |
| { |
| "epoch": 6.510046367851623, |
| "eval_loss": 2.1331775188446045, |
| "eval_runtime": 9.8453, |
| "eval_samples_per_second": 58.403, |
| "eval_steps_per_second": 7.313, |
| "num_input_tokens_seen": 3159216, |
| "step": 4212 |
| }, |
| { |
| "epoch": 6.514683153013911, |
| "grad_norm": 0.4173484146595001, |
| "learning_rate": 1.6341853932111767e-05, |
| "loss": 2.1683, |
| "num_input_tokens_seen": 3161072, |
| "step": 4215 |
| }, |
| { |
| "epoch": 6.522411128284389, |
| "grad_norm": 0.5903739333152771, |
| "learning_rate": 1.627861973428496e-05, |
| "loss": 2.025, |
| "num_input_tokens_seen": 3164400, |
| "step": 4220 |
| }, |
| { |
| "epoch": 6.530139103554869, |
| "grad_norm": 0.5123242139816284, |
| "learning_rate": 1.6215449000997667e-05, |
| "loss": 2.1599, |
| "num_input_tokens_seen": 3167984, |
| "step": 4225 |
| }, |
| { |
| "epoch": 6.5378670788253475, |
| "grad_norm": 0.4895840883255005, |
| "learning_rate": 1.6152342191936483e-05, |
| "loss": 2.3419, |
| "num_input_tokens_seen": 3171824, |
| "step": 4230 |
| }, |
| { |
| "epoch": 6.545595054095827, |
| "grad_norm": 0.5733851790428162, |
| "learning_rate": 1.6089299766322812e-05, |
| "loss": 2.0644, |
| "num_input_tokens_seen": 3175728, |
| "step": 4235 |
| }, |
| { |
| "epoch": 6.553323029366306, |
| "grad_norm": 0.4447968900203705, |
| "learning_rate": 1.6026322182909575e-05, |
| "loss": 1.6845, |
| "num_input_tokens_seen": 3179248, |
| "step": 4240 |
| }, |
| { |
| "epoch": 6.561051004636785, |
| "grad_norm": 0.5073620080947876, |
| "learning_rate": 1.5963409899977804e-05, |
| "loss": 2.6895, |
| "num_input_tokens_seen": 3182768, |
| "step": 4245 |
| }, |
| { |
| "epoch": 6.568778979907265, |
| "grad_norm": 0.47167372703552246, |
| "learning_rate": 1.5900563375333388e-05, |
| "loss": 2.2403, |
| "num_input_tokens_seen": 3186544, |
| "step": 4250 |
| }, |
| { |
| "epoch": 6.576506955177743, |
| "grad_norm": 0.47486698627471924, |
| "learning_rate": 1.583778306630366e-05, |
| "loss": 1.6739, |
| "num_input_tokens_seen": 3190320, |
| "step": 4255 |
| }, |
| { |
| "epoch": 6.584234930448223, |
| "grad_norm": 0.6215065717697144, |
| "learning_rate": 1.5775069429734135e-05, |
| "loss": 2.2426, |
| "num_input_tokens_seen": 3194096, |
| "step": 4260 |
| }, |
| { |
| "epoch": 6.5919629057187015, |
| "grad_norm": 0.42234382033348083, |
| "learning_rate": 1.5712422921985157e-05, |
| "loss": 2.4955, |
| "num_input_tokens_seen": 3197872, |
| "step": 4265 |
| }, |
| { |
| "epoch": 6.599690880989181, |
| "grad_norm": 0.4679786264896393, |
| "learning_rate": 1.5649843998928585e-05, |
| "loss": 2.175, |
| "num_input_tokens_seen": 3201456, |
| "step": 4270 |
| }, |
| { |
| "epoch": 6.60741885625966, |
| "grad_norm": 0.43814560770988464, |
| "learning_rate": 1.558733311594444e-05, |
| "loss": 2.047, |
| "num_input_tokens_seen": 3205360, |
| "step": 4275 |
| }, |
| { |
| "epoch": 6.615146831530139, |
| "grad_norm": 0.45244526863098145, |
| "learning_rate": 1.5524890727917676e-05, |
| "loss": 2.0997, |
| "num_input_tokens_seen": 3208752, |
| "step": 4280 |
| }, |
| { |
| "epoch": 6.622874806800619, |
| "grad_norm": 0.45812249183654785, |
| "learning_rate": 1.546251728923476e-05, |
| "loss": 1.977, |
| "num_input_tokens_seen": 3212528, |
| "step": 4285 |
| }, |
| { |
| "epoch": 6.630602782071097, |
| "grad_norm": 0.4441308081150055, |
| "learning_rate": 1.5400213253780467e-05, |
| "loss": 1.9004, |
| "num_input_tokens_seen": 3216176, |
| "step": 4290 |
| }, |
| { |
| "epoch": 6.638330757341577, |
| "grad_norm": 0.5535650253295898, |
| "learning_rate": 1.5337979074934505e-05, |
| "loss": 2.1144, |
| "num_input_tokens_seen": 3220144, |
| "step": 4295 |
| }, |
| { |
| "epoch": 6.6460587326120555, |
| "grad_norm": 0.46812766790390015, |
| "learning_rate": 1.5275815205568264e-05, |
| "loss": 1.9624, |
| "num_input_tokens_seen": 3223920, |
| "step": 4300 |
| }, |
| { |
| "epoch": 6.653786707882535, |
| "grad_norm": 0.38829505443573, |
| "learning_rate": 1.5213722098041472e-05, |
| "loss": 2.0715, |
| "num_input_tokens_seen": 3227312, |
| "step": 4305 |
| }, |
| { |
| "epoch": 6.661514683153014, |
| "grad_norm": 0.5156342387199402, |
| "learning_rate": 1.5151700204198965e-05, |
| "loss": 1.5928, |
| "num_input_tokens_seen": 3231216, |
| "step": 4310 |
| }, |
| { |
| "epoch": 6.669242658423493, |
| "grad_norm": 0.417357474565506, |
| "learning_rate": 1.5089749975367324e-05, |
| "loss": 2.2893, |
| "num_input_tokens_seen": 3234608, |
| "step": 4315 |
| }, |
| { |
| "epoch": 6.676970633693972, |
| "grad_norm": 0.4816182255744934, |
| "learning_rate": 1.5027871862351671e-05, |
| "loss": 1.9179, |
| "num_input_tokens_seen": 3239088, |
| "step": 4320 |
| }, |
| { |
| "epoch": 6.684698608964451, |
| "grad_norm": 0.46260756254196167, |
| "learning_rate": 1.4966066315432331e-05, |
| "loss": 2.071, |
| "num_input_tokens_seen": 3242608, |
| "step": 4325 |
| }, |
| { |
| "epoch": 6.69242658423493, |
| "grad_norm": 0.4879600405693054, |
| "learning_rate": 1.4904333784361568e-05, |
| "loss": 1.8896, |
| "num_input_tokens_seen": 3246256, |
| "step": 4330 |
| }, |
| { |
| "epoch": 6.7001545595054095, |
| "grad_norm": 0.5067187547683716, |
| "learning_rate": 1.4842674718360323e-05, |
| "loss": 2.2088, |
| "num_input_tokens_seen": 3249648, |
| "step": 4335 |
| }, |
| { |
| "epoch": 6.707882534775889, |
| "grad_norm": 0.3779104948043823, |
| "learning_rate": 1.4781089566114953e-05, |
| "loss": 2.2161, |
| "num_input_tokens_seen": 3253232, |
| "step": 4340 |
| }, |
| { |
| "epoch": 6.715610510046368, |
| "grad_norm": 0.5668134689331055, |
| "learning_rate": 1.4719578775773924e-05, |
| "loss": 2.2821, |
| "num_input_tokens_seen": 3257072, |
| "step": 4345 |
| }, |
| { |
| "epoch": 6.723338485316847, |
| "grad_norm": 0.516550600528717, |
| "learning_rate": 1.465814279494461e-05, |
| "loss": 1.9797, |
| "num_input_tokens_seen": 3260400, |
| "step": 4350 |
| }, |
| { |
| "epoch": 6.731066460587326, |
| "grad_norm": 0.4872724115848541, |
| "learning_rate": 1.4596782070689971e-05, |
| "loss": 1.8425, |
| "num_input_tokens_seen": 3263856, |
| "step": 4355 |
| }, |
| { |
| "epoch": 6.738794435857805, |
| "grad_norm": 0.41736355423927307, |
| "learning_rate": 1.4535497049525371e-05, |
| "loss": 2.1885, |
| "num_input_tokens_seen": 3267888, |
| "step": 4360 |
| }, |
| { |
| "epoch": 6.746522411128284, |
| "grad_norm": 0.4729522168636322, |
| "learning_rate": 1.4474288177415245e-05, |
| "loss": 1.8801, |
| "num_input_tokens_seen": 3271472, |
| "step": 4365 |
| }, |
| { |
| "epoch": 6.7542503863987635, |
| "grad_norm": 0.5210955739021301, |
| "learning_rate": 1.4413155899769954e-05, |
| "loss": 2.24, |
| "num_input_tokens_seen": 3275184, |
| "step": 4370 |
| }, |
| { |
| "epoch": 6.761978361669243, |
| "grad_norm": 0.5247458219528198, |
| "learning_rate": 1.4352100661442448e-05, |
| "loss": 1.8741, |
| "num_input_tokens_seen": 3279408, |
| "step": 4375 |
| }, |
| { |
| "epoch": 6.769706336939722, |
| "grad_norm": 0.47709551453590393, |
| "learning_rate": 1.429112290672508e-05, |
| "loss": 1.6511, |
| "num_input_tokens_seen": 3282800, |
| "step": 4380 |
| }, |
| { |
| "epoch": 6.777434312210201, |
| "grad_norm": 0.5259020924568176, |
| "learning_rate": 1.4230223079346371e-05, |
| "loss": 2.294, |
| "num_input_tokens_seen": 3286128, |
| "step": 4385 |
| }, |
| { |
| "epoch": 6.78516228748068, |
| "grad_norm": 0.611424446105957, |
| "learning_rate": 1.4169401622467768e-05, |
| "loss": 2.2026, |
| "num_input_tokens_seen": 3290160, |
| "step": 4390 |
| }, |
| { |
| "epoch": 6.792890262751159, |
| "grad_norm": 0.5359120965003967, |
| "learning_rate": 1.4108658978680422e-05, |
| "loss": 1.62, |
| "num_input_tokens_seen": 3293616, |
| "step": 4395 |
| }, |
| { |
| "epoch": 6.800618238021638, |
| "grad_norm": 0.535717785358429, |
| "learning_rate": 1.4047995590001975e-05, |
| "loss": 2.0069, |
| "num_input_tokens_seen": 3297456, |
| "step": 4400 |
| }, |
| { |
| "epoch": 6.8083462132921175, |
| "grad_norm": 0.6203454732894897, |
| "learning_rate": 1.3987411897873321e-05, |
| "loss": 2.2569, |
| "num_input_tokens_seen": 3301296, |
| "step": 4405 |
| }, |
| { |
| "epoch": 6.816074188562597, |
| "grad_norm": 0.5075907707214355, |
| "learning_rate": 1.3926908343155462e-05, |
| "loss": 1.9357, |
| "num_input_tokens_seen": 3305072, |
| "step": 4410 |
| }, |
| { |
| "epoch": 6.823802163833076, |
| "grad_norm": 0.39903944730758667, |
| "learning_rate": 1.3866485366126169e-05, |
| "loss": 1.6716, |
| "num_input_tokens_seen": 3308784, |
| "step": 4415 |
| }, |
| { |
| "epoch": 6.831530139103555, |
| "grad_norm": 0.6213854551315308, |
| "learning_rate": 1.3806143406476938e-05, |
| "loss": 2.1229, |
| "num_input_tokens_seen": 3313072, |
| "step": 4420 |
| }, |
| { |
| "epoch": 6.839258114374034, |
| "grad_norm": 0.6845709681510925, |
| "learning_rate": 1.3745882903309637e-05, |
| "loss": 1.9561, |
| "num_input_tokens_seen": 3316912, |
| "step": 4425 |
| }, |
| { |
| "epoch": 6.846986089644513, |
| "grad_norm": 0.4705151617527008, |
| "learning_rate": 1.3685704295133451e-05, |
| "loss": 1.5797, |
| "num_input_tokens_seen": 3321584, |
| "step": 4430 |
| }, |
| { |
| "epoch": 6.854714064914992, |
| "grad_norm": 0.5121558308601379, |
| "learning_rate": 1.362560801986158e-05, |
| "loss": 2.215, |
| "num_input_tokens_seen": 3325168, |
| "step": 4435 |
| }, |
| { |
| "epoch": 6.8624420401854715, |
| "grad_norm": 0.45194992423057556, |
| "learning_rate": 1.356559451480811e-05, |
| "loss": 2.1466, |
| "num_input_tokens_seen": 3329584, |
| "step": 4440 |
| }, |
| { |
| "epoch": 6.87017001545595, |
| "grad_norm": 0.5940639972686768, |
| "learning_rate": 1.3505664216684824e-05, |
| "loss": 1.8275, |
| "num_input_tokens_seen": 3334064, |
| "step": 4445 |
| }, |
| { |
| "epoch": 6.87789799072643, |
| "grad_norm": 0.5700973272323608, |
| "learning_rate": 1.3445817561598002e-05, |
| "loss": 2.047, |
| "num_input_tokens_seen": 3338032, |
| "step": 4450 |
| }, |
| { |
| "epoch": 6.885625965996908, |
| "grad_norm": 0.44290101528167725, |
| "learning_rate": 1.3386054985045271e-05, |
| "loss": 2.1383, |
| "num_input_tokens_seen": 3341488, |
| "step": 4455 |
| }, |
| { |
| "epoch": 6.893353941267388, |
| "grad_norm": 0.34908556938171387, |
| "learning_rate": 1.3326376921912431e-05, |
| "loss": 1.6381, |
| "num_input_tokens_seen": 3344816, |
| "step": 4460 |
| }, |
| { |
| "epoch": 6.901081916537867, |
| "grad_norm": 0.4741649329662323, |
| "learning_rate": 1.3266783806470279e-05, |
| "loss": 1.8414, |
| "num_input_tokens_seen": 3348784, |
| "step": 4465 |
| }, |
| { |
| "epoch": 6.908809891808346, |
| "grad_norm": 0.432268887758255, |
| "learning_rate": 1.3207276072371466e-05, |
| "loss": 1.8426, |
| "num_input_tokens_seen": 3352624, |
| "step": 4470 |
| }, |
| { |
| "epoch": 6.916537867078826, |
| "grad_norm": 0.4336986839771271, |
| "learning_rate": 1.3147854152647315e-05, |
| "loss": 2.1196, |
| "num_input_tokens_seen": 3356784, |
| "step": 4475 |
| }, |
| { |
| "epoch": 6.924265842349304, |
| "grad_norm": 0.4205768406391144, |
| "learning_rate": 1.308851847970473e-05, |
| "loss": 1.763, |
| "num_input_tokens_seen": 3361008, |
| "step": 4480 |
| }, |
| { |
| "epoch": 6.931993817619784, |
| "grad_norm": 0.44432491064071655, |
| "learning_rate": 1.3029269485322937e-05, |
| "loss": 1.8207, |
| "num_input_tokens_seen": 3364912, |
| "step": 4485 |
| }, |
| { |
| "epoch": 6.939721792890262, |
| "grad_norm": 0.4959534704685211, |
| "learning_rate": 1.2970107600650483e-05, |
| "loss": 1.9731, |
| "num_input_tokens_seen": 3369072, |
| "step": 4490 |
| }, |
| { |
| "epoch": 6.947449768160742, |
| "grad_norm": 0.40225279331207275, |
| "learning_rate": 1.2911033256201965e-05, |
| "loss": 1.9828, |
| "num_input_tokens_seen": 3372464, |
| "step": 4495 |
| }, |
| { |
| "epoch": 6.955177743431221, |
| "grad_norm": 0.44361382722854614, |
| "learning_rate": 1.2852046881855015e-05, |
| "loss": 2.0853, |
| "num_input_tokens_seen": 3376240, |
| "step": 4500 |
| }, |
| { |
| "epoch": 6.9629057187017, |
| "grad_norm": 0.5393215417861938, |
| "learning_rate": 1.279314890684708e-05, |
| "loss": 1.9504, |
| "num_input_tokens_seen": 3380464, |
| "step": 4505 |
| }, |
| { |
| "epoch": 6.97063369397218, |
| "grad_norm": 0.7672788500785828, |
| "learning_rate": 1.2734339759772341e-05, |
| "loss": 2.0457, |
| "num_input_tokens_seen": 3383792, |
| "step": 4510 |
| }, |
| { |
| "epoch": 6.978361669242658, |
| "grad_norm": 0.4358603060245514, |
| "learning_rate": 1.2675619868578592e-05, |
| "loss": 2.0066, |
| "num_input_tokens_seen": 3387504, |
| "step": 4515 |
| }, |
| { |
| "epoch": 6.986089644513138, |
| "grad_norm": 0.48252299427986145, |
| "learning_rate": 1.2616989660564127e-05, |
| "loss": 1.7536, |
| "num_input_tokens_seen": 3390896, |
| "step": 4520 |
| }, |
| { |
| "epoch": 6.993817619783616, |
| "grad_norm": 0.688645601272583, |
| "learning_rate": 1.2558449562374614e-05, |
| "loss": 1.9599, |
| "num_input_tokens_seen": 3395248, |
| "step": 4525 |
| }, |
| { |
| "epoch": 7.001545595054096, |
| "grad_norm": 0.4696773290634155, |
| "learning_rate": 1.2500000000000006e-05, |
| "loss": 1.9528, |
| "num_input_tokens_seen": 3399200, |
| "step": 4530 |
| }, |
| { |
| "epoch": 7.0092735703245745, |
| "grad_norm": 0.40950486063957214, |
| "learning_rate": 1.2441641398771431e-05, |
| "loss": 1.7298, |
| "num_input_tokens_seen": 3402272, |
| "step": 4535 |
| }, |
| { |
| "epoch": 7.0108191653786704, |
| "eval_loss": 2.0646586418151855, |
| "eval_runtime": 9.85, |
| "eval_samples_per_second": 58.376, |
| "eval_steps_per_second": 7.31, |
| "num_input_tokens_seen": 3403040, |
| "step": 4536 |
| }, |
| { |
| "epoch": 7.017001545595054, |
| "grad_norm": 0.3565578758716583, |
| "learning_rate": 1.2383374183358135e-05, |
| "loss": 2.2223, |
| "num_input_tokens_seen": 3406304, |
| "step": 4540 |
| }, |
| { |
| "epoch": 7.024729520865534, |
| "grad_norm": 0.4340266287326813, |
| "learning_rate": 1.2325198777764297e-05, |
| "loss": 1.9125, |
| "num_input_tokens_seen": 3409824, |
| "step": 4545 |
| }, |
| { |
| "epoch": 7.032457496136012, |
| "grad_norm": 0.569847822189331, |
| "learning_rate": 1.2267115605326076e-05, |
| "loss": 2.0876, |
| "num_input_tokens_seen": 3413600, |
| "step": 4550 |
| }, |
| { |
| "epoch": 7.040185471406492, |
| "grad_norm": 0.5271437764167786, |
| "learning_rate": 1.2209125088708395e-05, |
| "loss": 2.2137, |
| "num_input_tokens_seen": 3417248, |
| "step": 4555 |
| }, |
| { |
| "epoch": 7.04791344667697, |
| "grad_norm": 0.5423272252082825, |
| "learning_rate": 1.2151227649901986e-05, |
| "loss": 1.9259, |
| "num_input_tokens_seen": 3420512, |
| "step": 4560 |
| }, |
| { |
| "epoch": 7.05564142194745, |
| "grad_norm": 0.36704373359680176, |
| "learning_rate": 1.2093423710220231e-05, |
| "loss": 1.9247, |
| "num_input_tokens_seen": 3424096, |
| "step": 4565 |
| }, |
| { |
| "epoch": 7.063369397217929, |
| "grad_norm": 0.5190703868865967, |
| "learning_rate": 1.203571369029614e-05, |
| "loss": 1.765, |
| "num_input_tokens_seen": 3427680, |
| "step": 4570 |
| }, |
| { |
| "epoch": 7.071097372488408, |
| "grad_norm": 0.8384991884231567, |
| "learning_rate": 1.1978098010079275e-05, |
| "loss": 1.9814, |
| "num_input_tokens_seen": 3431712, |
| "step": 4575 |
| }, |
| { |
| "epoch": 7.078825347758887, |
| "grad_norm": 0.5724890232086182, |
| "learning_rate": 1.1920577088832702e-05, |
| "loss": 1.8155, |
| "num_input_tokens_seen": 3434976, |
| "step": 4580 |
| }, |
| { |
| "epoch": 7.086553323029366, |
| "grad_norm": 0.6341719627380371, |
| "learning_rate": 1.1863151345129933e-05, |
| "loss": 1.9678, |
| "num_input_tokens_seen": 3438816, |
| "step": 4585 |
| }, |
| { |
| "epoch": 7.094281298299846, |
| "grad_norm": 0.7012672424316406, |
| "learning_rate": 1.1805821196851886e-05, |
| "loss": 1.8407, |
| "num_input_tokens_seen": 3442400, |
| "step": 4590 |
| }, |
| { |
| "epoch": 7.102009273570324, |
| "grad_norm": 0.5077885985374451, |
| "learning_rate": 1.1748587061183835e-05, |
| "loss": 2.2851, |
| "num_input_tokens_seen": 3445984, |
| "step": 4595 |
| }, |
| { |
| "epoch": 7.109737248840804, |
| "grad_norm": 0.5049794912338257, |
| "learning_rate": 1.1691449354612393e-05, |
| "loss": 2.0633, |
| "num_input_tokens_seen": 3449504, |
| "step": 4600 |
| }, |
| { |
| "epoch": 7.117465224111283, |
| "grad_norm": 0.5647863149642944, |
| "learning_rate": 1.163440849292245e-05, |
| "loss": 1.828, |
| "num_input_tokens_seen": 3453792, |
| "step": 4605 |
| }, |
| { |
| "epoch": 7.125193199381762, |
| "grad_norm": 0.4179993271827698, |
| "learning_rate": 1.1577464891194203e-05, |
| "loss": 1.7997, |
| "num_input_tokens_seen": 3457568, |
| "step": 4610 |
| }, |
| { |
| "epoch": 7.132921174652241, |
| "grad_norm": 0.6215378046035767, |
| "learning_rate": 1.1520618963800043e-05, |
| "loss": 2.0511, |
| "num_input_tokens_seen": 3461152, |
| "step": 4615 |
| }, |
| { |
| "epoch": 7.14064914992272, |
| "grad_norm": 0.442844033241272, |
| "learning_rate": 1.1463871124401657e-05, |
| "loss": 1.8991, |
| "num_input_tokens_seen": 3464864, |
| "step": 4620 |
| }, |
| { |
| "epoch": 7.1483771251932, |
| "grad_norm": 0.5186662077903748, |
| "learning_rate": 1.1407221785946892e-05, |
| "loss": 1.854, |
| "num_input_tokens_seen": 3468128, |
| "step": 4625 |
| }, |
| { |
| "epoch": 7.156105100463678, |
| "grad_norm": 0.6374059319496155, |
| "learning_rate": 1.1350671360666873e-05, |
| "loss": 1.9453, |
| "num_input_tokens_seen": 3471904, |
| "step": 4630 |
| }, |
| { |
| "epoch": 7.163833075734158, |
| "grad_norm": 0.4631650447845459, |
| "learning_rate": 1.1294220260072912e-05, |
| "loss": 1.8227, |
| "num_input_tokens_seen": 3475808, |
| "step": 4635 |
| }, |
| { |
| "epoch": 7.171561051004637, |
| "grad_norm": 0.44500941038131714, |
| "learning_rate": 1.1237868894953554e-05, |
| "loss": 1.9856, |
| "num_input_tokens_seen": 3479392, |
| "step": 4640 |
| }, |
| { |
| "epoch": 7.179289026275116, |
| "grad_norm": 0.4622701406478882, |
| "learning_rate": 1.1181617675371581e-05, |
| "loss": 1.8469, |
| "num_input_tokens_seen": 3483040, |
| "step": 4645 |
| }, |
| { |
| "epoch": 7.187017001545595, |
| "grad_norm": 0.5993435382843018, |
| "learning_rate": 1.112546701066102e-05, |
| "loss": 2.0912, |
| "num_input_tokens_seen": 3487264, |
| "step": 4650 |
| }, |
| { |
| "epoch": 7.194744976816074, |
| "grad_norm": 0.6248143911361694, |
| "learning_rate": 1.1069417309424176e-05, |
| "loss": 2.2415, |
| "num_input_tokens_seen": 3491488, |
| "step": 4655 |
| }, |
| { |
| "epoch": 7.202472952086553, |
| "grad_norm": 0.48824310302734375, |
| "learning_rate": 1.101346897952866e-05, |
| "loss": 1.8952, |
| "num_input_tokens_seen": 3495264, |
| "step": 4660 |
| }, |
| { |
| "epoch": 7.210200927357032, |
| "grad_norm": 0.5553814172744751, |
| "learning_rate": 1.0957622428104394e-05, |
| "loss": 1.7953, |
| "num_input_tokens_seen": 3498592, |
| "step": 4665 |
| }, |
| { |
| "epoch": 7.217928902627512, |
| "grad_norm": 0.65151047706604, |
| "learning_rate": 1.0901878061540712e-05, |
| "loss": 2.0573, |
| "num_input_tokens_seen": 3502560, |
| "step": 4670 |
| }, |
| { |
| "epoch": 7.225656877897991, |
| "grad_norm": 0.4101492464542389, |
| "learning_rate": 1.0846236285483296e-05, |
| "loss": 1.9602, |
| "num_input_tokens_seen": 3506528, |
| "step": 4675 |
| }, |
| { |
| "epoch": 7.23338485316847, |
| "grad_norm": 0.37585628032684326, |
| "learning_rate": 1.079069750483136e-05, |
| "loss": 1.5444, |
| "num_input_tokens_seen": 3510496, |
| "step": 4680 |
| }, |
| { |
| "epoch": 7.241112828438949, |
| "grad_norm": 0.46534785628318787, |
| "learning_rate": 1.0735262123734557e-05, |
| "loss": 2.1299, |
| "num_input_tokens_seen": 3513824, |
| "step": 4685 |
| }, |
| { |
| "epoch": 7.248840803709428, |
| "grad_norm": 0.5972572565078735, |
| "learning_rate": 1.067993054559018e-05, |
| "loss": 2.1535, |
| "num_input_tokens_seen": 3517920, |
| "step": 4690 |
| }, |
| { |
| "epoch": 7.256568778979907, |
| "grad_norm": 0.4854303002357483, |
| "learning_rate": 1.062470317304012e-05, |
| "loss": 1.9984, |
| "num_input_tokens_seen": 3521568, |
| "step": 4695 |
| }, |
| { |
| "epoch": 7.2642967542503865, |
| "grad_norm": 0.4328206479549408, |
| "learning_rate": 1.0569580407967983e-05, |
| "loss": 2.4716, |
| "num_input_tokens_seen": 3525536, |
| "step": 4700 |
| }, |
| { |
| "epoch": 7.272024729520865, |
| "grad_norm": 0.5262501239776611, |
| "learning_rate": 1.0514562651496162e-05, |
| "loss": 2.1431, |
| "num_input_tokens_seen": 3529312, |
| "step": 4705 |
| }, |
| { |
| "epoch": 7.279752704791345, |
| "grad_norm": 0.43528902530670166, |
| "learning_rate": 1.0459650303982912e-05, |
| "loss": 1.8515, |
| "num_input_tokens_seen": 3533536, |
| "step": 4710 |
| }, |
| { |
| "epoch": 7.287480680061824, |
| "grad_norm": 0.5224335789680481, |
| "learning_rate": 1.0404843765019436e-05, |
| "loss": 2.0884, |
| "num_input_tokens_seen": 3537184, |
| "step": 4715 |
| }, |
| { |
| "epoch": 7.295208655332303, |
| "grad_norm": 0.4448750615119934, |
| "learning_rate": 1.0350143433426981e-05, |
| "loss": 1.6565, |
| "num_input_tokens_seen": 3540960, |
| "step": 4720 |
| }, |
| { |
| "epoch": 7.302936630602782, |
| "grad_norm": 0.43913865089416504, |
| "learning_rate": 1.029554970725393e-05, |
| "loss": 1.9423, |
| "num_input_tokens_seen": 3545184, |
| "step": 4725 |
| }, |
| { |
| "epoch": 7.310664605873261, |
| "grad_norm": 0.43090900778770447, |
| "learning_rate": 1.0241062983772939e-05, |
| "loss": 1.8798, |
| "num_input_tokens_seen": 3549024, |
| "step": 4730 |
| }, |
| { |
| "epoch": 7.3183925811437405, |
| "grad_norm": 0.508183479309082, |
| "learning_rate": 1.0186683659477956e-05, |
| "loss": 1.6893, |
| "num_input_tokens_seen": 3553056, |
| "step": 4735 |
| }, |
| { |
| "epoch": 7.326120556414219, |
| "grad_norm": 0.6264638900756836, |
| "learning_rate": 1.0132412130081473e-05, |
| "loss": 1.9907, |
| "num_input_tokens_seen": 3557024, |
| "step": 4740 |
| }, |
| { |
| "epoch": 7.333848531684699, |
| "grad_norm": 0.5664180517196655, |
| "learning_rate": 1.0078248790511492e-05, |
| "loss": 1.7209, |
| "num_input_tokens_seen": 3560544, |
| "step": 4745 |
| }, |
| { |
| "epoch": 7.341576506955178, |
| "grad_norm": 0.45052483677864075, |
| "learning_rate": 1.0024194034908793e-05, |
| "loss": 1.7309, |
| "num_input_tokens_seen": 3564192, |
| "step": 4750 |
| }, |
| { |
| "epoch": 7.349304482225657, |
| "grad_norm": 0.4567320942878723, |
| "learning_rate": 9.970248256623976e-06, |
| "loss": 1.7103, |
| "num_input_tokens_seen": 3567648, |
| "step": 4755 |
| }, |
| { |
| "epoch": 7.357032457496136, |
| "grad_norm": 0.5011356472969055, |
| "learning_rate": 9.916411848214618e-06, |
| "loss": 2.2025, |
| "num_input_tokens_seen": 3571168, |
| "step": 4760 |
| }, |
| { |
| "epoch": 7.364760432766615, |
| "grad_norm": 0.3981023132801056, |
| "learning_rate": 9.86268520144244e-06, |
| "loss": 2.0211, |
| "num_input_tokens_seen": 3575520, |
| "step": 4765 |
| }, |
| { |
| "epoch": 7.3724884080370945, |
| "grad_norm": 0.863990068435669, |
| "learning_rate": 9.809068707270425e-06, |
| "loss": 1.8911, |
| "num_input_tokens_seen": 3579104, |
| "step": 4770 |
| }, |
| { |
| "epoch": 7.380216383307573, |
| "grad_norm": 0.5234079360961914, |
| "learning_rate": 9.755562755859996e-06, |
| "loss": 2.1423, |
| "num_input_tokens_seen": 3582944, |
| "step": 4775 |
| }, |
| { |
| "epoch": 7.387944358578053, |
| "grad_norm": 0.3961091935634613, |
| "learning_rate": 9.702167736568163e-06, |
| "loss": 1.8697, |
| "num_input_tokens_seen": 3586464, |
| "step": 4780 |
| }, |
| { |
| "epoch": 7.395672333848531, |
| "grad_norm": 0.4912174344062805, |
| "learning_rate": 9.6488840379447e-06, |
| "loss": 1.6679, |
| "num_input_tokens_seen": 3590304, |
| "step": 4785 |
| }, |
| { |
| "epoch": 7.403400309119011, |
| "grad_norm": 0.479442834854126, |
| "learning_rate": 9.59571204772931e-06, |
| "loss": 2.1609, |
| "num_input_tokens_seen": 3594016, |
| "step": 4790 |
| }, |
| { |
| "epoch": 7.41112828438949, |
| "grad_norm": 0.550214946269989, |
| "learning_rate": 9.5426521528488e-06, |
| "loss": 1.5704, |
| "num_input_tokens_seen": 3597536, |
| "step": 4795 |
| }, |
| { |
| "epoch": 7.418856259659969, |
| "grad_norm": 0.4294738471508026, |
| "learning_rate": 9.489704739414302e-06, |
| "loss": 2.0111, |
| "num_input_tokens_seen": 3601696, |
| "step": 4800 |
| }, |
| { |
| "epoch": 7.4265842349304485, |
| "grad_norm": 0.512971818447113, |
| "learning_rate": 9.436870192718372e-06, |
| "loss": 1.9792, |
| "num_input_tokens_seen": 3605408, |
| "step": 4805 |
| }, |
| { |
| "epoch": 7.434312210200927, |
| "grad_norm": 0.5744072198867798, |
| "learning_rate": 9.38414889723232e-06, |
| "loss": 1.6386, |
| "num_input_tokens_seen": 3609056, |
| "step": 4810 |
| }, |
| { |
| "epoch": 7.442040185471407, |
| "grad_norm": 0.4001683294773102, |
| "learning_rate": 9.331541236603267e-06, |
| "loss": 2.0762, |
| "num_input_tokens_seen": 3613024, |
| "step": 4815 |
| }, |
| { |
| "epoch": 7.449768160741885, |
| "grad_norm": 0.4479902386665344, |
| "learning_rate": 9.279047593651488e-06, |
| "loss": 2.1537, |
| "num_input_tokens_seen": 3617440, |
| "step": 4820 |
| }, |
| { |
| "epoch": 7.457496136012365, |
| "grad_norm": 0.4362366497516632, |
| "learning_rate": 9.226668350367528e-06, |
| "loss": 2.0465, |
| "num_input_tokens_seen": 3621984, |
| "step": 4825 |
| }, |
| { |
| "epoch": 7.4652241112828435, |
| "grad_norm": 0.4821748733520508, |
| "learning_rate": 9.174403887909466e-06, |
| "loss": 2.0988, |
| "num_input_tokens_seen": 3625696, |
| "step": 4830 |
| }, |
| { |
| "epoch": 7.472952086553323, |
| "grad_norm": 0.4055638313293457, |
| "learning_rate": 9.122254586600138e-06, |
| "loss": 1.8805, |
| "num_input_tokens_seen": 3628960, |
| "step": 4835 |
| }, |
| { |
| "epoch": 7.4806800618238025, |
| "grad_norm": 0.44100403785705566, |
| "learning_rate": 9.070220825924356e-06, |
| "loss": 1.894, |
| "num_input_tokens_seen": 3632416, |
| "step": 4840 |
| }, |
| { |
| "epoch": 7.488408037094281, |
| "grad_norm": 0.4403553903102875, |
| "learning_rate": 9.018302984526161e-06, |
| "loss": 2.2233, |
| "num_input_tokens_seen": 3636576, |
| "step": 4845 |
| }, |
| { |
| "epoch": 7.496136012364761, |
| "grad_norm": 0.4374490976333618, |
| "learning_rate": 8.966501440206063e-06, |
| "loss": 1.9553, |
| "num_input_tokens_seen": 3640288, |
| "step": 4850 |
| }, |
| { |
| "epoch": 7.503863987635239, |
| "grad_norm": 0.49835798144340515, |
| "learning_rate": 8.91481656991828e-06, |
| "loss": 1.8019, |
| "num_input_tokens_seen": 3644320, |
| "step": 4855 |
| }, |
| { |
| "epoch": 7.511591962905719, |
| "grad_norm": 0.559605598449707, |
| "learning_rate": 8.863248749768042e-06, |
| "loss": 2.0465, |
| "num_input_tokens_seen": 3648160, |
| "step": 4860 |
| }, |
| { |
| "epoch": 7.511591962905719, |
| "eval_loss": 2.0179080963134766, |
| "eval_runtime": 9.8471, |
| "eval_samples_per_second": 58.393, |
| "eval_steps_per_second": 7.312, |
| "num_input_tokens_seen": 3648160, |
| "step": 4860 |
| }, |
| { |
| "epoch": 7.5193199381761975, |
| "grad_norm": 0.6090003252029419, |
| "learning_rate": 8.811798355008753e-06, |
| "loss": 1.6761, |
| "num_input_tokens_seen": 3651808, |
| "step": 4865 |
| }, |
| { |
| "epoch": 7.527047913446677, |
| "grad_norm": 0.41798070073127747, |
| "learning_rate": 8.760465760039399e-06, |
| "loss": 1.7734, |
| "num_input_tokens_seen": 3655904, |
| "step": 4870 |
| }, |
| { |
| "epoch": 7.5347758887171565, |
| "grad_norm": 0.5017128586769104, |
| "learning_rate": 8.709251338401681e-06, |
| "loss": 1.7878, |
| "num_input_tokens_seen": 3659680, |
| "step": 4875 |
| }, |
| { |
| "epoch": 7.542503863987635, |
| "grad_norm": 0.6273453831672668, |
| "learning_rate": 8.658155462777418e-06, |
| "loss": 2.0905, |
| "num_input_tokens_seen": 3663264, |
| "step": 4880 |
| }, |
| { |
| "epoch": 7.550231839258115, |
| "grad_norm": 0.45074906945228577, |
| "learning_rate": 8.607178504985759e-06, |
| "loss": 1.9266, |
| "num_input_tokens_seen": 3667424, |
| "step": 4885 |
| }, |
| { |
| "epoch": 7.557959814528593, |
| "grad_norm": 0.41543567180633545, |
| "learning_rate": 8.556320835980503e-06, |
| "loss": 2.0426, |
| "num_input_tokens_seen": 3671008, |
| "step": 4890 |
| }, |
| { |
| "epoch": 7.565687789799073, |
| "grad_norm": 0.4987678825855255, |
| "learning_rate": 8.505582825847397e-06, |
| "loss": 1.7163, |
| "num_input_tokens_seen": 3674976, |
| "step": 4895 |
| }, |
| { |
| "epoch": 7.5734157650695515, |
| "grad_norm": 0.6415022015571594, |
| "learning_rate": 8.454964843801445e-06, |
| "loss": 1.9027, |
| "num_input_tokens_seen": 3678560, |
| "step": 4900 |
| }, |
| { |
| "epoch": 7.581143740340031, |
| "grad_norm": 0.5627877712249756, |
| "learning_rate": 8.404467258184223e-06, |
| "loss": 1.7147, |
| "num_input_tokens_seen": 3682144, |
| "step": 4905 |
| }, |
| { |
| "epoch": 7.58887171561051, |
| "grad_norm": 0.5032637119293213, |
| "learning_rate": 8.354090436461186e-06, |
| "loss": 1.7561, |
| "num_input_tokens_seen": 3685664, |
| "step": 4910 |
| }, |
| { |
| "epoch": 7.596599690880989, |
| "grad_norm": 0.4875337481498718, |
| "learning_rate": 8.303834745219007e-06, |
| "loss": 2.0047, |
| "num_input_tokens_seen": 3689248, |
| "step": 4915 |
| }, |
| { |
| "epoch": 7.604327666151468, |
| "grad_norm": 0.5122409462928772, |
| "learning_rate": 8.25370055016293e-06, |
| "loss": 2.0755, |
| "num_input_tokens_seen": 3693152, |
| "step": 4920 |
| }, |
| { |
| "epoch": 7.612055641421947, |
| "grad_norm": 0.4014969766139984, |
| "learning_rate": 8.203688216114027e-06, |
| "loss": 2.2292, |
| "num_input_tokens_seen": 3697184, |
| "step": 4925 |
| }, |
| { |
| "epoch": 7.619783616692427, |
| "grad_norm": 0.5143061280250549, |
| "learning_rate": 8.153798107006671e-06, |
| "loss": 2.078, |
| "num_input_tokens_seen": 3700768, |
| "step": 4930 |
| }, |
| { |
| "epoch": 7.6275115919629055, |
| "grad_norm": 0.4903722107410431, |
| "learning_rate": 8.10403058588575e-06, |
| "loss": 2.0966, |
| "num_input_tokens_seen": 3705184, |
| "step": 4935 |
| }, |
| { |
| "epoch": 7.635239567233385, |
| "grad_norm": 0.4987504780292511, |
| "learning_rate": 8.054386014904145e-06, |
| "loss": 1.6639, |
| "num_input_tokens_seen": 3709216, |
| "step": 4940 |
| }, |
| { |
| "epoch": 7.642967542503864, |
| "grad_norm": 0.6909155249595642, |
| "learning_rate": 8.004864755320016e-06, |
| "loss": 1.8366, |
| "num_input_tokens_seen": 3713248, |
| "step": 4945 |
| }, |
| { |
| "epoch": 7.650695517774343, |
| "grad_norm": 0.4917794466018677, |
| "learning_rate": 7.955467167494208e-06, |
| "loss": 1.694, |
| "num_input_tokens_seen": 3716832, |
| "step": 4950 |
| }, |
| { |
| "epoch": 7.658423493044822, |
| "grad_norm": 0.5293083786964417, |
| "learning_rate": 7.90619361088761e-06, |
| "loss": 1.9388, |
| "num_input_tokens_seen": 3720608, |
| "step": 4955 |
| }, |
| { |
| "epoch": 7.666151468315301, |
| "grad_norm": 0.4672509431838989, |
| "learning_rate": 7.857044444058562e-06, |
| "loss": 1.6769, |
| "num_input_tokens_seen": 3724256, |
| "step": 4960 |
| }, |
| { |
| "epoch": 7.673879443585781, |
| "grad_norm": 0.6263973116874695, |
| "learning_rate": 7.80802002466023e-06, |
| "loss": 1.9622, |
| "num_input_tokens_seen": 3728032, |
| "step": 4965 |
| }, |
| { |
| "epoch": 7.6816074188562595, |
| "grad_norm": 0.4829172194004059, |
| "learning_rate": 7.759120709437993e-06, |
| "loss": 2.198, |
| "num_input_tokens_seen": 3732640, |
| "step": 4970 |
| }, |
| { |
| "epoch": 7.689335394126739, |
| "grad_norm": 0.42988526821136475, |
| "learning_rate": 7.71034685422688e-06, |
| "loss": 1.8157, |
| "num_input_tokens_seen": 3735968, |
| "step": 4975 |
| }, |
| { |
| "epoch": 7.697063369397218, |
| "grad_norm": 0.3681195378303528, |
| "learning_rate": 7.661698813948953e-06, |
| "loss": 1.6465, |
| "num_input_tokens_seen": 3739808, |
| "step": 4980 |
| }, |
| { |
| "epoch": 7.704791344667697, |
| "grad_norm": 0.5963971018791199, |
| "learning_rate": 7.6131769426107165e-06, |
| "loss": 2.1698, |
| "num_input_tokens_seen": 3743456, |
| "step": 4985 |
| }, |
| { |
| "epoch": 7.712519319938176, |
| "grad_norm": 0.5191236734390259, |
| "learning_rate": 7.564781593300605e-06, |
| "loss": 2.0448, |
| "num_input_tokens_seen": 3746976, |
| "step": 4990 |
| }, |
| { |
| "epoch": 7.720247295208655, |
| "grad_norm": 0.41211771965026855, |
| "learning_rate": 7.516513118186294e-06, |
| "loss": 2.0512, |
| "num_input_tokens_seen": 3751072, |
| "step": 4995 |
| }, |
| { |
| "epoch": 7.727975270479135, |
| "grad_norm": 0.47517818212509155, |
| "learning_rate": 7.468371868512286e-06, |
| "loss": 2.1844, |
| "num_input_tokens_seen": 3755040, |
| "step": 5000 |
| }, |
| { |
| "epoch": 7.7357032457496135, |
| "grad_norm": 0.5340629816055298, |
| "learning_rate": 7.420358194597205e-06, |
| "loss": 2.0041, |
| "num_input_tokens_seen": 3759072, |
| "step": 5005 |
| }, |
| { |
| "epoch": 7.743431221020093, |
| "grad_norm": 0.500800371170044, |
| "learning_rate": 7.37247244583138e-06, |
| "loss": 2.3047, |
| "num_input_tokens_seen": 3762528, |
| "step": 5010 |
| }, |
| { |
| "epoch": 7.751159196290572, |
| "grad_norm": 0.46884685754776, |
| "learning_rate": 7.324714970674212e-06, |
| "loss": 1.9463, |
| "num_input_tokens_seen": 3766176, |
| "step": 5015 |
| }, |
| { |
| "epoch": 7.758887171561051, |
| "grad_norm": 0.419576495885849, |
| "learning_rate": 7.277086116651674e-06, |
| "loss": 1.9457, |
| "num_input_tokens_seen": 3769760, |
| "step": 5020 |
| }, |
| { |
| "epoch": 7.76661514683153, |
| "grad_norm": 0.42754513025283813, |
| "learning_rate": 7.229586230353777e-06, |
| "loss": 2.1629, |
| "num_input_tokens_seen": 3773408, |
| "step": 5025 |
| }, |
| { |
| "epoch": 7.774343122102009, |
| "grad_norm": 0.5824078917503357, |
| "learning_rate": 7.182215657432045e-06, |
| "loss": 2.1221, |
| "num_input_tokens_seen": 3776928, |
| "step": 5030 |
| }, |
| { |
| "epoch": 7.782071097372488, |
| "grad_norm": 0.5148240327835083, |
| "learning_rate": 7.134974742597015e-06, |
| "loss": 1.988, |
| "num_input_tokens_seen": 3780256, |
| "step": 5035 |
| }, |
| { |
| "epoch": 7.789799072642968, |
| "grad_norm": 0.5740344524383545, |
| "learning_rate": 7.087863829615698e-06, |
| "loss": 2.1161, |
| "num_input_tokens_seen": 3784096, |
| "step": 5040 |
| }, |
| { |
| "epoch": 7.797527047913446, |
| "grad_norm": 0.5428912043571472, |
| "learning_rate": 7.0408832613091034e-06, |
| "loss": 2.0609, |
| "num_input_tokens_seen": 3787296, |
| "step": 5045 |
| }, |
| { |
| "epoch": 7.805255023183926, |
| "grad_norm": 0.42774394154548645, |
| "learning_rate": 6.994033379549758e-06, |
| "loss": 2.2296, |
| "num_input_tokens_seen": 3791008, |
| "step": 5050 |
| }, |
| { |
| "epoch": 7.812982998454405, |
| "grad_norm": 0.4487341344356537, |
| "learning_rate": 6.947314525259147e-06, |
| "loss": 2.181, |
| "num_input_tokens_seen": 3794528, |
| "step": 5055 |
| }, |
| { |
| "epoch": 7.820710973724884, |
| "grad_norm": 0.360095739364624, |
| "learning_rate": 6.900727038405344e-06, |
| "loss": 1.9638, |
| "num_input_tokens_seen": 3798048, |
| "step": 5060 |
| }, |
| { |
| "epoch": 7.828438948995363, |
| "grad_norm": 0.5661271810531616, |
| "learning_rate": 6.854271258000414e-06, |
| "loss": 1.9825, |
| "num_input_tokens_seen": 3801760, |
| "step": 5065 |
| }, |
| { |
| "epoch": 7.836166924265842, |
| "grad_norm": 0.48149120807647705, |
| "learning_rate": 6.80794752209806e-06, |
| "loss": 1.9839, |
| "num_input_tokens_seen": 3805280, |
| "step": 5070 |
| }, |
| { |
| "epoch": 7.843894899536322, |
| "grad_norm": 0.4871172606945038, |
| "learning_rate": 6.761756167791083e-06, |
| "loss": 2.031, |
| "num_input_tokens_seen": 3808864, |
| "step": 5075 |
| }, |
| { |
| "epoch": 7.8516228748068, |
| "grad_norm": 0.5680673122406006, |
| "learning_rate": 6.715697531208967e-06, |
| "loss": 1.9571, |
| "num_input_tokens_seen": 3812512, |
| "step": 5080 |
| }, |
| { |
| "epoch": 7.85935085007728, |
| "grad_norm": 0.5056851506233215, |
| "learning_rate": 6.669771947515421e-06, |
| "loss": 1.9788, |
| "num_input_tokens_seen": 3816352, |
| "step": 5085 |
| }, |
| { |
| "epoch": 7.867078825347759, |
| "grad_norm": 0.41278329491615295, |
| "learning_rate": 6.6239797509059424e-06, |
| "loss": 1.6949, |
| "num_input_tokens_seen": 3820000, |
| "step": 5090 |
| }, |
| { |
| "epoch": 7.874806800618238, |
| "grad_norm": 0.43449851870536804, |
| "learning_rate": 6.578321274605384e-06, |
| "loss": 1.7573, |
| "num_input_tokens_seen": 3823584, |
| "step": 5095 |
| }, |
| { |
| "epoch": 7.882534775888717, |
| "grad_norm": 0.5349349975585938, |
| "learning_rate": 6.532796850865539e-06, |
| "loss": 1.8192, |
| "num_input_tokens_seen": 3827296, |
| "step": 5100 |
| }, |
| { |
| "epoch": 7.890262751159196, |
| "grad_norm": 0.5766939520835876, |
| "learning_rate": 6.4874068109626985e-06, |
| "loss": 1.9581, |
| "num_input_tokens_seen": 3830944, |
| "step": 5105 |
| }, |
| { |
| "epoch": 7.897990726429676, |
| "grad_norm": 0.5526953935623169, |
| "learning_rate": 6.442151485195275e-06, |
| "loss": 2.0384, |
| "num_input_tokens_seen": 3834784, |
| "step": 5110 |
| }, |
| { |
| "epoch": 7.905718701700154, |
| "grad_norm": 0.42307910323143005, |
| "learning_rate": 6.397031202881357e-06, |
| "loss": 1.7874, |
| "num_input_tokens_seen": 3838752, |
| "step": 5115 |
| }, |
| { |
| "epoch": 7.913446676970634, |
| "grad_norm": 0.5621873736381531, |
| "learning_rate": 6.352046292356381e-06, |
| "loss": 2.066, |
| "num_input_tokens_seen": 3842272, |
| "step": 5120 |
| }, |
| { |
| "epoch": 7.921174652241113, |
| "grad_norm": 0.5356681942939758, |
| "learning_rate": 6.307197080970634e-06, |
| "loss": 1.7912, |
| "num_input_tokens_seen": 3846368, |
| "step": 5125 |
| }, |
| { |
| "epoch": 7.928902627511592, |
| "grad_norm": 0.5139599442481995, |
| "learning_rate": 6.262483895087002e-06, |
| "loss": 2.0138, |
| "num_input_tokens_seen": 3850272, |
| "step": 5130 |
| }, |
| { |
| "epoch": 7.936630602782071, |
| "grad_norm": 0.45896539092063904, |
| "learning_rate": 6.21790706007846e-06, |
| "loss": 2.1746, |
| "num_input_tokens_seen": 3853600, |
| "step": 5135 |
| }, |
| { |
| "epoch": 7.94435857805255, |
| "grad_norm": 0.4682660698890686, |
| "learning_rate": 6.173466900325839e-06, |
| "loss": 1.8312, |
| "num_input_tokens_seen": 3857312, |
| "step": 5140 |
| }, |
| { |
| "epoch": 7.95208655332303, |
| "grad_norm": 0.49920710921287537, |
| "learning_rate": 6.129163739215352e-06, |
| "loss": 1.9881, |
| "num_input_tokens_seen": 3860960, |
| "step": 5145 |
| }, |
| { |
| "epoch": 7.959814528593508, |
| "grad_norm": 0.3703214228153229, |
| "learning_rate": 6.084997899136311e-06, |
| "loss": 1.7246, |
| "num_input_tokens_seen": 3864736, |
| "step": 5150 |
| }, |
| { |
| "epoch": 7.967542503863988, |
| "grad_norm": 0.43374818563461304, |
| "learning_rate": 6.040969701478743e-06, |
| "loss": 1.8493, |
| "num_input_tokens_seen": 3868640, |
| "step": 5155 |
| }, |
| { |
| "epoch": 7.975270479134466, |
| "grad_norm": 0.49463748931884766, |
| "learning_rate": 5.997079466631081e-06, |
| "loss": 1.6057, |
| "num_input_tokens_seen": 3872224, |
| "step": 5160 |
| }, |
| { |
| "epoch": 7.982998454404946, |
| "grad_norm": 0.4423999786376953, |
| "learning_rate": 5.953327513977805e-06, |
| "loss": 1.9082, |
| "num_input_tokens_seen": 3876512, |
| "step": 5165 |
| }, |
| { |
| "epoch": 7.990726429675425, |
| "grad_norm": 0.5020461082458496, |
| "learning_rate": 5.909714161897137e-06, |
| "loss": 1.885, |
| "num_input_tokens_seen": 3880224, |
| "step": 5170 |
| }, |
| { |
| "epoch": 7.998454404945904, |
| "grad_norm": 0.6084979772567749, |
| "learning_rate": 5.8662397277587125e-06, |
| "loss": 1.7686, |
| "num_input_tokens_seen": 3884384, |
| "step": 5175 |
| }, |
| { |
| "epoch": 8.006182380216384, |
| "grad_norm": 0.5096596479415894, |
| "learning_rate": 5.822904527921285e-06, |
| "loss": 1.6594, |
| "num_input_tokens_seen": 3887664, |
| "step": 5180 |
| }, |
| { |
| "epoch": 8.012364760432767, |
| "eval_loss": 1.9898368120193481, |
| "eval_runtime": 9.8217, |
| "eval_samples_per_second": 58.544, |
| "eval_steps_per_second": 7.331, |
| "num_input_tokens_seen": 3890608, |
| "step": 5184 |
| }, |
| { |
| "epoch": 8.013910355486862, |
| "grad_norm": 0.5536606311798096, |
| "learning_rate": 5.779708877730411e-06, |
| "loss": 1.7109, |
| "num_input_tokens_seen": 3891568, |
| "step": 5185 |
| }, |
| { |
| "epoch": 8.021638330757341, |
| "grad_norm": 0.4795739948749542, |
| "learning_rate": 5.73665309151615e-06, |
| "loss": 1.8976, |
| "num_input_tokens_seen": 3895472, |
| "step": 5190 |
| }, |
| { |
| "epoch": 8.029366306027821, |
| "grad_norm": 0.4783843457698822, |
| "learning_rate": 5.6937374825908e-06, |
| "loss": 2.0581, |
| "num_input_tokens_seen": 3899312, |
| "step": 5195 |
| }, |
| { |
| "epoch": 8.0370942812983, |
| "grad_norm": 0.4617125988006592, |
| "learning_rate": 5.650962363246592e-06, |
| "loss": 2.0033, |
| "num_input_tokens_seen": 3902448, |
| "step": 5200 |
| }, |
| { |
| "epoch": 8.044822256568779, |
| "grad_norm": 0.6532540917396545, |
| "learning_rate": 5.6083280447534585e-06, |
| "loss": 1.481, |
| "num_input_tokens_seen": 3906160, |
| "step": 5205 |
| }, |
| { |
| "epoch": 8.052550231839259, |
| "grad_norm": 0.511972963809967, |
| "learning_rate": 5.5658348373566815e-06, |
| "loss": 1.9637, |
| "num_input_tokens_seen": 3910064, |
| "step": 5210 |
| }, |
| { |
| "epoch": 8.060278207109738, |
| "grad_norm": 0.42409616708755493, |
| "learning_rate": 5.523483050274766e-06, |
| "loss": 1.7105, |
| "num_input_tokens_seen": 3913648, |
| "step": 5215 |
| }, |
| { |
| "epoch": 8.068006182380216, |
| "grad_norm": 0.46803751587867737, |
| "learning_rate": 5.481272991697045e-06, |
| "loss": 2.1197, |
| "num_input_tokens_seen": 3917296, |
| "step": 5220 |
| }, |
| { |
| "epoch": 8.075734157650695, |
| "grad_norm": 0.43307095766067505, |
| "learning_rate": 5.439204968781566e-06, |
| "loss": 1.851, |
| "num_input_tokens_seen": 3920624, |
| "step": 5225 |
| }, |
| { |
| "epoch": 8.083462132921175, |
| "grad_norm": 0.43012967705726624, |
| "learning_rate": 5.397279287652771e-06, |
| "loss": 1.797, |
| "num_input_tokens_seen": 3924272, |
| "step": 5230 |
| }, |
| { |
| "epoch": 8.091190108191654, |
| "grad_norm": 0.5172163248062134, |
| "learning_rate": 5.355496253399294e-06, |
| "loss": 1.7261, |
| "num_input_tokens_seen": 3928176, |
| "step": 5235 |
| }, |
| { |
| "epoch": 8.098918083462133, |
| "grad_norm": 0.5393996834754944, |
| "learning_rate": 5.313856170071754e-06, |
| "loss": 2.2175, |
| "num_input_tokens_seen": 3932400, |
| "step": 5240 |
| }, |
| { |
| "epoch": 8.106646058732611, |
| "grad_norm": 0.42386066913604736, |
| "learning_rate": 5.272359340680524e-06, |
| "loss": 2.1419, |
| "num_input_tokens_seen": 3936240, |
| "step": 5245 |
| }, |
| { |
| "epoch": 8.114374034003092, |
| "grad_norm": 0.44247618317604065, |
| "learning_rate": 5.231006067193539e-06, |
| "loss": 1.7462, |
| "num_input_tokens_seen": 3940080, |
| "step": 5250 |
| }, |
| { |
| "epoch": 8.12210200927357, |
| "grad_norm": 0.5681042075157166, |
| "learning_rate": 5.189796650534093e-06, |
| "loss": 2.2102, |
| "num_input_tokens_seen": 3944048, |
| "step": 5255 |
| }, |
| { |
| "epoch": 8.129829984544049, |
| "grad_norm": 0.4637432396411896, |
| "learning_rate": 5.1487313905786346e-06, |
| "loss": 1.6742, |
| "num_input_tokens_seen": 3947824, |
| "step": 5260 |
| }, |
| { |
| "epoch": 8.13755795981453, |
| "grad_norm": 0.5209901928901672, |
| "learning_rate": 5.107810586154637e-06, |
| "loss": 2.2237, |
| "num_input_tokens_seen": 3951280, |
| "step": 5265 |
| }, |
| { |
| "epoch": 8.145285935085008, |
| "grad_norm": 0.5490251779556274, |
| "learning_rate": 5.0670345350383346e-06, |
| "loss": 1.7778, |
| "num_input_tokens_seen": 3955248, |
| "step": 5270 |
| }, |
| { |
| "epoch": 8.153013910355487, |
| "grad_norm": 0.4561353027820587, |
| "learning_rate": 5.026403533952659e-06, |
| "loss": 2.2651, |
| "num_input_tokens_seen": 3959216, |
| "step": 5275 |
| }, |
| { |
| "epoch": 8.160741885625965, |
| "grad_norm": 0.5111833214759827, |
| "learning_rate": 4.98591787856498e-06, |
| "loss": 1.8427, |
| "num_input_tokens_seen": 3962608, |
| "step": 5280 |
| }, |
| { |
| "epoch": 8.168469860896446, |
| "grad_norm": 0.4493984580039978, |
| "learning_rate": 4.945577863485046e-06, |
| "loss": 1.8568, |
| "num_input_tokens_seen": 3966320, |
| "step": 5285 |
| }, |
| { |
| "epoch": 8.176197836166924, |
| "grad_norm": 0.5941608548164368, |
| "learning_rate": 4.905383782262768e-06, |
| "loss": 1.8362, |
| "num_input_tokens_seen": 3970160, |
| "step": 5290 |
| }, |
| { |
| "epoch": 8.183925811437403, |
| "grad_norm": 0.5745226740837097, |
| "learning_rate": 4.865335927386125e-06, |
| "loss": 1.8006, |
| "num_input_tokens_seen": 3973872, |
| "step": 5295 |
| }, |
| { |
| "epoch": 8.191653786707883, |
| "grad_norm": 0.49066871404647827, |
| "learning_rate": 4.825434590279015e-06, |
| "loss": 1.887, |
| "num_input_tokens_seen": 3977904, |
| "step": 5300 |
| }, |
| { |
| "epoch": 8.199381761978362, |
| "grad_norm": 0.6205068230628967, |
| "learning_rate": 4.785680061299153e-06, |
| "loss": 1.8471, |
| "num_input_tokens_seen": 3981552, |
| "step": 5305 |
| }, |
| { |
| "epoch": 8.20710973724884, |
| "grad_norm": 0.7366943955421448, |
| "learning_rate": 4.746072629735932e-06, |
| "loss": 1.8981, |
| "num_input_tokens_seen": 3985200, |
| "step": 5310 |
| }, |
| { |
| "epoch": 8.21483771251932, |
| "grad_norm": 0.613416850566864, |
| "learning_rate": 4.706612583808348e-06, |
| "loss": 1.8575, |
| "num_input_tokens_seen": 3989424, |
| "step": 5315 |
| }, |
| { |
| "epoch": 8.2225656877898, |
| "grad_norm": 0.41795915365219116, |
| "learning_rate": 4.6673002106628786e-06, |
| "loss": 1.6017, |
| "num_input_tokens_seen": 3993264, |
| "step": 5320 |
| }, |
| { |
| "epoch": 8.230293663060278, |
| "grad_norm": 0.5964676737785339, |
| "learning_rate": 4.628135796371402e-06, |
| "loss": 2.2189, |
| "num_input_tokens_seen": 3997232, |
| "step": 5325 |
| }, |
| { |
| "epoch": 8.238021638330757, |
| "grad_norm": 0.49453970789909363, |
| "learning_rate": 4.5891196259291165e-06, |
| "loss": 1.9771, |
| "num_input_tokens_seen": 4000816, |
| "step": 5330 |
| }, |
| { |
| "epoch": 8.245749613601237, |
| "grad_norm": 0.592045783996582, |
| "learning_rate": 4.550251983252485e-06, |
| "loss": 2.2162, |
| "num_input_tokens_seen": 4004720, |
| "step": 5335 |
| }, |
| { |
| "epoch": 8.253477588871716, |
| "grad_norm": 0.6761609315872192, |
| "learning_rate": 4.511533151177111e-06, |
| "loss": 1.7625, |
| "num_input_tokens_seen": 4008752, |
| "step": 5340 |
| }, |
| { |
| "epoch": 8.261205564142195, |
| "grad_norm": 0.6841716766357422, |
| "learning_rate": 4.472963411455764e-06, |
| "loss": 1.8694, |
| "num_input_tokens_seen": 4012272, |
| "step": 5345 |
| }, |
| { |
| "epoch": 8.268933539412673, |
| "grad_norm": 0.4839349687099457, |
| "learning_rate": 4.434543044756237e-06, |
| "loss": 1.9231, |
| "num_input_tokens_seen": 4015920, |
| "step": 5350 |
| }, |
| { |
| "epoch": 8.276661514683154, |
| "grad_norm": 0.6607086658477783, |
| "learning_rate": 4.396272330659398e-06, |
| "loss": 1.7466, |
| "num_input_tokens_seen": 4019504, |
| "step": 5355 |
| }, |
| { |
| "epoch": 8.284389489953632, |
| "grad_norm": 0.5170435309410095, |
| "learning_rate": 4.35815154765708e-06, |
| "loss": 2.2421, |
| "num_input_tokens_seen": 4023472, |
| "step": 5360 |
| }, |
| { |
| "epoch": 8.292117465224111, |
| "grad_norm": 0.6370357871055603, |
| "learning_rate": 4.32018097315009e-06, |
| "loss": 1.6066, |
| "num_input_tokens_seen": 4027120, |
| "step": 5365 |
| }, |
| { |
| "epoch": 8.29984544049459, |
| "grad_norm": 0.4809553027153015, |
| "learning_rate": 4.28236088344619e-06, |
| "loss": 2.2748, |
| "num_input_tokens_seen": 4031152, |
| "step": 5370 |
| }, |
| { |
| "epoch": 8.30757341576507, |
| "grad_norm": 0.44451555609703064, |
| "learning_rate": 4.244691553758076e-06, |
| "loss": 2.1046, |
| "num_input_tokens_seen": 4034864, |
| "step": 5375 |
| }, |
| { |
| "epoch": 8.315301391035549, |
| "grad_norm": 0.4132705628871918, |
| "learning_rate": 4.207173258201375e-06, |
| "loss": 1.739, |
| "num_input_tokens_seen": 4038384, |
| "step": 5380 |
| }, |
| { |
| "epoch": 8.323029366306027, |
| "grad_norm": 0.5284491181373596, |
| "learning_rate": 4.1698062697926645e-06, |
| "loss": 1.6406, |
| "num_input_tokens_seen": 4041840, |
| "step": 5385 |
| }, |
| { |
| "epoch": 8.330757341576508, |
| "grad_norm": 0.5363196730613708, |
| "learning_rate": 4.132590860447463e-06, |
| "loss": 2.1619, |
| "num_input_tokens_seen": 4045808, |
| "step": 5390 |
| }, |
| { |
| "epoch": 8.338485316846986, |
| "grad_norm": 0.43313705921173096, |
| "learning_rate": 4.095527300978297e-06, |
| "loss": 1.8315, |
| "num_input_tokens_seen": 4049648, |
| "step": 5395 |
| }, |
| { |
| "epoch": 8.346213292117465, |
| "grad_norm": 0.504666268825531, |
| "learning_rate": 4.05861586109264e-06, |
| "loss": 1.8599, |
| "num_input_tokens_seen": 4053680, |
| "step": 5400 |
| }, |
| { |
| "epoch": 8.353941267387944, |
| "grad_norm": 0.6796596050262451, |
| "learning_rate": 4.021856809391075e-06, |
| "loss": 1.8656, |
| "num_input_tokens_seen": 4056944, |
| "step": 5405 |
| }, |
| { |
| "epoch": 8.361669242658424, |
| "grad_norm": 0.3838791847229004, |
| "learning_rate": 3.985250413365213e-06, |
| "loss": 1.8435, |
| "num_input_tokens_seen": 4061040, |
| "step": 5410 |
| }, |
| { |
| "epoch": 8.369397217928903, |
| "grad_norm": 0.492909699678421, |
| "learning_rate": 3.948796939395849e-06, |
| "loss": 1.9219, |
| "num_input_tokens_seen": 4065456, |
| "step": 5415 |
| }, |
| { |
| "epoch": 8.377125193199381, |
| "grad_norm": 0.42821991443634033, |
| "learning_rate": 3.912496652750958e-06, |
| "loss": 1.8958, |
| "num_input_tokens_seen": 4069296, |
| "step": 5420 |
| }, |
| { |
| "epoch": 8.384853168469862, |
| "grad_norm": 0.5378724932670593, |
| "learning_rate": 3.8763498175837965e-06, |
| "loss": 1.7691, |
| "num_input_tokens_seen": 4072880, |
| "step": 5425 |
| }, |
| { |
| "epoch": 8.39258114374034, |
| "grad_norm": 0.5229602456092834, |
| "learning_rate": 3.840356696930969e-06, |
| "loss": 1.9893, |
| "num_input_tokens_seen": 4076528, |
| "step": 5430 |
| }, |
| { |
| "epoch": 8.400309119010819, |
| "grad_norm": 0.5723522305488586, |
| "learning_rate": 3.8045175527105127e-06, |
| "loss": 1.7024, |
| "num_input_tokens_seen": 4079856, |
| "step": 5435 |
| }, |
| { |
| "epoch": 8.408037094281298, |
| "grad_norm": 0.45156463980674744, |
| "learning_rate": 3.7688326457200025e-06, |
| "loss": 1.9089, |
| "num_input_tokens_seen": 4083504, |
| "step": 5440 |
| }, |
| { |
| "epoch": 8.415765069551778, |
| "grad_norm": 0.3913841247558594, |
| "learning_rate": 3.7333022356346365e-06, |
| "loss": 1.6517, |
| "num_input_tokens_seen": 4086576, |
| "step": 5445 |
| }, |
| { |
| "epoch": 8.423493044822257, |
| "grad_norm": 0.6192737817764282, |
| "learning_rate": 3.6979265810053566e-06, |
| "loss": 1.9048, |
| "num_input_tokens_seen": 4090352, |
| "step": 5450 |
| }, |
| { |
| "epoch": 8.431221020092735, |
| "grad_norm": 0.3701123595237732, |
| "learning_rate": 3.6627059392569883e-06, |
| "loss": 1.8114, |
| "num_input_tokens_seen": 4094576, |
| "step": 5455 |
| }, |
| { |
| "epoch": 8.438948995363216, |
| "grad_norm": 0.5786436200141907, |
| "learning_rate": 3.6276405666863023e-06, |
| "loss": 2.2703, |
| "num_input_tokens_seen": 4098160, |
| "step": 5460 |
| }, |
| { |
| "epoch": 8.446676970633694, |
| "grad_norm": 0.5972141623497009, |
| "learning_rate": 3.592730718460241e-06, |
| "loss": 1.8991, |
| "num_input_tokens_seen": 4102192, |
| "step": 5465 |
| }, |
| { |
| "epoch": 8.454404945904173, |
| "grad_norm": 0.6472261548042297, |
| "learning_rate": 3.5579766486139643e-06, |
| "loss": 2.0854, |
| "num_input_tokens_seen": 4105904, |
| "step": 5470 |
| }, |
| { |
| "epoch": 8.462132921174652, |
| "grad_norm": 0.5761744976043701, |
| "learning_rate": 3.523378610049091e-06, |
| "loss": 1.6798, |
| "num_input_tokens_seen": 4109808, |
| "step": 5475 |
| }, |
| { |
| "epoch": 8.469860896445132, |
| "grad_norm": 0.618687093257904, |
| "learning_rate": 3.4889368545317963e-06, |
| "loss": 2.0827, |
| "num_input_tokens_seen": 4113520, |
| "step": 5480 |
| }, |
| { |
| "epoch": 8.47758887171561, |
| "grad_norm": 0.5880991220474243, |
| "learning_rate": 3.4546516326910027e-06, |
| "loss": 1.8648, |
| "num_input_tokens_seen": 4117104, |
| "step": 5485 |
| }, |
| { |
| "epoch": 8.48531684698609, |
| "grad_norm": 0.43996042013168335, |
| "learning_rate": 3.420523194016556e-06, |
| "loss": 2.0635, |
| "num_input_tokens_seen": 4120816, |
| "step": 5490 |
| }, |
| { |
| "epoch": 8.493044822256568, |
| "grad_norm": 0.4868849217891693, |
| "learning_rate": 3.386551786857409e-06, |
| "loss": 1.7675, |
| "num_input_tokens_seen": 4124784, |
| "step": 5495 |
| }, |
| { |
| "epoch": 8.500772797527048, |
| "grad_norm": 0.4164232313632965, |
| "learning_rate": 3.3527376584198104e-06, |
| "loss": 1.8835, |
| "num_input_tokens_seen": 4128880, |
| "step": 5500 |
| }, |
| { |
| "epoch": 8.508500772797527, |
| "grad_norm": 0.6961061358451843, |
| "learning_rate": 3.3190810547655105e-06, |
| "loss": 2.1859, |
| "num_input_tokens_seen": 4133040, |
| "step": 5505 |
| }, |
| { |
| "epoch": 8.513137557959814, |
| "eval_loss": 1.9703497886657715, |
| "eval_runtime": 9.8287, |
| "eval_samples_per_second": 58.502, |
| "eval_steps_per_second": 7.325, |
| "num_input_tokens_seen": 4134704, |
| "step": 5508 |
| }, |
| { |
| "epoch": 8.516228748068006, |
| "grad_norm": 0.4723997414112091, |
| "learning_rate": 3.2855822208099683e-06, |
| "loss": 1.6973, |
| "num_input_tokens_seen": 4136240, |
| "step": 5510 |
| }, |
| { |
| "epoch": 8.523956723338486, |
| "grad_norm": 0.5245389342308044, |
| "learning_rate": 3.2522414003205713e-06, |
| "loss": 1.8515, |
| "num_input_tokens_seen": 4140144, |
| "step": 5515 |
| }, |
| { |
| "epoch": 8.531684698608965, |
| "grad_norm": 0.48844999074935913, |
| "learning_rate": 3.2190588359148537e-06, |
| "loss": 1.6946, |
| "num_input_tokens_seen": 4143536, |
| "step": 5520 |
| }, |
| { |
| "epoch": 8.539412673879443, |
| "grad_norm": 0.5237007737159729, |
| "learning_rate": 3.1860347690587573e-06, |
| "loss": 2.2678, |
| "num_input_tokens_seen": 4147440, |
| "step": 5525 |
| }, |
| { |
| "epoch": 8.547140649149922, |
| "grad_norm": 0.6524617671966553, |
| "learning_rate": 3.153169440064818e-06, |
| "loss": 1.9823, |
| "num_input_tokens_seen": 4151024, |
| "step": 5530 |
| }, |
| { |
| "epoch": 8.554868624420402, |
| "grad_norm": 0.4519035518169403, |
| "learning_rate": 3.1204630880904944e-06, |
| "loss": 1.8902, |
| "num_input_tokens_seen": 4154672, |
| "step": 5535 |
| }, |
| { |
| "epoch": 8.562596599690881, |
| "grad_norm": 0.6340076327323914, |
| "learning_rate": 3.0879159511363525e-06, |
| "loss": 1.7432, |
| "num_input_tokens_seen": 4158512, |
| "step": 5540 |
| }, |
| { |
| "epoch": 8.57032457496136, |
| "grad_norm": 0.5062048435211182, |
| "learning_rate": 3.0555282660443914e-06, |
| "loss": 1.9369, |
| "num_input_tokens_seen": 4162224, |
| "step": 5545 |
| }, |
| { |
| "epoch": 8.578052550231838, |
| "grad_norm": 0.5006168484687805, |
| "learning_rate": 3.0233002684962872e-06, |
| "loss": 1.9589, |
| "num_input_tokens_seen": 4165872, |
| "step": 5550 |
| }, |
| { |
| "epoch": 8.585780525502319, |
| "grad_norm": 0.48547643423080444, |
| "learning_rate": 2.9912321930116836e-06, |
| "loss": 1.8006, |
| "num_input_tokens_seen": 4170160, |
| "step": 5555 |
| }, |
| { |
| "epoch": 8.593508500772797, |
| "grad_norm": 0.4373511075973511, |
| "learning_rate": 2.9593242729464926e-06, |
| "loss": 1.7253, |
| "num_input_tokens_seen": 4173872, |
| "step": 5560 |
| }, |
| { |
| "epoch": 8.601236476043276, |
| "grad_norm": 0.5265281796455383, |
| "learning_rate": 2.927576740491195e-06, |
| "loss": 1.9011, |
| "num_input_tokens_seen": 4177712, |
| "step": 5565 |
| }, |
| { |
| "epoch": 8.608964451313756, |
| "grad_norm": 0.4482889175415039, |
| "learning_rate": 2.8959898266691434e-06, |
| "loss": 1.5769, |
| "num_input_tokens_seen": 4182192, |
| "step": 5570 |
| }, |
| { |
| "epoch": 8.616692426584235, |
| "grad_norm": 0.4474455714225769, |
| "learning_rate": 2.8645637613348904e-06, |
| "loss": 1.7703, |
| "num_input_tokens_seen": 4185776, |
| "step": 5575 |
| }, |
| { |
| "epoch": 8.624420401854714, |
| "grad_norm": 0.553152859210968, |
| "learning_rate": 2.833298773172502e-06, |
| "loss": 2.245, |
| "num_input_tokens_seen": 4189488, |
| "step": 5580 |
| }, |
| { |
| "epoch": 8.632148377125194, |
| "grad_norm": 0.5896114110946655, |
| "learning_rate": 2.8021950896939266e-06, |
| "loss": 2.0839, |
| "num_input_tokens_seen": 4193264, |
| "step": 5585 |
| }, |
| { |
| "epoch": 8.639876352395673, |
| "grad_norm": 0.44060713052749634, |
| "learning_rate": 2.7712529372372814e-06, |
| "loss": 1.8894, |
| "num_input_tokens_seen": 4196912, |
| "step": 5590 |
| }, |
| { |
| "epoch": 8.647604327666151, |
| "grad_norm": 0.40510526299476624, |
| "learning_rate": 2.7404725409652747e-06, |
| "loss": 2.1538, |
| "num_input_tokens_seen": 4200752, |
| "step": 5595 |
| }, |
| { |
| "epoch": 8.65533230293663, |
| "grad_norm": 0.41664084792137146, |
| "learning_rate": 2.7098541248635007e-06, |
| "loss": 1.8733, |
| "num_input_tokens_seen": 4204656, |
| "step": 5600 |
| }, |
| { |
| "epoch": 8.66306027820711, |
| "grad_norm": 0.5716778039932251, |
| "learning_rate": 2.679397911738868e-06, |
| "loss": 2.0691, |
| "num_input_tokens_seen": 4208112, |
| "step": 5605 |
| }, |
| { |
| "epoch": 8.670788253477589, |
| "grad_norm": 0.8286293148994446, |
| "learning_rate": 2.6491041232179352e-06, |
| "loss": 2.0023, |
| "num_input_tokens_seen": 4212016, |
| "step": 5610 |
| }, |
| { |
| "epoch": 8.678516228748068, |
| "grad_norm": 0.461260050535202, |
| "learning_rate": 2.618972979745324e-06, |
| "loss": 1.7957, |
| "num_input_tokens_seen": 4215856, |
| "step": 5615 |
| }, |
| { |
| "epoch": 8.686244204018546, |
| "grad_norm": 0.44486209750175476, |
| "learning_rate": 2.589004700582101e-06, |
| "loss": 1.9745, |
| "num_input_tokens_seen": 4219376, |
| "step": 5620 |
| }, |
| { |
| "epoch": 8.693972179289027, |
| "grad_norm": 0.4456034302711487, |
| "learning_rate": 2.559199503804183e-06, |
| "loss": 1.9242, |
| "num_input_tokens_seen": 4222640, |
| "step": 5625 |
| }, |
| { |
| "epoch": 8.701700154559505, |
| "grad_norm": 0.5015110969543457, |
| "learning_rate": 2.529557606300764e-06, |
| "loss": 1.4574, |
| "num_input_tokens_seen": 4225904, |
| "step": 5630 |
| }, |
| { |
| "epoch": 8.709428129829984, |
| "grad_norm": 0.45508769154548645, |
| "learning_rate": 2.5000792237727165e-06, |
| "loss": 2.1172, |
| "num_input_tokens_seen": 4230576, |
| "step": 5635 |
| }, |
| { |
| "epoch": 8.717156105100464, |
| "grad_norm": 0.4284096360206604, |
| "learning_rate": 2.470764570731038e-06, |
| "loss": 1.9638, |
| "num_input_tokens_seen": 4234544, |
| "step": 5640 |
| }, |
| { |
| "epoch": 8.724884080370943, |
| "grad_norm": 0.4829674959182739, |
| "learning_rate": 2.4416138604952952e-06, |
| "loss": 1.931, |
| "num_input_tokens_seen": 4238320, |
| "step": 5645 |
| }, |
| { |
| "epoch": 8.732612055641422, |
| "grad_norm": 0.5861620306968689, |
| "learning_rate": 2.4126273051920277e-06, |
| "loss": 1.562, |
| "num_input_tokens_seen": 4241840, |
| "step": 5650 |
| }, |
| { |
| "epoch": 8.7403400309119, |
| "grad_norm": 0.4229719340801239, |
| "learning_rate": 2.383805115753279e-06, |
| "loss": 1.8194, |
| "num_input_tokens_seen": 4245232, |
| "step": 5655 |
| }, |
| { |
| "epoch": 8.74806800618238, |
| "grad_norm": 0.47962188720703125, |
| "learning_rate": 2.355147501914981e-06, |
| "loss": 2.1567, |
| "num_input_tokens_seen": 4249072, |
| "step": 5660 |
| }, |
| { |
| "epoch": 8.75579598145286, |
| "grad_norm": 0.40144792199134827, |
| "learning_rate": 2.326654672215503e-06, |
| "loss": 1.7857, |
| "num_input_tokens_seen": 4252336, |
| "step": 5665 |
| }, |
| { |
| "epoch": 8.763523956723338, |
| "grad_norm": 0.5309076905250549, |
| "learning_rate": 2.298326833994069e-06, |
| "loss": 1.8052, |
| "num_input_tokens_seen": 4256176, |
| "step": 5670 |
| }, |
| { |
| "epoch": 8.771251931993817, |
| "grad_norm": 0.4937804937362671, |
| "learning_rate": 2.270164193389296e-06, |
| "loss": 1.9206, |
| "num_input_tokens_seen": 4259696, |
| "step": 5675 |
| }, |
| { |
| "epoch": 8.778979907264297, |
| "grad_norm": 0.5504066348075867, |
| "learning_rate": 2.2421669553376654e-06, |
| "loss": 2.0706, |
| "num_input_tokens_seen": 4263088, |
| "step": 5680 |
| }, |
| { |
| "epoch": 8.786707882534776, |
| "grad_norm": 0.5018023252487183, |
| "learning_rate": 2.214335323572045e-06, |
| "loss": 1.7451, |
| "num_input_tokens_seen": 4266864, |
| "step": 5685 |
| }, |
| { |
| "epoch": 8.794435857805254, |
| "grad_norm": 0.5152173638343811, |
| "learning_rate": 2.1866695006202086e-06, |
| "loss": 1.722, |
| "num_input_tokens_seen": 4270704, |
| "step": 5690 |
| }, |
| { |
| "epoch": 8.802163833075735, |
| "grad_norm": 0.5058028101921082, |
| "learning_rate": 2.15916968780335e-06, |
| "loss": 1.8042, |
| "num_input_tokens_seen": 4273904, |
| "step": 5695 |
| }, |
| { |
| "epoch": 8.809891808346213, |
| "grad_norm": 0.4861660599708557, |
| "learning_rate": 2.1318360852346285e-06, |
| "loss": 1.737, |
| "num_input_tokens_seen": 4278192, |
| "step": 5700 |
| }, |
| { |
| "epoch": 8.817619783616692, |
| "grad_norm": 0.5702298283576965, |
| "learning_rate": 2.1046688918177128e-06, |
| "loss": 1.8842, |
| "num_input_tokens_seen": 4281520, |
| "step": 5705 |
| }, |
| { |
| "epoch": 8.825347758887172, |
| "grad_norm": 0.46663275361061096, |
| "learning_rate": 2.077668305245317e-06, |
| "loss": 1.6112, |
| "num_input_tokens_seen": 4285040, |
| "step": 5710 |
| }, |
| { |
| "epoch": 8.833075734157651, |
| "grad_norm": 0.4473027288913727, |
| "learning_rate": 2.050834521997802e-06, |
| "loss": 1.8675, |
| "num_input_tokens_seen": 4288432, |
| "step": 5715 |
| }, |
| { |
| "epoch": 8.84080370942813, |
| "grad_norm": 0.4665966033935547, |
| "learning_rate": 2.024167737341684e-06, |
| "loss": 1.7568, |
| "num_input_tokens_seen": 4291696, |
| "step": 5720 |
| }, |
| { |
| "epoch": 8.848531684698608, |
| "grad_norm": 0.5533444285392761, |
| "learning_rate": 1.99766814532828e-06, |
| "loss": 2.436, |
| "num_input_tokens_seen": 4295536, |
| "step": 5725 |
| }, |
| { |
| "epoch": 8.856259659969089, |
| "grad_norm": 0.5889791250228882, |
| "learning_rate": 1.9713359387922378e-06, |
| "loss": 1.6759, |
| "num_input_tokens_seen": 4299248, |
| "step": 5730 |
| }, |
| { |
| "epoch": 8.863987635239567, |
| "grad_norm": 0.5730096697807312, |
| "learning_rate": 1.9451713093501855e-06, |
| "loss": 2.1711, |
| "num_input_tokens_seen": 4303088, |
| "step": 5735 |
| }, |
| { |
| "epoch": 8.871715610510046, |
| "grad_norm": 0.534789502620697, |
| "learning_rate": 1.9191744473992913e-06, |
| "loss": 1.5128, |
| "num_input_tokens_seen": 4306736, |
| "step": 5740 |
| }, |
| { |
| "epoch": 8.879443585780525, |
| "grad_norm": 0.4847296178340912, |
| "learning_rate": 1.8933455421159014e-06, |
| "loss": 1.7468, |
| "num_input_tokens_seen": 4310384, |
| "step": 5745 |
| }, |
| { |
| "epoch": 8.887171561051005, |
| "grad_norm": 0.5766510963439941, |
| "learning_rate": 1.8676847814541654e-06, |
| "loss": 1.6574, |
| "num_input_tokens_seen": 4313648, |
| "step": 5750 |
| }, |
| { |
| "epoch": 8.894899536321484, |
| "grad_norm": 0.5227906703948975, |
| "learning_rate": 1.8421923521446587e-06, |
| "loss": 1.7965, |
| "num_input_tokens_seen": 4317488, |
| "step": 5755 |
| }, |
| { |
| "epoch": 8.902627511591962, |
| "grad_norm": 0.4841407537460327, |
| "learning_rate": 1.8168684396930285e-06, |
| "loss": 2.2404, |
| "num_input_tokens_seen": 4321392, |
| "step": 5760 |
| }, |
| { |
| "epoch": 8.910355486862443, |
| "grad_norm": 0.46578606963157654, |
| "learning_rate": 1.7917132283786386e-06, |
| "loss": 1.6578, |
| "num_input_tokens_seen": 4325360, |
| "step": 5765 |
| }, |
| { |
| "epoch": 8.918083462132921, |
| "grad_norm": 0.48235538601875305, |
| "learning_rate": 1.7667269012532406e-06, |
| "loss": 2.2661, |
| "num_input_tokens_seen": 4328752, |
| "step": 5770 |
| }, |
| { |
| "epoch": 8.9258114374034, |
| "grad_norm": 0.4867675304412842, |
| "learning_rate": 1.7419096401396357e-06, |
| "loss": 2.2521, |
| "num_input_tokens_seen": 4332464, |
| "step": 5775 |
| }, |
| { |
| "epoch": 8.933539412673879, |
| "grad_norm": 0.4980892241001129, |
| "learning_rate": 1.7172616256303288e-06, |
| "loss": 1.8225, |
| "num_input_tokens_seen": 4336496, |
| "step": 5780 |
| }, |
| { |
| "epoch": 8.94126738794436, |
| "grad_norm": 0.42616257071495056, |
| "learning_rate": 1.6927830370862736e-06, |
| "loss": 1.645, |
| "num_input_tokens_seen": 4340208, |
| "step": 5785 |
| }, |
| { |
| "epoch": 8.948995363214838, |
| "grad_norm": 0.44497111439704895, |
| "learning_rate": 1.6684740526354853e-06, |
| "loss": 1.8208, |
| "num_input_tokens_seen": 4343472, |
| "step": 5790 |
| }, |
| { |
| "epoch": 8.956723338485316, |
| "grad_norm": 0.4740997850894928, |
| "learning_rate": 1.6443348491718274e-06, |
| "loss": 1.9233, |
| "num_input_tokens_seen": 4347376, |
| "step": 5795 |
| }, |
| { |
| "epoch": 8.964451313755795, |
| "grad_norm": 0.5786256194114685, |
| "learning_rate": 1.6203656023536629e-06, |
| "loss": 1.871, |
| "num_input_tokens_seen": 4351152, |
| "step": 5800 |
| }, |
| { |
| "epoch": 8.972179289026275, |
| "grad_norm": 0.5951398015022278, |
| "learning_rate": 1.5965664866026047e-06, |
| "loss": 1.6606, |
| "num_input_tokens_seen": 4355184, |
| "step": 5805 |
| }, |
| { |
| "epoch": 8.979907264296754, |
| "grad_norm": 0.5134670734405518, |
| "learning_rate": 1.57293767510224e-06, |
| "loss": 1.8239, |
| "num_input_tokens_seen": 4358640, |
| "step": 5810 |
| }, |
| { |
| "epoch": 8.987635239567233, |
| "grad_norm": 0.3904499411582947, |
| "learning_rate": 1.5494793397968694e-06, |
| "loss": 1.918, |
| "num_input_tokens_seen": 4362288, |
| "step": 5815 |
| }, |
| { |
| "epoch": 8.995363214837713, |
| "grad_norm": 0.4581204354763031, |
| "learning_rate": 1.5261916513902603e-06, |
| "loss": 1.7761, |
| "num_input_tokens_seen": 4366768, |
| "step": 5820 |
| }, |
| { |
| "epoch": 9.003091190108192, |
| "grad_norm": 0.4983641505241394, |
| "learning_rate": 1.5030747793443989e-06, |
| "loss": 1.991, |
| "num_input_tokens_seen": 4369936, |
| "step": 5825 |
| }, |
| { |
| "epoch": 9.01081916537867, |
| "grad_norm": 0.45775794982910156, |
| "learning_rate": 1.4801288918782574e-06, |
| "loss": 1.9089, |
| "num_input_tokens_seen": 4374288, |
| "step": 5830 |
| }, |
| { |
| "epoch": 9.013910355486862, |
| "eval_loss": 1.9623879194259644, |
| "eval_runtime": 9.8422, |
| "eval_samples_per_second": 58.422, |
| "eval_steps_per_second": 7.315, |
| "num_input_tokens_seen": 4375824, |
| "step": 5832 |
| }, |
| { |
| "epoch": 9.018547140649149, |
| "grad_norm": 0.5283871293067932, |
| "learning_rate": 1.4573541559665754e-06, |
| "loss": 1.6235, |
| "num_input_tokens_seen": 4378064, |
| "step": 5835 |
| }, |
| { |
| "epoch": 9.02627511591963, |
| "grad_norm": 0.49816635251045227, |
| "learning_rate": 1.4347507373386331e-06, |
| "loss": 1.7438, |
| "num_input_tokens_seen": 4381712, |
| "step": 5840 |
| }, |
| { |
| "epoch": 9.034003091190108, |
| "grad_norm": 0.5023284554481506, |
| "learning_rate": 1.412318800477072e-06, |
| "loss": 2.245, |
| "num_input_tokens_seen": 4385424, |
| "step": 5845 |
| }, |
| { |
| "epoch": 9.041731066460587, |
| "grad_norm": 0.4554198086261749, |
| "learning_rate": 1.3900585086166513e-06, |
| "loss": 2.0172, |
| "num_input_tokens_seen": 4389776, |
| "step": 5850 |
| }, |
| { |
| "epoch": 9.049459041731067, |
| "grad_norm": 0.5572301745414734, |
| "learning_rate": 1.3679700237431203e-06, |
| "loss": 1.9297, |
| "num_input_tokens_seen": 4393488, |
| "step": 5855 |
| }, |
| { |
| "epoch": 9.057187017001546, |
| "grad_norm": 0.4626171290874481, |
| "learning_rate": 1.3460535065919738e-06, |
| "loss": 1.8209, |
| "num_input_tokens_seen": 4397968, |
| "step": 5860 |
| }, |
| { |
| "epoch": 9.064914992272024, |
| "grad_norm": 0.5806289315223694, |
| "learning_rate": 1.324309116647346e-06, |
| "loss": 2.168, |
| "num_input_tokens_seen": 4401936, |
| "step": 5865 |
| }, |
| { |
| "epoch": 9.072642967542503, |
| "grad_norm": 0.39930564165115356, |
| "learning_rate": 1.3027370121408034e-06, |
| "loss": 2.0342, |
| "num_input_tokens_seen": 4405328, |
| "step": 5870 |
| }, |
| { |
| "epoch": 9.080370942812984, |
| "grad_norm": 0.5544669032096863, |
| "learning_rate": 1.2813373500502128e-06, |
| "loss": 1.7005, |
| "num_input_tokens_seen": 4408976, |
| "step": 5875 |
| }, |
| { |
| "epoch": 9.088098918083462, |
| "grad_norm": 0.6343225240707397, |
| "learning_rate": 1.2601102860986008e-06, |
| "loss": 1.5891, |
| "num_input_tokens_seen": 4413136, |
| "step": 5880 |
| }, |
| { |
| "epoch": 9.09582689335394, |
| "grad_norm": 0.4393240213394165, |
| "learning_rate": 1.2390559747530062e-06, |
| "loss": 1.9156, |
| "num_input_tokens_seen": 4416784, |
| "step": 5885 |
| }, |
| { |
| "epoch": 9.103554868624421, |
| "grad_norm": 0.43060049414634705, |
| "learning_rate": 1.2181745692233766e-06, |
| "loss": 1.9855, |
| "num_input_tokens_seen": 4420688, |
| "step": 5890 |
| }, |
| { |
| "epoch": 9.1112828438949, |
| "grad_norm": 0.506955087184906, |
| "learning_rate": 1.1974662214614379e-06, |
| "loss": 2.047, |
| "num_input_tokens_seen": 4424656, |
| "step": 5895 |
| }, |
| { |
| "epoch": 9.119010819165378, |
| "grad_norm": 0.43633195757865906, |
| "learning_rate": 1.1769310821595907e-06, |
| "loss": 2.0666, |
| "num_input_tokens_seen": 4428432, |
| "step": 5900 |
| }, |
| { |
| "epoch": 9.126738794435857, |
| "grad_norm": 0.4248088598251343, |
| "learning_rate": 1.156569300749827e-06, |
| "loss": 1.4644, |
| "num_input_tokens_seen": 4431440, |
| "step": 5905 |
| }, |
| { |
| "epoch": 9.134466769706338, |
| "grad_norm": 0.44746243953704834, |
| "learning_rate": 1.1363810254026108e-06, |
| "loss": 1.9164, |
| "num_input_tokens_seen": 4435344, |
| "step": 5910 |
| }, |
| { |
| "epoch": 9.142194744976816, |
| "grad_norm": 0.5325887799263, |
| "learning_rate": 1.1163664030258536e-06, |
| "loss": 1.8609, |
| "num_input_tokens_seen": 4439184, |
| "step": 5915 |
| }, |
| { |
| "epoch": 9.149922720247295, |
| "grad_norm": 0.6899775266647339, |
| "learning_rate": 1.0965255792637768e-06, |
| "loss": 1.7784, |
| "num_input_tokens_seen": 4442832, |
| "step": 5920 |
| }, |
| { |
| "epoch": 9.157650695517773, |
| "grad_norm": 0.509699821472168, |
| "learning_rate": 1.0768586984959167e-06, |
| "loss": 1.7023, |
| "num_input_tokens_seen": 4446672, |
| "step": 5925 |
| }, |
| { |
| "epoch": 9.165378670788254, |
| "grad_norm": 0.5541631579399109, |
| "learning_rate": 1.0573659038360301e-06, |
| "loss": 1.8195, |
| "num_input_tokens_seen": 4450576, |
| "step": 5930 |
| }, |
| { |
| "epoch": 9.173106646058732, |
| "grad_norm": 0.4798594117164612, |
| "learning_rate": 1.0380473371310762e-06, |
| "loss": 1.8747, |
| "num_input_tokens_seen": 4454160, |
| "step": 5935 |
| }, |
| { |
| "epoch": 9.180834621329211, |
| "grad_norm": 0.5438898801803589, |
| "learning_rate": 1.0189031389601672e-06, |
| "loss": 1.7346, |
| "num_input_tokens_seen": 4457168, |
| "step": 5940 |
| }, |
| { |
| "epoch": 9.188562596599692, |
| "grad_norm": 0.5284126996994019, |
| "learning_rate": 9.999334486335636e-07, |
| "loss": 2.1073, |
| "num_input_tokens_seen": 4460752, |
| "step": 5945 |
| }, |
| { |
| "epoch": 9.19629057187017, |
| "grad_norm": 0.4561935067176819, |
| "learning_rate": 9.81138404191645e-07, |
| "loss": 1.9123, |
| "num_input_tokens_seen": 4464400, |
| "step": 5950 |
| }, |
| { |
| "epoch": 9.204018547140649, |
| "grad_norm": 0.4085952341556549, |
| "learning_rate": 9.625181424039147e-07, |
| "loss": 1.7345, |
| "num_input_tokens_seen": 4467856, |
| "step": 5955 |
| }, |
| { |
| "epoch": 9.211746522411127, |
| "grad_norm": 0.48208120465278625, |
| "learning_rate": 9.440727987679976e-07, |
| "loss": 2.0286, |
| "num_input_tokens_seen": 4471696, |
| "step": 5960 |
| }, |
| { |
| "epoch": 9.219474497681608, |
| "grad_norm": 0.5661913752555847, |
| "learning_rate": 9.25802507508669e-07, |
| "loss": 2.1419, |
| "num_input_tokens_seen": 4475472, |
| "step": 5965 |
| }, |
| { |
| "epoch": 9.227202472952087, |
| "grad_norm": 0.4500444531440735, |
| "learning_rate": 9.077074015768516e-07, |
| "loss": 1.5232, |
| "num_input_tokens_seen": 4478864, |
| "step": 5970 |
| }, |
| { |
| "epoch": 9.234930448222565, |
| "grad_norm": 0.4850940704345703, |
| "learning_rate": 8.897876126486793e-07, |
| "loss": 1.8175, |
| "num_input_tokens_seen": 4482896, |
| "step": 5975 |
| }, |
| { |
| "epoch": 9.242658423493046, |
| "grad_norm": 0.5343092083930969, |
| "learning_rate": 8.720432711245064e-07, |
| "loss": 2.2061, |
| "num_input_tokens_seen": 4486544, |
| "step": 5980 |
| }, |
| { |
| "epoch": 9.250386398763524, |
| "grad_norm": 0.5756357312202454, |
| "learning_rate": 8.544745061279891e-07, |
| "loss": 2.0404, |
| "num_input_tokens_seen": 4490576, |
| "step": 5985 |
| }, |
| { |
| "epoch": 9.258114374034003, |
| "grad_norm": 0.49154385924339294, |
| "learning_rate": 8.370814455051279e-07, |
| "loss": 2.0676, |
| "num_input_tokens_seen": 4494224, |
| "step": 5990 |
| }, |
| { |
| "epoch": 9.265842349304481, |
| "grad_norm": 0.5070616006851196, |
| "learning_rate": 8.198642158233377e-07, |
| "loss": 1.7215, |
| "num_input_tokens_seen": 4498064, |
| "step": 5995 |
| }, |
| { |
| "epoch": 9.273570324574962, |
| "grad_norm": 0.531429648399353, |
| "learning_rate": 8.028229423705375e-07, |
| "loss": 2.2743, |
| "num_input_tokens_seen": 4501776, |
| "step": 6000 |
| }, |
| { |
| "epoch": 9.28129829984544, |
| "grad_norm": 0.4259873032569885, |
| "learning_rate": 7.859577491542259e-07, |
| "loss": 1.6607, |
| "num_input_tokens_seen": 4505296, |
| "step": 6005 |
| }, |
| { |
| "epoch": 9.28902627511592, |
| "grad_norm": 0.6347371935844421, |
| "learning_rate": 7.692687589005876e-07, |
| "loss": 1.7177, |
| "num_input_tokens_seen": 4509136, |
| "step": 6010 |
| }, |
| { |
| "epoch": 9.2967542503864, |
| "grad_norm": 0.550849974155426, |
| "learning_rate": 7.527560930535971e-07, |
| "loss": 1.6047, |
| "num_input_tokens_seen": 4512912, |
| "step": 6015 |
| }, |
| { |
| "epoch": 9.304482225656878, |
| "grad_norm": 0.4533255100250244, |
| "learning_rate": 7.364198717741355e-07, |
| "loss": 1.7645, |
| "num_input_tokens_seen": 4516944, |
| "step": 6020 |
| }, |
| { |
| "epoch": 9.312210200927357, |
| "grad_norm": 0.49000996351242065, |
| "learning_rate": 7.20260213939114e-07, |
| "loss": 1.9519, |
| "num_input_tokens_seen": 4520208, |
| "step": 6025 |
| }, |
| { |
| "epoch": 9.319938176197835, |
| "grad_norm": 0.5339989066123962, |
| "learning_rate": 7.042772371406131e-07, |
| "loss": 2.0867, |
| "num_input_tokens_seen": 4524176, |
| "step": 6030 |
| }, |
| { |
| "epoch": 9.327666151468316, |
| "grad_norm": 0.4784291386604309, |
| "learning_rate": 6.884710576850306e-07, |
| "loss": 1.6407, |
| "num_input_tokens_seen": 4527632, |
| "step": 6035 |
| }, |
| { |
| "epoch": 9.335394126738795, |
| "grad_norm": 0.45697057247161865, |
| "learning_rate": 6.728417905922074e-07, |
| "loss": 2.3036, |
| "num_input_tokens_seen": 4531408, |
| "step": 6040 |
| }, |
| { |
| "epoch": 9.343122102009273, |
| "grad_norm": 0.4117041528224945, |
| "learning_rate": 6.573895495946447e-07, |
| "loss": 1.8848, |
| "num_input_tokens_seen": 4535184, |
| "step": 6045 |
| }, |
| { |
| "epoch": 9.350850077279752, |
| "grad_norm": 0.5256591439247131, |
| "learning_rate": 6.421144471366103e-07, |
| "loss": 1.9648, |
| "num_input_tokens_seen": 4539536, |
| "step": 6050 |
| }, |
| { |
| "epoch": 9.358578052550232, |
| "grad_norm": 0.4190206825733185, |
| "learning_rate": 6.270165943733807e-07, |
| "loss": 1.7893, |
| "num_input_tokens_seen": 4543184, |
| "step": 6055 |
| }, |
| { |
| "epoch": 9.36630602782071, |
| "grad_norm": 0.4589918851852417, |
| "learning_rate": 6.120961011703924e-07, |
| "loss": 1.8888, |
| "num_input_tokens_seen": 4546896, |
| "step": 6060 |
| }, |
| { |
| "epoch": 9.37403400309119, |
| "grad_norm": 0.5328862071037292, |
| "learning_rate": 5.973530761024582e-07, |
| "loss": 1.7838, |
| "num_input_tokens_seen": 4550736, |
| "step": 6065 |
| }, |
| { |
| "epoch": 9.38176197836167, |
| "grad_norm": 0.40755224227905273, |
| "learning_rate": 5.827876264529741e-07, |
| "loss": 1.8243, |
| "num_input_tokens_seen": 4554512, |
| "step": 6070 |
| }, |
| { |
| "epoch": 9.389489953632149, |
| "grad_norm": 0.43924909830093384, |
| "learning_rate": 5.683998582131395e-07, |
| "loss": 1.7699, |
| "num_input_tokens_seen": 4558288, |
| "step": 6075 |
| }, |
| { |
| "epoch": 9.397217928902627, |
| "grad_norm": 0.43658119440078735, |
| "learning_rate": 5.541898760811848e-07, |
| "loss": 1.9572, |
| "num_input_tokens_seen": 4562576, |
| "step": 6080 |
| }, |
| { |
| "epoch": 9.404945904173106, |
| "grad_norm": 0.468034565448761, |
| "learning_rate": 5.401577834616145e-07, |
| "loss": 1.5804, |
| "num_input_tokens_seen": 4566352, |
| "step": 6085 |
| }, |
| { |
| "epoch": 9.412673879443586, |
| "grad_norm": 0.5616998076438904, |
| "learning_rate": 5.26303682464438e-07, |
| "loss": 1.8576, |
| "num_input_tokens_seen": 4570384, |
| "step": 6090 |
| }, |
| { |
| "epoch": 9.420401854714065, |
| "grad_norm": 0.6615525484085083, |
| "learning_rate": 5.126276739044617e-07, |
| "loss": 1.7841, |
| "num_input_tokens_seen": 4574032, |
| "step": 6095 |
| }, |
| { |
| "epoch": 9.428129829984544, |
| "grad_norm": 0.4598679542541504, |
| "learning_rate": 4.991298573005038e-07, |
| "loss": 1.718, |
| "num_input_tokens_seen": 4578000, |
| "step": 6100 |
| }, |
| { |
| "epoch": 9.435857805255024, |
| "grad_norm": 0.4575481116771698, |
| "learning_rate": 4.858103308747225e-07, |
| "loss": 1.6118, |
| "num_input_tokens_seen": 4581904, |
| "step": 6105 |
| }, |
| { |
| "epoch": 9.443585780525503, |
| "grad_norm": 0.5747670531272888, |
| "learning_rate": 4.726691915518694e-07, |
| "loss": 1.9901, |
| "num_input_tokens_seen": 4585552, |
| "step": 6110 |
| }, |
| { |
| "epoch": 9.451313755795981, |
| "grad_norm": 0.5150761604309082, |
| "learning_rate": 4.597065349585844e-07, |
| "loss": 1.843, |
| "num_input_tokens_seen": 4589264, |
| "step": 6115 |
| }, |
| { |
| "epoch": 9.45904173106646, |
| "grad_norm": 0.4432414174079895, |
| "learning_rate": 4.4692245542272417e-07, |
| "loss": 1.8543, |
| "num_input_tokens_seen": 4593552, |
| "step": 6120 |
| }, |
| { |
| "epoch": 9.46676970633694, |
| "grad_norm": 0.4179718792438507, |
| "learning_rate": 4.3431704597264313e-07, |
| "loss": 1.9304, |
| "num_input_tokens_seen": 4597264, |
| "step": 6125 |
| }, |
| { |
| "epoch": 9.474497681607419, |
| "grad_norm": 0.439662903547287, |
| "learning_rate": 4.218903983365469e-07, |
| "loss": 1.8635, |
| "num_input_tokens_seen": 4600976, |
| "step": 6130 |
| }, |
| { |
| "epoch": 9.482225656877898, |
| "grad_norm": 0.6849305033683777, |
| "learning_rate": 4.096426029417982e-07, |
| "loss": 1.86, |
| "num_input_tokens_seen": 4604240, |
| "step": 6135 |
| }, |
| { |
| "epoch": 9.489953632148378, |
| "grad_norm": 0.39302560687065125, |
| "learning_rate": 3.975737489142845e-07, |
| "loss": 1.5986, |
| "num_input_tokens_seen": 4608016, |
| "step": 6140 |
| }, |
| { |
| "epoch": 9.497681607418857, |
| "grad_norm": 0.5208292007446289, |
| "learning_rate": 3.8568392407774544e-07, |
| "loss": 1.4805, |
| "num_input_tokens_seen": 4611664, |
| "step": 6145 |
| }, |
| { |
| "epoch": 9.505409582689335, |
| "grad_norm": 0.6550456285476685, |
| "learning_rate": 3.7397321495314666e-07, |
| "loss": 1.7417, |
| "num_input_tokens_seen": 4615760, |
| "step": 6150 |
| }, |
| { |
| "epoch": 9.513137557959814, |
| "grad_norm": 0.4527139663696289, |
| "learning_rate": 3.624417067580543e-07, |
| "loss": 1.9414, |
| "num_input_tokens_seen": 4619472, |
| "step": 6155 |
| }, |
| { |
| "epoch": 9.51468315301391, |
| "eval_loss": 1.9586824178695679, |
| "eval_runtime": 9.8493, |
| "eval_samples_per_second": 58.38, |
| "eval_steps_per_second": 7.31, |
| "num_input_tokens_seen": 4620240, |
| "step": 6156 |
| }, |
| { |
| "epoch": 9.520865533230294, |
| "grad_norm": 0.46713271737098694, |
| "learning_rate": 3.5108948340600024e-07, |
| "loss": 2.0021, |
| "num_input_tokens_seen": 4623248, |
| "step": 6160 |
| }, |
| { |
| "epoch": 9.528593508500773, |
| "grad_norm": 0.5319985747337341, |
| "learning_rate": 3.399166275058874e-07, |
| "loss": 2.0089, |
| "num_input_tokens_seen": 4627152, |
| "step": 6165 |
| }, |
| { |
| "epoch": 9.536321483771252, |
| "grad_norm": 0.5472551584243774, |
| "learning_rate": 3.289232203613768e-07, |
| "loss": 2.0052, |
| "num_input_tokens_seen": 4630992, |
| "step": 6170 |
| }, |
| { |
| "epoch": 9.54404945904173, |
| "grad_norm": 0.3944055438041687, |
| "learning_rate": 3.181093419702991e-07, |
| "loss": 1.6991, |
| "num_input_tokens_seen": 4634512, |
| "step": 6175 |
| }, |
| { |
| "epoch": 9.55177743431221, |
| "grad_norm": 0.4235127866268158, |
| "learning_rate": 3.074750710240798e-07, |
| "loss": 1.965, |
| "num_input_tokens_seen": 4638160, |
| "step": 6180 |
| }, |
| { |
| "epoch": 9.55950540958269, |
| "grad_norm": 0.5224214196205139, |
| "learning_rate": 2.97020484907154e-07, |
| "loss": 1.5174, |
| "num_input_tokens_seen": 4641232, |
| "step": 6185 |
| }, |
| { |
| "epoch": 9.567233384853168, |
| "grad_norm": 0.48956823348999023, |
| "learning_rate": 2.8674565969641633e-07, |
| "loss": 1.9404, |
| "num_input_tokens_seen": 4645456, |
| "step": 6190 |
| }, |
| { |
| "epoch": 9.574961360123648, |
| "grad_norm": 0.3983800411224365, |
| "learning_rate": 2.766506701606525e-07, |
| "loss": 2.085, |
| "num_input_tokens_seen": 4649360, |
| "step": 6195 |
| }, |
| { |
| "epoch": 9.582689335394127, |
| "grad_norm": 0.46103692054748535, |
| "learning_rate": 2.667355897600088e-07, |
| "loss": 1.7913, |
| "num_input_tokens_seen": 4653392, |
| "step": 6200 |
| }, |
| { |
| "epoch": 9.590417310664606, |
| "grad_norm": 0.48668143153190613, |
| "learning_rate": 2.5700049064545373e-07, |
| "loss": 1.7479, |
| "num_input_tokens_seen": 4656720, |
| "step": 6205 |
| }, |
| { |
| "epoch": 9.598145285935084, |
| "grad_norm": 0.4885518252849579, |
| "learning_rate": 2.4744544365824793e-07, |
| "loss": 2.0991, |
| "num_input_tokens_seen": 4660048, |
| "step": 6210 |
| }, |
| { |
| "epoch": 9.605873261205565, |
| "grad_norm": 0.5907134413719177, |
| "learning_rate": 2.3807051832943071e-07, |
| "loss": 1.9784, |
| "num_input_tokens_seen": 4663824, |
| "step": 6215 |
| }, |
| { |
| "epoch": 9.613601236476043, |
| "grad_norm": 0.469518780708313, |
| "learning_rate": 2.288757828793231e-07, |
| "loss": 1.9894, |
| "num_input_tokens_seen": 4667408, |
| "step": 6220 |
| }, |
| { |
| "epoch": 9.621329211746522, |
| "grad_norm": 0.37500569224357605, |
| "learning_rate": 2.1986130421701445e-07, |
| "loss": 1.8386, |
| "num_input_tokens_seen": 4671120, |
| "step": 6225 |
| }, |
| { |
| "epoch": 9.629057187017002, |
| "grad_norm": 0.569591760635376, |
| "learning_rate": 2.1102714793989063e-07, |
| "loss": 1.5365, |
| "num_input_tokens_seen": 4674768, |
| "step": 6230 |
| }, |
| { |
| "epoch": 9.636785162287481, |
| "grad_norm": 0.5992723703384399, |
| "learning_rate": 2.0237337833315384e-07, |
| "loss": 1.9869, |
| "num_input_tokens_seen": 4678416, |
| "step": 6235 |
| }, |
| { |
| "epoch": 9.64451313755796, |
| "grad_norm": 0.4954376816749573, |
| "learning_rate": 1.9390005836934232e-07, |
| "loss": 1.5203, |
| "num_input_tokens_seen": 4682000, |
| "step": 6240 |
| }, |
| { |
| "epoch": 9.652241112828438, |
| "grad_norm": 0.46121746301651, |
| "learning_rate": 1.8560724970789202e-07, |
| "loss": 1.7654, |
| "num_input_tokens_seen": 4685584, |
| "step": 6245 |
| }, |
| { |
| "epoch": 9.659969088098919, |
| "grad_norm": 0.43793782591819763, |
| "learning_rate": 1.7749501269467282e-07, |
| "loss": 1.5659, |
| "num_input_tokens_seen": 4688912, |
| "step": 6250 |
| }, |
| { |
| "epoch": 9.667697063369397, |
| "grad_norm": 0.4031641483306885, |
| "learning_rate": 1.6956340636155033e-07, |
| "loss": 2.1642, |
| "num_input_tokens_seen": 4692176, |
| "step": 6255 |
| }, |
| { |
| "epoch": 9.675425038639876, |
| "grad_norm": 0.5168727040290833, |
| "learning_rate": 1.6181248842597196e-07, |
| "loss": 1.7044, |
| "num_input_tokens_seen": 4696336, |
| "step": 6260 |
| }, |
| { |
| "epoch": 9.683153013910356, |
| "grad_norm": 0.390639066696167, |
| "learning_rate": 1.5424231529052035e-07, |
| "loss": 1.7278, |
| "num_input_tokens_seen": 4700496, |
| "step": 6265 |
| }, |
| { |
| "epoch": 9.690880989180835, |
| "grad_norm": 0.376815527677536, |
| "learning_rate": 1.4685294204253296e-07, |
| "loss": 1.6015, |
| "num_input_tokens_seen": 4703952, |
| "step": 6270 |
| }, |
| { |
| "epoch": 9.698608964451314, |
| "grad_norm": 0.43562066555023193, |
| "learning_rate": 1.3964442245367193e-07, |
| "loss": 2.5439, |
| "num_input_tokens_seen": 4707984, |
| "step": 6275 |
| }, |
| { |
| "epoch": 9.706336939721792, |
| "grad_norm": 0.8875200748443604, |
| "learning_rate": 1.3261680897955765e-07, |
| "loss": 2.0866, |
| "num_input_tokens_seen": 4711632, |
| "step": 6280 |
| }, |
| { |
| "epoch": 9.714064914992273, |
| "grad_norm": 0.5330743789672852, |
| "learning_rate": 1.2577015275937188e-07, |
| "loss": 1.9099, |
| "num_input_tokens_seen": 4715088, |
| "step": 6285 |
| }, |
| { |
| "epoch": 9.721792890262751, |
| "grad_norm": 0.4825518727302551, |
| "learning_rate": 1.1910450361548587e-07, |
| "loss": 1.708, |
| "num_input_tokens_seen": 4718864, |
| "step": 6290 |
| }, |
| { |
| "epoch": 9.72952086553323, |
| "grad_norm": 0.531721830368042, |
| "learning_rate": 1.1261991005311334e-07, |
| "loss": 1.7221, |
| "num_input_tokens_seen": 4721808, |
| "step": 6295 |
| }, |
| { |
| "epoch": 9.737248840803709, |
| "grad_norm": 0.5634981989860535, |
| "learning_rate": 1.0631641925993307e-07, |
| "loss": 1.8634, |
| "num_input_tokens_seen": 4725520, |
| "step": 6300 |
| }, |
| { |
| "epoch": 9.744976816074189, |
| "grad_norm": 0.5600470900535583, |
| "learning_rate": 1.0019407710576967e-07, |
| "loss": 1.5357, |
| "num_input_tokens_seen": 4729616, |
| "step": 6305 |
| }, |
| { |
| "epoch": 9.752704791344668, |
| "grad_norm": 0.4644871652126312, |
| "learning_rate": 9.425292814224107e-08, |
| "loss": 1.7279, |
| "num_input_tokens_seen": 4733200, |
| "step": 6310 |
| }, |
| { |
| "epoch": 9.760432766615146, |
| "grad_norm": 0.4587952792644501, |
| "learning_rate": 8.849301560244494e-08, |
| "loss": 1.9601, |
| "num_input_tokens_seen": 4736784, |
| "step": 6315 |
| }, |
| { |
| "epoch": 9.768160741885627, |
| "grad_norm": 0.4871441423892975, |
| "learning_rate": 8.291438140064223e-08, |
| "loss": 1.9649, |
| "num_input_tokens_seen": 4740688, |
| "step": 6320 |
| }, |
| { |
| "epoch": 9.775888717156105, |
| "grad_norm": 0.38628652691841125, |
| "learning_rate": 7.751706613194909e-08, |
| "loss": 2.15, |
| "num_input_tokens_seen": 4744400, |
| "step": 6325 |
| }, |
| { |
| "epoch": 9.783616692426584, |
| "grad_norm": 0.46603044867515564, |
| "learning_rate": 7.230110907204269e-08, |
| "loss": 1.904, |
| "num_input_tokens_seen": 4747984, |
| "step": 6330 |
| }, |
| { |
| "epoch": 9.791344667697063, |
| "grad_norm": 0.6037775278091431, |
| "learning_rate": 6.726654817687805e-08, |
| "loss": 1.4794, |
| "num_input_tokens_seen": 4752080, |
| "step": 6335 |
| }, |
| { |
| "epoch": 9.799072642967543, |
| "grad_norm": 0.5536606907844543, |
| "learning_rate": 6.241342008241336e-08, |
| "loss": 2.1317, |
| "num_input_tokens_seen": 4756112, |
| "step": 6340 |
| }, |
| { |
| "epoch": 9.806800618238022, |
| "grad_norm": 0.49296537041664124, |
| "learning_rate": 5.774176010432952e-08, |
| "loss": 1.6372, |
| "num_input_tokens_seen": 4759632, |
| "step": 6345 |
| }, |
| { |
| "epoch": 9.8145285935085, |
| "grad_norm": 0.4221416413784027, |
| "learning_rate": 5.3251602237797126e-08, |
| "loss": 2.142, |
| "num_input_tokens_seen": 4763344, |
| "step": 6350 |
| }, |
| { |
| "epoch": 9.82225656877898, |
| "grad_norm": 0.4220753610134125, |
| "learning_rate": 4.8942979157201586e-08, |
| "loss": 1.7889, |
| "num_input_tokens_seen": 4766736, |
| "step": 6355 |
| }, |
| { |
| "epoch": 9.82998454404946, |
| "grad_norm": 0.4357856512069702, |
| "learning_rate": 4.481592221593223e-08, |
| "loss": 2.2538, |
| "num_input_tokens_seen": 4770576, |
| "step": 6360 |
| }, |
| { |
| "epoch": 9.837712519319938, |
| "grad_norm": 0.45150232315063477, |
| "learning_rate": 4.087046144613249e-08, |
| "loss": 2.0142, |
| "num_input_tokens_seen": 4774352, |
| "step": 6365 |
| }, |
| { |
| "epoch": 9.845440494590417, |
| "grad_norm": 0.3519168198108673, |
| "learning_rate": 3.7106625558494534e-08, |
| "loss": 1.8235, |
| "num_input_tokens_seen": 4778256, |
| "step": 6370 |
| }, |
| { |
| "epoch": 9.853168469860897, |
| "grad_norm": 0.44991186261177063, |
| "learning_rate": 3.352444194203996e-08, |
| "loss": 1.8635, |
| "num_input_tokens_seen": 4782544, |
| "step": 6375 |
| }, |
| { |
| "epoch": 9.860896445131376, |
| "grad_norm": 0.4822252690792084, |
| "learning_rate": 3.012393666393665e-08, |
| "loss": 2.2841, |
| "num_input_tokens_seen": 4786000, |
| "step": 6380 |
| }, |
| { |
| "epoch": 9.868624420401854, |
| "grad_norm": 0.4566858410835266, |
| "learning_rate": 2.690513446929055e-08, |
| "loss": 1.936, |
| "num_input_tokens_seen": 4789584, |
| "step": 6385 |
| }, |
| { |
| "epoch": 9.876352395672335, |
| "grad_norm": 0.5575479865074158, |
| "learning_rate": 2.3868058780979198e-08, |
| "loss": 2.047, |
| "num_input_tokens_seen": 4793424, |
| "step": 6390 |
| }, |
| { |
| "epoch": 9.884080370942813, |
| "grad_norm": 0.41943296790122986, |
| "learning_rate": 2.101273169946849e-08, |
| "loss": 1.5426, |
| "num_input_tokens_seen": 4796880, |
| "step": 6395 |
| }, |
| { |
| "epoch": 9.891808346213292, |
| "grad_norm": 0.4676728844642639, |
| "learning_rate": 1.833917400266838e-08, |
| "loss": 2.3344, |
| "num_input_tokens_seen": 4800848, |
| "step": 6400 |
| }, |
| { |
| "epoch": 9.89953632148377, |
| "grad_norm": 0.5828633308410645, |
| "learning_rate": 1.5847405145769102e-08, |
| "loss": 1.692, |
| "num_input_tokens_seen": 4804432, |
| "step": 6405 |
| }, |
| { |
| "epoch": 9.907264296754251, |
| "grad_norm": 0.38569849729537964, |
| "learning_rate": 1.353744326109685e-08, |
| "loss": 1.9036, |
| "num_input_tokens_seen": 4808592, |
| "step": 6410 |
| }, |
| { |
| "epoch": 9.91499227202473, |
| "grad_norm": 0.5596803426742554, |
| "learning_rate": 1.1409305157999983e-08, |
| "loss": 1.97, |
| "num_input_tokens_seen": 4812240, |
| "step": 6415 |
| }, |
| { |
| "epoch": 9.922720247295208, |
| "grad_norm": 0.5255624651908875, |
| "learning_rate": 9.463006322707468e-09, |
| "loss": 1.9222, |
| "num_input_tokens_seen": 4815952, |
| "step": 6420 |
| }, |
| { |
| "epoch": 9.930448222565687, |
| "grad_norm": 0.5257275700569153, |
| "learning_rate": 7.698560918226183e-09, |
| "loss": 1.9876, |
| "num_input_tokens_seen": 4819856, |
| "step": 6425 |
| }, |
| { |
| "epoch": 9.938176197836167, |
| "grad_norm": 0.5967457294464111, |
| "learning_rate": 6.115981784229896e-09, |
| "loss": 2.1135, |
| "num_input_tokens_seen": 4823568, |
| "step": 6430 |
| }, |
| { |
| "epoch": 9.945904173106646, |
| "grad_norm": 0.4129837453365326, |
| "learning_rate": 4.715280436981551e-09, |
| "loss": 1.6423, |
| "num_input_tokens_seen": 4827344, |
| "step": 6435 |
| }, |
| { |
| "epoch": 9.953632148377125, |
| "grad_norm": 0.5613869428634644, |
| "learning_rate": 3.4964670692277934e-09, |
| "loss": 2.0167, |
| "num_input_tokens_seen": 4831184, |
| "step": 6440 |
| }, |
| { |
| "epoch": 9.961360123647605, |
| "grad_norm": 0.5644814372062683, |
| "learning_rate": 2.4595505501434633e-09, |
| "loss": 2.3765, |
| "num_input_tokens_seen": 4835152, |
| "step": 6445 |
| }, |
| { |
| "epoch": 9.969088098918084, |
| "grad_norm": 0.4961523413658142, |
| "learning_rate": 1.6045384252594275e-09, |
| "loss": 1.8894, |
| "num_input_tokens_seen": 4838800, |
| "step": 6450 |
| }, |
| { |
| "epoch": 9.976816074188562, |
| "grad_norm": 0.6483544707298279, |
| "learning_rate": 9.314369164042936e-10, |
| "loss": 1.875, |
| "num_input_tokens_seen": 4842064, |
| "step": 6455 |
| }, |
| { |
| "epoch": 9.984544049459041, |
| "grad_norm": 0.44400128722190857, |
| "learning_rate": 4.402509216655526e-10, |
| "loss": 1.5224, |
| "num_input_tokens_seen": 4845776, |
| "step": 6460 |
| }, |
| { |
| "epoch": 9.992272024729521, |
| "grad_norm": 0.46386170387268066, |
| "learning_rate": 1.3098401535072137e-10, |
| "loss": 1.6659, |
| "num_input_tokens_seen": 4849296, |
| "step": 6465 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.5652341842651367, |
| "learning_rate": 3.6384479595863445e-12, |
| "loss": 2.0513, |
| "num_input_tokens_seen": 4852608, |
| "step": 6470 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 4852608, |
| "step": 6470, |
| "total_flos": 2.1851074501646746e+17, |
| "train_loss": 3.185584316518981, |
| "train_runtime": 2159.8077, |
| "train_samples_per_second": 23.956, |
| "train_steps_per_second": 2.996 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 6470, |
| "num_input_tokens_seen": 4852608, |
| "num_train_epochs": 10, |
| "save_steps": 324, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.1851074501646746e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |