| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 730, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.013717421124828532, | |
| "grad_norm": 1.2567085027694702, | |
| "learning_rate": 1.3043478260869566e-06, | |
| "loss": 1.2424, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.027434842249657063, | |
| "grad_norm": 1.181019902229309, | |
| "learning_rate": 2.9347826086956523e-06, | |
| "loss": 1.2733, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0411522633744856, | |
| "grad_norm": 0.6036953926086426, | |
| "learning_rate": 4.565217391304348e-06, | |
| "loss": 1.2413, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.05486968449931413, | |
| "grad_norm": 0.8439048528671265, | |
| "learning_rate": 6.195652173913044e-06, | |
| "loss": 1.2214, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06858710562414266, | |
| "grad_norm": 0.5841037631034851, | |
| "learning_rate": 7.826086956521738e-06, | |
| "loss": 1.2187, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0823045267489712, | |
| "grad_norm": 0.6085624694824219, | |
| "learning_rate": 9.456521739130436e-06, | |
| "loss": 1.1609, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.09602194787379972, | |
| "grad_norm": 0.6230579018592834, | |
| "learning_rate": 1.108695652173913e-05, | |
| "loss": 1.1653, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.10973936899862825, | |
| "grad_norm": 0.5125408172607422, | |
| "learning_rate": 1.2717391304347827e-05, | |
| "loss": 1.1442, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.12345679012345678, | |
| "grad_norm": 0.4418923258781433, | |
| "learning_rate": 1.4347826086956522e-05, | |
| "loss": 1.135, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.13717421124828533, | |
| "grad_norm": 0.39918941259384155, | |
| "learning_rate": 1.597826086956522e-05, | |
| "loss": 1.15, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.15089163237311384, | |
| "grad_norm": 0.4432643949985504, | |
| "learning_rate": 1.7608695652173915e-05, | |
| "loss": 1.1159, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.1646090534979424, | |
| "grad_norm": 0.7146240472793579, | |
| "learning_rate": 1.9239130434782607e-05, | |
| "loss": 1.1433, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.17832647462277093, | |
| "grad_norm": 0.6695220470428467, | |
| "learning_rate": 2.0869565217391306e-05, | |
| "loss": 1.1032, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.19204389574759945, | |
| "grad_norm": 0.489704966545105, | |
| "learning_rate": 2.25e-05, | |
| "loss": 1.0569, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.205761316872428, | |
| "grad_norm": 0.4658520817756653, | |
| "learning_rate": 2.4130434782608697e-05, | |
| "loss": 1.0715, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.2194787379972565, | |
| "grad_norm": 0.5942860245704651, | |
| "learning_rate": 2.5760869565217392e-05, | |
| "loss": 1.0534, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.23319615912208505, | |
| "grad_norm": 0.48524120450019836, | |
| "learning_rate": 2.7391304347826085e-05, | |
| "loss": 1.1362, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.24691358024691357, | |
| "grad_norm": 0.41874566674232483, | |
| "learning_rate": 2.9021739130434783e-05, | |
| "loss": 1.0455, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2606310013717421, | |
| "grad_norm": 0.49967625737190247, | |
| "learning_rate": 2.999990141214925e-05, | |
| "loss": 1.0053, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.27434842249657065, | |
| "grad_norm": 0.529108464717865, | |
| "learning_rate": 2.999879231371134e-05, | |
| "loss": 1.0228, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2880658436213992, | |
| "grad_norm": 0.5769373774528503, | |
| "learning_rate": 2.9996450973444988e-05, | |
| "loss": 1.0495, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.3017832647462277, | |
| "grad_norm": 0.6768115162849426, | |
| "learning_rate": 2.999287758370551e-05, | |
| "loss": 0.9451, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.31550068587105623, | |
| "grad_norm": 0.6151679754257202, | |
| "learning_rate": 2.998807243806856e-05, | |
| "loss": 1.0238, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.3292181069958848, | |
| "grad_norm": 0.6618012189865112, | |
| "learning_rate": 2.998203593130602e-05, | |
| "loss": 1.0144, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3429355281207133, | |
| "grad_norm": 0.5754362344741821, | |
| "learning_rate": 2.9974768559353564e-05, | |
| "loss": 0.9812, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.35665294924554186, | |
| "grad_norm": 0.614632785320282, | |
| "learning_rate": 2.99662709192699e-05, | |
| "loss": 0.9152, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.37037037037037035, | |
| "grad_norm": 0.5445948839187622, | |
| "learning_rate": 2.995654370918775e-05, | |
| "loss": 1.0159, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.3840877914951989, | |
| "grad_norm": 0.592310905456543, | |
| "learning_rate": 2.9945587728256456e-05, | |
| "loss": 0.9158, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.39780521262002744, | |
| "grad_norm": 0.6413975954055786, | |
| "learning_rate": 2.9933403876576364e-05, | |
| "loss": 0.938, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.411522633744856, | |
| "grad_norm": 0.5976924300193787, | |
| "learning_rate": 2.9919993155124834e-05, | |
| "loss": 0.8984, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4252400548696845, | |
| "grad_norm": 0.6372507810592651, | |
| "learning_rate": 2.990535666567403e-05, | |
| "loss": 0.8719, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.438957475994513, | |
| "grad_norm": 0.5930208563804626, | |
| "learning_rate": 2.9889495610700416e-05, | |
| "loss": 0.9289, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.45267489711934156, | |
| "grad_norm": 0.6256594061851501, | |
| "learning_rate": 2.9872411293285916e-05, | |
| "loss": 0.852, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.4663923182441701, | |
| "grad_norm": 0.6492271423339844, | |
| "learning_rate": 2.985410511701092e-05, | |
| "loss": 0.8581, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.48010973936899864, | |
| "grad_norm": 0.7370251417160034, | |
| "learning_rate": 2.9834578585838907e-05, | |
| "loss": 0.9052, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.49382716049382713, | |
| "grad_norm": 0.7324991226196289, | |
| "learning_rate": 2.9813833303992948e-05, | |
| "loss": 0.8242, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5075445816186557, | |
| "grad_norm": 0.7410762906074524, | |
| "learning_rate": 2.979187097582386e-05, | |
| "loss": 0.8182, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.5212620027434842, | |
| "grad_norm": 0.6918640732765198, | |
| "learning_rate": 2.976869340567021e-05, | |
| "loss": 0.838, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5349794238683128, | |
| "grad_norm": 2.690075159072876, | |
| "learning_rate": 2.9744302497710076e-05, | |
| "loss": 0.8393, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.5486968449931413, | |
| "grad_norm": 0.7638638019561768, | |
| "learning_rate": 2.9718700255804588e-05, | |
| "loss": 0.8171, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5624142661179699, | |
| "grad_norm": 0.8737165331840515, | |
| "learning_rate": 2.969188878333332e-05, | |
| "loss": 0.8264, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.5761316872427984, | |
| "grad_norm": 0.7183387875556946, | |
| "learning_rate": 2.9663870283021477e-05, | |
| "loss": 0.8421, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5898491083676269, | |
| "grad_norm": 0.8239722847938538, | |
| "learning_rate": 2.9634647056758927e-05, | |
| "loss": 0.7993, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6035665294924554, | |
| "grad_norm": 0.8498063683509827, | |
| "learning_rate": 2.960422150541109e-05, | |
| "loss": 0.8239, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6172839506172839, | |
| "grad_norm": 1.0519356727600098, | |
| "learning_rate": 2.9572596128621683e-05, | |
| "loss": 0.7706, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6310013717421125, | |
| "grad_norm": 0.844680666923523, | |
| "learning_rate": 2.9539773524607373e-05, | |
| "loss": 0.7471, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.644718792866941, | |
| "grad_norm": 0.7600374221801758, | |
| "learning_rate": 2.95057563899443e-05, | |
| "loss": 0.7894, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.6584362139917695, | |
| "grad_norm": 0.8541322946548462, | |
| "learning_rate": 2.947054751934656e-05, | |
| "loss": 0.7903, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6721536351165981, | |
| "grad_norm": 0.7854955196380615, | |
| "learning_rate": 2.9434149805436586e-05, | |
| "loss": 0.7754, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.6858710562414266, | |
| "grad_norm": 0.7242818474769592, | |
| "learning_rate": 2.9396566238507496e-05, | |
| "loss": 0.7455, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6995884773662552, | |
| "grad_norm": 0.9281851053237915, | |
| "learning_rate": 2.935779990627744e-05, | |
| "loss": 0.7562, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.7133058984910837, | |
| "grad_norm": 0.8336049318313599, | |
| "learning_rate": 2.931785399363592e-05, | |
| "loss": 0.7557, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7270233196159122, | |
| "grad_norm": 0.967589795589447, | |
| "learning_rate": 2.9276731782382123e-05, | |
| "loss": 0.7062, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.7407407407407407, | |
| "grad_norm": 0.8542262315750122, | |
| "learning_rate": 2.9234436650955297e-05, | |
| "loss": 0.7184, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7544581618655692, | |
| "grad_norm": 0.9164705276489258, | |
| "learning_rate": 2.9190972074157232e-05, | |
| "loss": 0.6814, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.7681755829903978, | |
| "grad_norm": 0.8561894297599792, | |
| "learning_rate": 2.9146341622866716e-05, | |
| "loss": 0.6944, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7818930041152263, | |
| "grad_norm": 0.9264963865280151, | |
| "learning_rate": 2.910054896374623e-05, | |
| "loss": 0.6571, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.7956104252400549, | |
| "grad_norm": 0.8652317523956299, | |
| "learning_rate": 2.9053597858940666e-05, | |
| "loss": 0.7355, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8093278463648834, | |
| "grad_norm": 0.9738150835037231, | |
| "learning_rate": 2.9005492165768278e-05, | |
| "loss": 0.6453, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.823045267489712, | |
| "grad_norm": 1.012303352355957, | |
| "learning_rate": 2.895623583640375e-05, | |
| "loss": 0.6413, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8367626886145405, | |
| "grad_norm": 0.8940839767456055, | |
| "learning_rate": 2.890583291755351e-05, | |
| "loss": 0.7145, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.850480109739369, | |
| "grad_norm": 0.8137922883033752, | |
| "learning_rate": 2.8854287550123278e-05, | |
| "loss": 0.6835, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.8641975308641975, | |
| "grad_norm": 0.8530572056770325, | |
| "learning_rate": 2.880160396887787e-05, | |
| "loss": 0.6702, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.877914951989026, | |
| "grad_norm": 0.9513605237007141, | |
| "learning_rate": 2.8747786502093258e-05, | |
| "loss": 0.7024, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8916323731138546, | |
| "grad_norm": 0.899515688419342, | |
| "learning_rate": 2.8692839571201e-05, | |
| "loss": 0.6845, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.9053497942386831, | |
| "grad_norm": 0.9002428650856018, | |
| "learning_rate": 2.863676769042498e-05, | |
| "loss": 0.6447, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9190672153635117, | |
| "grad_norm": 0.8433024883270264, | |
| "learning_rate": 2.8579575466410566e-05, | |
| "loss": 0.6325, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.9327846364883402, | |
| "grad_norm": 0.9360433220863342, | |
| "learning_rate": 2.8521267597846094e-05, | |
| "loss": 0.5826, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.9465020576131687, | |
| "grad_norm": 0.9888073205947876, | |
| "learning_rate": 2.8461848875076884e-05, | |
| "loss": 0.6349, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.9602194787379973, | |
| "grad_norm": 0.914045512676239, | |
| "learning_rate": 2.8401324179711678e-05, | |
| "loss": 0.5755, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9739368998628258, | |
| "grad_norm": 0.9859684705734253, | |
| "learning_rate": 2.8339698484221574e-05, | |
| "loss": 0.6325, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.9876543209876543, | |
| "grad_norm": 0.9974327683448792, | |
| "learning_rate": 2.827697685153151e-05, | |
| "loss": 0.5818, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.362316370010376, | |
| "learning_rate": 2.8213164434604316e-05, | |
| "loss": 0.5783, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.0137174211248285, | |
| "grad_norm": 1.4332367181777954, | |
| "learning_rate": 2.814826647601738e-05, | |
| "loss": 0.5402, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.027434842249657, | |
| "grad_norm": 1.2112072706222534, | |
| "learning_rate": 2.8082288307531914e-05, | |
| "loss": 0.5368, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.0411522633744856, | |
| "grad_norm": 1.2344884872436523, | |
| "learning_rate": 2.8015235349654938e-05, | |
| "loss": 0.5097, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.0548696844993142, | |
| "grad_norm": 0.9244922995567322, | |
| "learning_rate": 2.7947113111193936e-05, | |
| "loss": 0.5583, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.0685871056241427, | |
| "grad_norm": 0.980315089225769, | |
| "learning_rate": 2.7877927188804288e-05, | |
| "loss": 0.51, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.0823045267489713, | |
| "grad_norm": 0.9603582620620728, | |
| "learning_rate": 2.7807683266529466e-05, | |
| "loss": 0.5517, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.0960219478737998, | |
| "grad_norm": 1.0928096771240234, | |
| "learning_rate": 2.773638711533405e-05, | |
| "loss": 0.5018, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.1097393689986284, | |
| "grad_norm": 1.0439014434814453, | |
| "learning_rate": 2.7664044592629615e-05, | |
| "loss": 0.5139, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.123456790123457, | |
| "grad_norm": 0.9458956718444824, | |
| "learning_rate": 2.7590661641793513e-05, | |
| "loss": 0.5583, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.1371742112482854, | |
| "grad_norm": 0.9447291493415833, | |
| "learning_rate": 2.7516244291680565e-05, | |
| "loss": 0.4616, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.1508916323731138, | |
| "grad_norm": 1.076687216758728, | |
| "learning_rate": 2.7440798656127792e-05, | |
| "loss": 0.4695, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.1646090534979423, | |
| "grad_norm": 1.0236420631408691, | |
| "learning_rate": 2.7364330933452094e-05, | |
| "loss": 0.5455, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.1783264746227708, | |
| "grad_norm": 1.0733507871627808, | |
| "learning_rate": 2.7286847405941024e-05, | |
| "loss": 0.4956, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.1920438957475994, | |
| "grad_norm": 0.9793803095817566, | |
| "learning_rate": 2.720835443933669e-05, | |
| "loss": 0.5202, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.205761316872428, | |
| "grad_norm": 1.0080183744430542, | |
| "learning_rate": 2.712885848231273e-05, | |
| "loss": 0.4865, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.2194787379972565, | |
| "grad_norm": 0.9846312999725342, | |
| "learning_rate": 2.7048366065944538e-05, | |
| "loss": 0.4843, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.233196159122085, | |
| "grad_norm": 1.0748393535614014, | |
| "learning_rate": 2.6966883803172698e-05, | |
| "loss": 0.4714, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.2469135802469136, | |
| "grad_norm": 1.0638792514801025, | |
| "learning_rate": 2.6884418388259675e-05, | |
| "loss": 0.5295, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.260631001371742, | |
| "grad_norm": 1.0708755254745483, | |
| "learning_rate": 2.6800976596239855e-05, | |
| "loss": 0.4713, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.2743484224965707, | |
| "grad_norm": 0.9797709584236145, | |
| "learning_rate": 2.6716565282362928e-05, | |
| "loss": 0.4489, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.2880658436213992, | |
| "grad_norm": 1.017024278640747, | |
| "learning_rate": 2.663119138153069e-05, | |
| "loss": 0.4699, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.3017832647462277, | |
| "grad_norm": 1.1177973747253418, | |
| "learning_rate": 2.654486190772729e-05, | |
| "loss": 0.4354, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.3155006858710563, | |
| "grad_norm": 0.9993549585342407, | |
| "learning_rate": 2.6457583953443022e-05, | |
| "loss": 0.4882, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.3292181069958848, | |
| "grad_norm": 1.1262377500534058, | |
| "learning_rate": 2.636936468909158e-05, | |
| "loss": 0.41, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.3429355281207134, | |
| "grad_norm": 0.964245080947876, | |
| "learning_rate": 2.628021136242101e-05, | |
| "loss": 0.4364, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.356652949245542, | |
| "grad_norm": 1.0607110261917114, | |
| "learning_rate": 2.619013129791823e-05, | |
| "loss": 0.4891, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.3703703703703702, | |
| "grad_norm": 1.0203349590301514, | |
| "learning_rate": 2.6099131896207327e-05, | |
| "loss": 0.4341, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.3840877914951988, | |
| "grad_norm": 0.9529250860214233, | |
| "learning_rate": 2.6007220633441486e-05, | |
| "loss": 0.4266, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.3978052126200273, | |
| "grad_norm": 0.9978516697883606, | |
| "learning_rate": 2.591440506068883e-05, | |
| "loss": 0.434, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.4115226337448559, | |
| "grad_norm": 1.03233003616333, | |
| "learning_rate": 2.582069280331204e-05, | |
| "loss": 0.3978, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.4252400548696844, | |
| "grad_norm": 1.2834852933883667, | |
| "learning_rate": 2.5726091560341873e-05, | |
| "loss": 0.4496, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.438957475994513, | |
| "grad_norm": 1.0178697109222412, | |
| "learning_rate": 2.5630609103844646e-05, | |
| "loss": 0.433, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.4526748971193415, | |
| "grad_norm": 1.0839297771453857, | |
| "learning_rate": 2.5534253278283725e-05, | |
| "loss": 0.4494, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.46639231824417, | |
| "grad_norm": 0.9672828316688538, | |
| "learning_rate": 2.5437031999875047e-05, | |
| "loss": 0.4438, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.4801097393689986, | |
| "grad_norm": 0.9954227805137634, | |
| "learning_rate": 2.533895325593674e-05, | |
| "loss": 0.4253, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.4938271604938271, | |
| "grad_norm": 0.9949136972427368, | |
| "learning_rate": 2.5240025104232938e-05, | |
| "loss": 0.4565, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.5075445816186557, | |
| "grad_norm": 1.2475636005401611, | |
| "learning_rate": 2.514025567231178e-05, | |
| "loss": 0.397, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.5212620027434842, | |
| "grad_norm": 1.0037999153137207, | |
| "learning_rate": 2.5039653156837686e-05, | |
| "loss": 0.3955, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.5349794238683128, | |
| "grad_norm": 1.2002379894256592, | |
| "learning_rate": 2.4938225822917932e-05, | |
| "loss": 0.3541, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.5486968449931413, | |
| "grad_norm": 1.0180314779281616, | |
| "learning_rate": 2.4835982003423654e-05, | |
| "loss": 0.403, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.5624142661179699, | |
| "grad_norm": 1.017683982849121, | |
| "learning_rate": 2.473293009830522e-05, | |
| "loss": 0.4082, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.5761316872427984, | |
| "grad_norm": 1.055148720741272, | |
| "learning_rate": 2.4629078573902136e-05, | |
| "loss": 0.4118, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.589849108367627, | |
| "grad_norm": 1.082885980606079, | |
| "learning_rate": 2.45244359622475e-05, | |
| "loss": 0.3646, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.6035665294924555, | |
| "grad_norm": 0.9864196181297302, | |
| "learning_rate": 2.4419010860367013e-05, | |
| "loss": 0.3726, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.617283950617284, | |
| "grad_norm": 1.0728785991668701, | |
| "learning_rate": 2.431281192957271e-05, | |
| "loss": 0.3665, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.6310013717421126, | |
| "grad_norm": 1.0340756177902222, | |
| "learning_rate": 2.4205847894751358e-05, | |
| "loss": 0.402, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.6447187928669411, | |
| "grad_norm": 0.9732591509819031, | |
| "learning_rate": 2.409812754364768e-05, | |
| "loss": 0.3913, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.6584362139917697, | |
| "grad_norm": 0.9975690245628357, | |
| "learning_rate": 2.398965972614235e-05, | |
| "loss": 0.356, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.6721536351165982, | |
| "grad_norm": 1.0766946077346802, | |
| "learning_rate": 2.3880453353524963e-05, | |
| "loss": 0.3921, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.6858710562414267, | |
| "grad_norm": 1.0128309726715088, | |
| "learning_rate": 2.377051739776189e-05, | |
| "loss": 0.4264, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.6995884773662553, | |
| "grad_norm": 1.0055245161056519, | |
| "learning_rate": 2.3659860890759184e-05, | |
| "loss": 0.3296, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.7133058984910838, | |
| "grad_norm": 1.058605432510376, | |
| "learning_rate": 2.3548492923620567e-05, | |
| "loss": 0.3617, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.7270233196159122, | |
| "grad_norm": 1.0829980373382568, | |
| "learning_rate": 2.343642264590051e-05, | |
| "loss": 0.3801, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.7407407407407407, | |
| "grad_norm": 1.2426400184631348, | |
| "learning_rate": 2.3323659264852586e-05, | |
| "loss": 0.3491, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.7544581618655692, | |
| "grad_norm": 1.0566037893295288, | |
| "learning_rate": 2.3210212044672995e-05, | |
| "loss": 0.3547, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.7681755829903978, | |
| "grad_norm": 0.9220610857009888, | |
| "learning_rate": 2.3096090305739476e-05, | |
| "loss": 0.3682, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.7818930041152263, | |
| "grad_norm": 1.124056100845337, | |
| "learning_rate": 2.298130342384559e-05, | |
| "loss": 0.3236, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.7956104252400549, | |
| "grad_norm": 1.10678231716156, | |
| "learning_rate": 2.2865860829430405e-05, | |
| "loss": 0.359, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.8093278463648834, | |
| "grad_norm": 1.1028835773468018, | |
| "learning_rate": 2.2749772006803782e-05, | |
| "loss": 0.3276, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.823045267489712, | |
| "grad_norm": 0.9391087889671326, | |
| "learning_rate": 2.2633046493367128e-05, | |
| "loss": 0.3589, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.8367626886145405, | |
| "grad_norm": 1.010254979133606, | |
| "learning_rate": 2.2515693878829872e-05, | |
| "loss": 0.3517, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.850480109739369, | |
| "grad_norm": 1.0423787832260132, | |
| "learning_rate": 2.2397723804421613e-05, | |
| "loss": 0.32, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.8641975308641974, | |
| "grad_norm": 1.1941534280776978, | |
| "learning_rate": 2.227914596210002e-05, | |
| "loss": 0.3128, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.877914951989026, | |
| "grad_norm": 0.9325462579727173, | |
| "learning_rate": 2.2159970093754583e-05, | |
| "loss": 0.3595, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.8916323731138545, | |
| "grad_norm": 1.2271301746368408, | |
| "learning_rate": 2.2040205990406257e-05, | |
| "loss": 0.3118, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.905349794238683, | |
| "grad_norm": 0.9768636226654053, | |
| "learning_rate": 2.1919863491403083e-05, | |
| "loss": 0.3858, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.9190672153635115, | |
| "grad_norm": 1.3226031064987183, | |
| "learning_rate": 2.1798952483611812e-05, | |
| "loss": 0.3268, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.93278463648834, | |
| "grad_norm": 0.8924455046653748, | |
| "learning_rate": 2.167748290060564e-05, | |
| "loss": 0.2887, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.9465020576131686, | |
| "grad_norm": 1.220363974571228, | |
| "learning_rate": 2.1555464721848107e-05, | |
| "loss": 0.3174, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.9602194787379972, | |
| "grad_norm": 1.1989498138427734, | |
| "learning_rate": 2.1432907971873225e-05, | |
| "loss": 0.3026, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.9739368998628257, | |
| "grad_norm": 1.065783977508545, | |
| "learning_rate": 2.1309822719461905e-05, | |
| "loss": 0.3121, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.9876543209876543, | |
| "grad_norm": 1.019674301147461, | |
| "learning_rate": 2.118621907681474e-05, | |
| "loss": 0.2984, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.3716942071914673, | |
| "learning_rate": 2.106210719872121e-05, | |
| "loss": 0.2699, | |
| "step": 730 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1825, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1214531076208722e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |