{
  "best_global_step": 1860,
  "best_metric": 0.7261904761904762,
  "best_model_checkpoint": "/www/wwwroot/ai_project/model/checkpoint-1260",
  "epoch": 3.0,
  "eval_steps": 30,
  "global_step": 2514,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011933174224343675,
      "grad_norm": 7.105306148529053,
      "learning_rate": 9e-06,
      "loss": 0.6856,
      "step": 10
    },
    {
      "epoch": 0.02386634844868735,
      "grad_norm": 3.5623154640197754,
      "learning_rate": 1.9e-05,
      "loss": 0.6737,
      "step": 20
    },
    {
      "epoch": 0.03579952267303103,
      "grad_norm": 8.588749885559082,
      "learning_rate": 2.9e-05,
      "loss": 0.7264,
      "step": 30
    },
    {
      "epoch": 0.03579952267303103,
      "eval_accuracy": 0.6285714285714286,
      "eval_loss": 0.680474042892456,
      "eval_runtime": 2.9486,
      "eval_samples_per_second": 142.438,
      "eval_steps_per_second": 71.219,
      "step": 30
    },
    {
      "epoch": 0.0477326968973747,
      "grad_norm": 7.816359996795654,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.7096,
      "step": 40
    },
    {
      "epoch": 0.059665871121718374,
      "grad_norm": 7.00941276550293,
      "learning_rate": 4.9e-05,
      "loss": 0.669,
      "step": 50
    },
    {
      "epoch": 0.07159904534606205,
      "grad_norm": 5.085658073425293,
      "learning_rate": 4.9817370129870134e-05,
      "loss": 0.6139,
      "step": 60
    },
    {
      "epoch": 0.07159904534606205,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6627190113067627,
      "eval_runtime": 2.8882,
      "eval_samples_per_second": 145.422,
      "eval_steps_per_second": 72.711,
      "step": 60
    },
    {
      "epoch": 0.08353221957040573,
      "grad_norm": 4.888000011444092,
      "learning_rate": 4.961444805194805e-05,
      "loss": 0.5252,
      "step": 70
    },
    {
      "epoch": 0.0954653937947494,
      "grad_norm": 9.089728355407715,
      "learning_rate": 4.9411525974025976e-05,
      "loss": 0.7837,
      "step": 80
    },
    {
      "epoch": 0.10739856801909307,
      "grad_norm": 3.44459867477417,
      "learning_rate": 4.92086038961039e-05,
      "loss": 0.6985,
      "step": 90
    },
    {
      "epoch": 0.10739856801909307,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6658735275268555,
      "eval_runtime": 2.5125,
      "eval_samples_per_second": 167.162,
      "eval_steps_per_second": 83.581,
      "step": 90
    },
    {
      "epoch": 0.11933174224343675,
      "grad_norm": 3.5227367877960205,
      "learning_rate": 4.900568181818182e-05,
      "loss": 0.6999,
      "step": 100
    },
    {
      "epoch": 0.13126491646778043,
      "grad_norm": 8.138216972351074,
      "learning_rate": 4.880275974025974e-05,
      "loss": 0.8203,
      "step": 110
    },
    {
      "epoch": 0.1431980906921241,
      "grad_norm": 3.1432907581329346,
      "learning_rate": 4.859983766233767e-05,
      "loss": 0.6327,
      "step": 120
    },
    {
      "epoch": 0.1431980906921241,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6644813418388367,
      "eval_runtime": 2.9572,
      "eval_samples_per_second": 142.025,
      "eval_steps_per_second": 71.013,
      "step": 120
    },
    {
      "epoch": 0.15513126491646778,
      "grad_norm": 2.8620526790618896,
      "learning_rate": 4.8396915584415585e-05,
      "loss": 0.6161,
      "step": 130
    },
    {
      "epoch": 0.16706443914081145,
      "grad_norm": 3.9090893268585205,
      "learning_rate": 4.819399350649351e-05,
      "loss": 0.6733,
      "step": 140
    },
    {
      "epoch": 0.17899761336515513,
      "grad_norm": 8.569962501525879,
      "learning_rate": 4.7991071428571433e-05,
      "loss": 0.6391,
      "step": 150
    },
    {
      "epoch": 0.17899761336515513,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6607769131660461,
      "eval_runtime": 3.3316,
      "eval_samples_per_second": 126.065,
      "eval_steps_per_second": 63.033,
      "step": 150
    },
    {
      "epoch": 0.1909307875894988,
      "grad_norm": 5.559383392333984,
      "learning_rate": 4.778814935064935e-05,
      "loss": 0.6183,
      "step": 160
    },
    {
      "epoch": 0.20286396181384247,
      "grad_norm": 3.5374624729156494,
      "learning_rate": 4.7585227272727276e-05,
      "loss": 0.6068,
      "step": 170
    },
    {
      "epoch": 0.21479713603818615,
      "grad_norm": 5.332399368286133,
      "learning_rate": 4.73823051948052e-05,
      "loss": 0.5847,
      "step": 180
    },
    {
      "epoch": 0.21479713603818615,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6628832817077637,
      "eval_runtime": 3.3183,
      "eval_samples_per_second": 126.57,
      "eval_steps_per_second": 63.285,
      "step": 180
    },
    {
      "epoch": 0.22673031026252982,
      "grad_norm": 4.872141361236572,
      "learning_rate": 4.717938311688312e-05,
      "loss": 0.6055,
      "step": 190
    },
    {
      "epoch": 0.2386634844868735,
      "grad_norm": 4.306164264678955,
      "learning_rate": 4.697646103896104e-05,
      "loss": 0.6895,
      "step": 200
    },
    {
      "epoch": 0.25059665871121717,
      "grad_norm": 3.5854930877685547,
      "learning_rate": 4.6773538961038967e-05,
      "loss": 0.693,
      "step": 210
    },
    {
      "epoch": 0.25059665871121717,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6654404401779175,
      "eval_runtime": 2.7449,
      "eval_samples_per_second": 153.01,
      "eval_steps_per_second": 76.505,
      "step": 210
    },
    {
      "epoch": 0.26252983293556087,
      "grad_norm": 5.1247782707214355,
      "learning_rate": 4.6570616883116884e-05,
      "loss": 0.7442,
      "step": 220
    },
    {
      "epoch": 0.2744630071599045,
      "grad_norm": 3.291513681411743,
      "learning_rate": 4.636769480519481e-05,
      "loss": 0.6717,
      "step": 230
    },
    {
      "epoch": 0.2863961813842482,
      "grad_norm": 4.713298797607422,
      "learning_rate": 4.616477272727273e-05,
      "loss": 0.65,
      "step": 240
    },
    {
      "epoch": 0.2863961813842482,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.659385621547699,
      "eval_runtime": 2.8781,
      "eval_samples_per_second": 145.928,
      "eval_steps_per_second": 72.964,
      "step": 240
    },
    {
      "epoch": 0.29832935560859186,
      "grad_norm": 2.7979941368103027,
      "learning_rate": 4.596185064935065e-05,
      "loss": 0.65,
      "step": 250
    },
    {
      "epoch": 0.31026252983293556,
      "grad_norm": 2.7257330417633057,
      "learning_rate": 4.5758928571428575e-05,
      "loss": 0.5902,
      "step": 260
    },
    {
      "epoch": 0.3221957040572792,
      "grad_norm": 3.3188984394073486,
      "learning_rate": 4.55560064935065e-05,
      "loss": 0.5494,
      "step": 270
    },
    {
      "epoch": 0.3221957040572792,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6594758629798889,
      "eval_runtime": 3.3768,
      "eval_samples_per_second": 124.379,
      "eval_steps_per_second": 62.189,
      "step": 270
    },
    {
      "epoch": 0.3341288782816229,
      "grad_norm": 5.446512222290039,
      "learning_rate": 4.535308441558442e-05,
      "loss": 0.7618,
      "step": 280
    },
    {
      "epoch": 0.3460620525059666,
      "grad_norm": 5.833828926086426,
      "learning_rate": 4.5150162337662335e-05,
      "loss": 0.6402,
      "step": 290
    },
    {
      "epoch": 0.35799522673031026,
      "grad_norm": 3.265965223312378,
      "learning_rate": 4.4947240259740266e-05,
      "loss": 0.5794,
      "step": 300
    },
    {
      "epoch": 0.35799522673031026,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6596417427062988,
      "eval_runtime": 2.8478,
      "eval_samples_per_second": 147.481,
      "eval_steps_per_second": 73.741,
      "step": 300
    },
    {
      "epoch": 0.36992840095465396,
      "grad_norm": 3.3324408531188965,
      "learning_rate": 4.4744318181818184e-05,
      "loss": 0.6289,
      "step": 310
    },
    {
      "epoch": 0.3818615751789976,
      "grad_norm": 3.396897315979004,
      "learning_rate": 4.45413961038961e-05,
      "loss": 0.7469,
      "step": 320
    },
    {
      "epoch": 0.3937947494033413,
      "grad_norm": 5.416648864746094,
      "learning_rate": 4.433847402597403e-05,
      "loss": 0.6168,
      "step": 330
    },
    {
      "epoch": 0.3937947494033413,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.660291314125061,
      "eval_runtime": 2.7128,
      "eval_samples_per_second": 154.824,
      "eval_steps_per_second": 77.412,
      "step": 330
    },
    {
      "epoch": 0.40572792362768495,
      "grad_norm": 3.854402542114258,
      "learning_rate": 4.413555194805195e-05,
      "loss": 0.5962,
      "step": 340
    },
    {
      "epoch": 0.41766109785202865,
      "grad_norm": 3.4200260639190674,
      "learning_rate": 4.393262987012987e-05,
      "loss": 0.6299,
      "step": 350
    },
    {
      "epoch": 0.4295942720763723,
      "grad_norm": 4.237409591674805,
      "learning_rate": 4.37297077922078e-05,
      "loss": 0.7804,
      "step": 360
    },
    {
      "epoch": 0.4295942720763723,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6622642874717712,
      "eval_runtime": 2.8864,
      "eval_samples_per_second": 145.51,
      "eval_steps_per_second": 72.755,
      "step": 360
    },
    {
      "epoch": 0.441527446300716,
      "grad_norm": 4.508576393127441,
      "learning_rate": 4.352678571428572e-05,
      "loss": 0.7417,
      "step": 370
    },
    {
      "epoch": 0.45346062052505964,
      "grad_norm": 2.8814594745635986,
      "learning_rate": 4.3323863636363635e-05,
      "loss": 0.6707,
      "step": 380
    },
    {
      "epoch": 0.46539379474940334,
      "grad_norm": 6.597533702850342,
      "learning_rate": 4.3120941558441566e-05,
      "loss": 0.5901,
      "step": 390
    },
    {
      "epoch": 0.46539379474940334,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6637502312660217,
      "eval_runtime": 3.1072,
      "eval_samples_per_second": 135.172,
      "eval_steps_per_second": 67.586,
      "step": 390
    },
    {
      "epoch": 0.477326968973747,
      "grad_norm": 5.29579496383667,
      "learning_rate": 4.2918019480519484e-05,
      "loss": 0.701,
      "step": 400
    },
    {
      "epoch": 0.4892601431980907,
      "grad_norm": 3.737311601638794,
      "learning_rate": 4.27150974025974e-05,
      "loss": 0.5864,
      "step": 410
    },
    {
      "epoch": 0.5011933174224343,
      "grad_norm": 4.80122184753418,
      "learning_rate": 4.2512175324675326e-05,
      "loss": 0.6265,
      "step": 420
    },
    {
      "epoch": 0.5011933174224343,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.663756251335144,
      "eval_runtime": 3.0077,
      "eval_samples_per_second": 139.639,
      "eval_steps_per_second": 69.82,
      "step": 420
    },
    {
      "epoch": 0.513126491646778,
      "grad_norm": 3.392441749572754,
      "learning_rate": 4.230925324675325e-05,
      "loss": 0.7258,
      "step": 430
    },
    {
      "epoch": 0.5250596658711217,
      "grad_norm": 2.653325319290161,
      "learning_rate": 4.210633116883117e-05,
      "loss": 0.7126,
      "step": 440
    },
    {
      "epoch": 0.5369928400954654,
      "grad_norm": 2.605800151824951,
      "learning_rate": 4.190340909090909e-05,
      "loss": 0.6815,
      "step": 450
    },
    {
      "epoch": 0.5369928400954654,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6631113886833191,
      "eval_runtime": 3.0366,
      "eval_samples_per_second": 138.313,
      "eval_steps_per_second": 69.157,
      "step": 450
    },
    {
      "epoch": 0.548926014319809,
      "grad_norm": 5.945620536804199,
      "learning_rate": 4.170048701298702e-05,
      "loss": 0.6262,
      "step": 460
    },
    {
      "epoch": 0.5608591885441527,
      "grad_norm": 4.985898494720459,
      "learning_rate": 4.1497564935064935e-05,
      "loss": 0.6651,
      "step": 470
    },
    {
      "epoch": 0.5727923627684964,
      "grad_norm": 2.658013343811035,
      "learning_rate": 4.129464285714286e-05,
      "loss": 0.6781,
      "step": 480
    },
    {
      "epoch": 0.5727923627684964,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6566287875175476,
      "eval_runtime": 3.8874,
      "eval_samples_per_second": 108.042,
      "eval_steps_per_second": 54.021,
      "step": 480
    },
    {
      "epoch": 0.5847255369928401,
      "grad_norm": 7.678948879241943,
      "learning_rate": 4.1091720779220783e-05,
      "loss": 0.7041,
      "step": 490
    },
    {
      "epoch": 0.5966587112171837,
      "grad_norm": 2.540349006652832,
      "learning_rate": 4.08887987012987e-05,
      "loss": 0.6513,
      "step": 500
    },
    {
      "epoch": 0.6085918854415274,
      "grad_norm": 2.527493715286255,
      "learning_rate": 4.0685876623376626e-05,
      "loss": 0.6985,
      "step": 510
    },
    {
      "epoch": 0.6085918854415274,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6595126986503601,
      "eval_runtime": 3.3607,
      "eval_samples_per_second": 124.974,
      "eval_steps_per_second": 62.487,
      "step": 510
    },
    {
      "epoch": 0.6205250596658711,
      "grad_norm": 5.219762802124023,
      "learning_rate": 4.048295454545455e-05,
      "loss": 0.6298,
      "step": 520
    },
    {
      "epoch": 0.6324582338902148,
      "grad_norm": 5.685338020324707,
      "learning_rate": 4.028003246753247e-05,
      "loss": 0.6654,
      "step": 530
    },
    {
      "epoch": 0.6443914081145584,
      "grad_norm": 5.298613548278809,
      "learning_rate": 4.007711038961039e-05,
      "loss": 0.6984,
      "step": 540
    },
    {
      "epoch": 0.6443914081145584,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6536137461662292,
      "eval_runtime": 4.863,
      "eval_samples_per_second": 86.366,
      "eval_steps_per_second": 43.183,
      "step": 540
    },
    {
      "epoch": 0.6563245823389021,
      "grad_norm": 4.052762508392334,
      "learning_rate": 3.9874188311688317e-05,
      "loss": 0.6984,
      "step": 550
    },
    {
      "epoch": 0.6682577565632458,
      "grad_norm": 8.290848731994629,
      "learning_rate": 3.9671266233766234e-05,
      "loss": 0.6689,
      "step": 560
    },
    {
      "epoch": 0.6801909307875895,
      "grad_norm": 2.341036081314087,
      "learning_rate": 3.946834415584416e-05,
      "loss": 0.661,
      "step": 570
    },
    {
      "epoch": 0.6801909307875895,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6538602709770203,
      "eval_runtime": 2.9233,
      "eval_samples_per_second": 143.672,
      "eval_steps_per_second": 71.836,
      "step": 570
    },
    {
      "epoch": 0.6921241050119332,
      "grad_norm": 4.4188737869262695,
      "learning_rate": 3.926542207792208e-05,
      "loss": 0.6026,
      "step": 580
    },
    {
      "epoch": 0.7040572792362768,
      "grad_norm": 4.814696788787842,
      "learning_rate": 3.90625e-05,
      "loss": 0.6565,
      "step": 590
    },
    {
      "epoch": 0.7159904534606205,
      "grad_norm": 5.101158142089844,
      "learning_rate": 3.8859577922077925e-05,
      "loss": 0.5595,
      "step": 600
    },
    {
      "epoch": 0.7159904534606205,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6548095941543579,
      "eval_runtime": 3.2011,
      "eval_samples_per_second": 131.206,
      "eval_steps_per_second": 65.603,
      "step": 600
    },
    {
      "epoch": 0.7279236276849642,
      "grad_norm": 3.732052803039551,
      "learning_rate": 3.865665584415585e-05,
      "loss": 0.7697,
      "step": 610
    },
    {
      "epoch": 0.7398568019093079,
      "grad_norm": 6.07219934463501,
      "learning_rate": 3.845373376623377e-05,
      "loss": 0.674,
      "step": 620
    },
    {
      "epoch": 0.7517899761336515,
      "grad_norm": 3.234180212020874,
      "learning_rate": 3.825081168831169e-05,
      "loss": 0.638,
      "step": 630
    },
    {
      "epoch": 0.7517899761336515,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.647255003452301,
      "eval_runtime": 3.0145,
      "eval_samples_per_second": 139.325,
      "eval_steps_per_second": 69.663,
      "step": 630
    },
    {
      "epoch": 0.7637231503579952,
      "grad_norm": 4.247595310211182,
      "learning_rate": 3.8047889610389616e-05,
      "loss": 0.6979,
      "step": 640
    },
    {
      "epoch": 0.7756563245823389,
      "grad_norm": 6.84116268157959,
      "learning_rate": 3.7844967532467534e-05,
      "loss": 0.6084,
      "step": 650
    },
    {
      "epoch": 0.7875894988066826,
      "grad_norm": 5.435266017913818,
      "learning_rate": 3.764204545454545e-05,
      "loss": 0.6514,
      "step": 660
    },
    {
      "epoch": 0.7875894988066826,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6437537670135498,
      "eval_runtime": 2.3127,
      "eval_samples_per_second": 181.608,
      "eval_steps_per_second": 90.804,
      "step": 660
    },
    {
      "epoch": 0.7995226730310262,
      "grad_norm": 2.817640781402588,
      "learning_rate": 3.743912337662338e-05,
      "loss": 0.696,
      "step": 670
    },
    {
      "epoch": 0.8114558472553699,
      "grad_norm": 2.5305979251861572,
      "learning_rate": 3.72362012987013e-05,
      "loss": 0.723,
      "step": 680
    },
    {
      "epoch": 0.8233890214797136,
      "grad_norm": 8.556059837341309,
      "learning_rate": 3.703327922077922e-05,
      "loss": 0.661,
      "step": 690
    },
    {
      "epoch": 0.8233890214797136,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.644755482673645,
      "eval_runtime": 2.2808,
      "eval_samples_per_second": 184.148,
      "eval_steps_per_second": 92.074,
      "step": 690
    },
    {
      "epoch": 0.8353221957040573,
      "grad_norm": 2.903482675552368,
      "learning_rate": 3.683035714285715e-05,
      "loss": 0.6023,
      "step": 700
    },
    {
      "epoch": 0.847255369928401,
      "grad_norm": 4.76421594619751,
      "learning_rate": 3.662743506493507e-05,
      "loss": 0.628,
      "step": 710
    },
    {
      "epoch": 0.8591885441527446,
      "grad_norm": 4.305855751037598,
      "learning_rate": 3.6424512987012985e-05,
      "loss": 0.6648,
      "step": 720
    },
    {
      "epoch": 0.8591885441527446,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6386857032775879,
      "eval_runtime": 2.3714,
      "eval_samples_per_second": 177.11,
      "eval_steps_per_second": 88.555,
      "step": 720
    },
    {
      "epoch": 0.8711217183770883,
      "grad_norm": 6.615411758422852,
      "learning_rate": 3.6221590909090916e-05,
      "loss": 0.6341,
      "step": 730
    },
    {
      "epoch": 0.883054892601432,
      "grad_norm": 6.593195915222168,
      "learning_rate": 3.6018668831168834e-05,
      "loss": 0.6311,
      "step": 740
    },
    {
      "epoch": 0.8949880668257757,
      "grad_norm": 4.931580543518066,
      "learning_rate": 3.581574675324675e-05,
      "loss": 0.56,
      "step": 750
    },
    {
      "epoch": 0.8949880668257757,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6398793458938599,
      "eval_runtime": 2.3361,
      "eval_samples_per_second": 179.79,
      "eval_steps_per_second": 89.895,
      "step": 750
    },
    {
      "epoch": 0.9069212410501193,
      "grad_norm": 3.87886118888855,
      "learning_rate": 3.561282467532468e-05,
      "loss": 0.6329,
      "step": 760
    },
    {
      "epoch": 0.918854415274463,
      "grad_norm": 3.161126136779785,
      "learning_rate": 3.54099025974026e-05,
      "loss": 0.7029,
      "step": 770
    },
    {
      "epoch": 0.9307875894988067,
      "grad_norm": 7.052578449249268,
      "learning_rate": 3.520698051948052e-05,
      "loss": 0.6108,
      "step": 780
    },
    {
      "epoch": 0.9307875894988067,
      "eval_accuracy": 0.6523809523809524,
      "eval_loss": 0.6286986470222473,
      "eval_runtime": 2.6937,
      "eval_samples_per_second": 155.92,
      "eval_steps_per_second": 77.96,
      "step": 780
    },
    {
      "epoch": 0.9427207637231504,
      "grad_norm": 14.41779899597168,
      "learning_rate": 3.500405844155844e-05,
      "loss": 0.6523,
      "step": 790
    },
    {
      "epoch": 0.954653937947494,
      "grad_norm": 4.530862331390381,
      "learning_rate": 3.480113636363637e-05,
      "loss": 0.6565,
      "step": 800
    },
    {
      "epoch": 0.9665871121718377,
      "grad_norm": 3.9421231746673584,
      "learning_rate": 3.4598214285714284e-05,
      "loss": 0.5472,
      "step": 810
    },
    {
      "epoch": 0.9665871121718377,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6442738771438599,
      "eval_runtime": 2.525,
      "eval_samples_per_second": 166.334,
      "eval_steps_per_second": 83.167,
      "step": 810
    },
    {
      "epoch": 0.9785202863961814,
      "grad_norm": 3.9132273197174072,
      "learning_rate": 3.439529220779221e-05,
      "loss": 0.5797,
      "step": 820
    },
    {
      "epoch": 0.9904534606205251,
      "grad_norm": 4.5334086418151855,
      "learning_rate": 3.4192370129870133e-05,
      "loss": 0.7948,
      "step": 830
    },
    {
      "epoch": 1.0023866348448687,
      "grad_norm": 3.76124906539917,
      "learning_rate": 3.398944805194805e-05,
      "loss": 0.5284,
      "step": 840
    },
    {
      "epoch": 1.0023866348448687,
      "eval_accuracy": 0.6238095238095238,
      "eval_loss": 0.6406013369560242,
      "eval_runtime": 2.4016,
      "eval_samples_per_second": 174.884,
      "eval_steps_per_second": 87.442,
      "step": 840
    },
    {
      "epoch": 1.0143198090692125,
      "grad_norm": 10.1142578125,
      "learning_rate": 3.3786525974025976e-05,
      "loss": 0.8632,
      "step": 850
    },
    {
      "epoch": 1.026252983293556,
      "grad_norm": 5.389456272125244,
      "learning_rate": 3.35836038961039e-05,
      "loss": 0.5965,
      "step": 860
    },
    {
      "epoch": 1.0381861575178997,
      "grad_norm": 5.022064685821533,
      "learning_rate": 3.338068181818182e-05,
      "loss": 0.5518,
      "step": 870
    },
    {
      "epoch": 1.0381861575178997,
      "eval_accuracy": 0.6214285714285714,
      "eval_loss": 0.6387470960617065,
      "eval_runtime": 2.4702,
      "eval_samples_per_second": 170.027,
      "eval_steps_per_second": 85.013,
      "step": 870
    },
    {
      "epoch": 1.0501193317422435,
      "grad_norm": 3.8693058490753174,
      "learning_rate": 3.317775974025974e-05,
      "loss": 0.6257,
      "step": 880
    },
    {
      "epoch": 1.062052505966587,
      "grad_norm": 6.848055839538574,
      "learning_rate": 3.2974837662337667e-05,
      "loss": 0.7654,
      "step": 890
    },
    {
      "epoch": 1.0739856801909309,
      "grad_norm": 6.271612644195557,
      "learning_rate": 3.2771915584415584e-05,
      "loss": 0.5958,
      "step": 900
    },
    {
      "epoch": 1.0739856801909309,
      "eval_accuracy": 0.6619047619047619,
      "eval_loss": 0.6196722388267517,
      "eval_runtime": 2.5711,
      "eval_samples_per_second": 163.357,
      "eval_steps_per_second": 81.678,
      "step": 900
    },
    {
      "epoch": 1.0859188544152745,
      "grad_norm": 5.1813764572143555,
      "learning_rate": 3.256899350649351e-05,
      "loss": 0.6688,
      "step": 910
    },
    {
      "epoch": 1.097852028639618,
      "grad_norm": 8.188650131225586,
      "learning_rate": 3.236607142857143e-05,
      "loss": 0.5075,
      "step": 920
    },
    {
      "epoch": 1.1097852028639619,
      "grad_norm": 5.9253249168396,
      "learning_rate": 3.216314935064935e-05,
      "loss": 0.5268,
      "step": 930
    },
    {
      "epoch": 1.1097852028639619,
      "eval_accuracy": 0.6547619047619048,
      "eval_loss": 0.6201965808868408,
      "eval_runtime": 2.6071,
      "eval_samples_per_second": 161.097,
      "eval_steps_per_second": 80.549,
      "step": 930
    },
    {
      "epoch": 1.1217183770883055,
      "grad_norm": 8.462143898010254,
      "learning_rate": 3.1960227272727275e-05,
      "loss": 0.6805,
      "step": 940
    },
    {
      "epoch": 1.1336515513126493,
      "grad_norm": 3.664438247680664,
      "learning_rate": 3.17573051948052e-05,
      "loss": 0.5489,
      "step": 950
    },
    {
      "epoch": 1.1455847255369929,
      "grad_norm": 3.6083319187164307,
      "learning_rate": 3.155438311688312e-05,
      "loss": 0.5184,
      "step": 960
    },
    {
      "epoch": 1.1455847255369929,
      "eval_accuracy": 0.638095238095238,
      "eval_loss": 0.6298591494560242,
      "eval_runtime": 2.2121,
      "eval_samples_per_second": 189.863,
      "eval_steps_per_second": 94.932,
      "step": 960
    },
    {
      "epoch": 1.1575178997613365,
      "grad_norm": 4.299609184265137,
      "learning_rate": 3.135146103896104e-05,
      "loss": 0.5752,
      "step": 970
    },
    {
      "epoch": 1.1694510739856803,
      "grad_norm": 4.7070183753967285,
      "learning_rate": 3.1148538961038966e-05,
      "loss": 0.6226,
      "step": 980
    },
    {
      "epoch": 1.1813842482100239,
      "grad_norm": 4.521914005279541,
      "learning_rate": 3.0945616883116884e-05,
      "loss": 0.5337,
      "step": 990
    },
    {
      "epoch": 1.1813842482100239,
      "eval_accuracy": 0.6428571428571429,
      "eval_loss": 0.6394156217575073,
      "eval_runtime": 2.1577,
      "eval_samples_per_second": 194.651,
      "eval_steps_per_second": 97.325,
      "step": 990
    },
    {
      "epoch": 1.1933174224343674,
      "grad_norm": 5.360909461975098,
      "learning_rate": 3.07426948051948e-05,
      "loss": 0.7913,
      "step": 1000
    },
    {
      "epoch": 1.2052505966587113,
      "grad_norm": 6.7155585289001465,
      "learning_rate": 3.053977272727273e-05,
      "loss": 0.6489,
      "step": 1010
    },
    {
      "epoch": 1.2171837708830548,
      "grad_norm": 7.668753147125244,
      "learning_rate": 3.033685064935065e-05,
      "loss": 0.5592,
      "step": 1020
    },
    {
      "epoch": 1.2171837708830548,
      "eval_accuracy": 0.6714285714285714,
      "eval_loss": 0.6178216934204102,
      "eval_runtime": 2.4819,
      "eval_samples_per_second": 169.226,
      "eval_steps_per_second": 84.613,
      "step": 1020
    },
    {
      "epoch": 1.2291169451073987,
      "grad_norm": 3.813314914703369,
      "learning_rate": 3.013392857142857e-05,
      "loss": 0.5319,
      "step": 1030
    },
    {
      "epoch": 1.2410501193317423,
      "grad_norm": 8.704251289367676,
      "learning_rate": 2.9931006493506496e-05,
      "loss": 0.6233,
      "step": 1040
    },
    {
      "epoch": 1.2529832935560858,
      "grad_norm": 10.739197731018066,
      "learning_rate": 2.9728084415584417e-05,
      "loss": 0.6285,
      "step": 1050
    },
    {
      "epoch": 1.2529832935560858,
      "eval_accuracy": 0.6761904761904762,
      "eval_loss": 0.6063261032104492,
      "eval_runtime": 2.5831,
      "eval_samples_per_second": 162.595,
      "eval_steps_per_second": 81.298,
      "step": 1050
    },
    {
      "epoch": 1.2649164677804297,
      "grad_norm": 8.07175064086914,
      "learning_rate": 2.9525162337662338e-05,
      "loss": 0.5288,
      "step": 1060
    },
    {
      "epoch": 1.2768496420047732,
      "grad_norm": 11.14047622680664,
      "learning_rate": 2.9322240259740263e-05,
      "loss": 0.6402,
      "step": 1070
    },
    {
      "epoch": 1.288782816229117,
      "grad_norm": 17.40794563293457,
      "learning_rate": 2.9119318181818184e-05,
      "loss": 0.6681,
      "step": 1080
    },
    {
      "epoch": 1.288782816229117,
      "eval_accuracy": 0.6738095238095239,
      "eval_loss": 0.6034413576126099,
      "eval_runtime": 2.1352,
      "eval_samples_per_second": 196.701,
      "eval_steps_per_second": 98.351,
      "step": 1080
    },
    {
      "epoch": 1.3007159904534606,
      "grad_norm": 6.053748607635498,
      "learning_rate": 2.8916396103896105e-05,
      "loss": 0.6375,
      "step": 1090
    },
    {
      "epoch": 1.3126491646778042,
      "grad_norm": 3.6662399768829346,
      "learning_rate": 2.871347402597403e-05,
      "loss": 0.4657,
      "step": 1100
    },
    {
      "epoch": 1.324582338902148,
      "grad_norm": 7.520589351654053,
      "learning_rate": 2.851055194805195e-05,
      "loss": 0.6052,
      "step": 1110
    },
    {
      "epoch": 1.324582338902148,
      "eval_accuracy": 0.680952380952381,
      "eval_loss": 0.589663028717041,
      "eval_runtime": 2.3348,
      "eval_samples_per_second": 179.884,
      "eval_steps_per_second": 89.942,
      "step": 1110
    },
    {
      "epoch": 1.3365155131264916,
      "grad_norm": 5.6362409591674805,
      "learning_rate": 2.830762987012987e-05,
      "loss": 0.4708,
      "step": 1120
    },
    {
      "epoch": 1.3484486873508352,
      "grad_norm": 6.884702205657959,
      "learning_rate": 2.8104707792207796e-05,
      "loss": 0.6569,
      "step": 1130
    },
    {
      "epoch": 1.360381861575179,
      "grad_norm": 6.990943431854248,
      "learning_rate": 2.7901785714285717e-05,
      "loss": 0.5346,
      "step": 1140
    },
    {
      "epoch": 1.360381861575179,
      "eval_accuracy": 0.6857142857142857,
      "eval_loss": 0.574230432510376,
      "eval_runtime": 2.5972,
      "eval_samples_per_second": 161.712,
      "eval_steps_per_second": 80.856,
      "step": 1140
    },
    {
      "epoch": 1.3723150357995226,
      "grad_norm": 4.231634616851807,
      "learning_rate": 2.7698863636363638e-05,
      "loss": 0.6154,
      "step": 1150
    },
    {
      "epoch": 1.3842482100238662,
      "grad_norm": 7.739737510681152,
      "learning_rate": 2.7495941558441562e-05,
      "loss": 0.5676,
      "step": 1160
    },
    {
      "epoch": 1.39618138424821,
      "grad_norm": 9.585612297058105,
      "learning_rate": 2.7293019480519483e-05,
      "loss": 0.6113,
      "step": 1170
    },
    {
      "epoch": 1.39618138424821,
      "eval_accuracy": 0.6785714285714286,
      "eval_loss": 0.5705173015594482,
      "eval_runtime": 2.4049,
      "eval_samples_per_second": 174.642,
      "eval_steps_per_second": 87.321,
      "step": 1170
    },
    {
      "epoch": 1.4081145584725536,
      "grad_norm": 10.238170623779297,
      "learning_rate": 2.7090097402597404e-05,
      "loss": 0.5694,
      "step": 1180
    },
    {
      "epoch": 1.4200477326968974,
      "grad_norm": 5.449009895324707,
      "learning_rate": 2.6887175324675322e-05,
      "loss": 0.5336,
      "step": 1190
    },
    {
      "epoch": 1.431980906921241,
      "grad_norm": 6.410137176513672,
      "learning_rate": 2.668425324675325e-05,
      "loss": 0.709,
      "step": 1200
    },
    {
      "epoch": 1.431980906921241,
      "eval_accuracy": 0.6928571428571428,
      "eval_loss": 0.5699592232704163,
      "eval_runtime": 2.6398,
      "eval_samples_per_second": 159.102,
      "eval_steps_per_second": 79.551,
      "step": 1200
    },
    {
      "epoch": 1.4439140811455848,
      "grad_norm": 17.243120193481445,
      "learning_rate": 2.648133116883117e-05,
      "loss": 0.6117,
      "step": 1210
    },
    {
      "epoch": 1.4558472553699284,
      "grad_norm": 2.6029751300811768,
      "learning_rate": 2.627840909090909e-05,
      "loss": 0.5584,
      "step": 1220
    },
    {
      "epoch": 1.467780429594272,
      "grad_norm": 7.715820789337158,
      "learning_rate": 2.6075487012987017e-05,
      "loss": 0.4975,
      "step": 1230
    },
    {
      "epoch": 1.467780429594272,
      "eval_accuracy": 0.680952380952381,
      "eval_loss": 0.5647696256637573,
      "eval_runtime": 2.6005,
      "eval_samples_per_second": 161.51,
      "eval_steps_per_second": 80.755,
      "step": 1230
    },
    {
      "epoch": 1.4797136038186158,
      "grad_norm": 4.275643348693848,
      "learning_rate": 2.5872564935064934e-05,
      "loss": 0.4998,
      "step": 1240
    },
    {
      "epoch": 1.4916467780429594,
      "grad_norm": 5.787468433380127,
      "learning_rate": 2.5669642857142855e-05,
      "loss": 0.5196,
      "step": 1250
    },
    {
      "epoch": 1.503579952267303,
      "grad_norm": 8.201250076293945,
      "learning_rate": 2.5466720779220783e-05,
      "loss": 0.4744,
      "step": 1260
    },
    {
      "epoch": 1.503579952267303,
      "eval_accuracy": 0.7047619047619048,
      "eval_loss": 0.5737091302871704,
      "eval_runtime": 2.7806,
      "eval_samples_per_second": 151.045,
      "eval_steps_per_second": 75.522,
      "step": 1260
    },
    {
      "epoch": 1.5155131264916468,
      "grad_norm": 16.53989601135254,
      "learning_rate": 2.52637987012987e-05,
      "loss": 0.5824,
      "step": 1270
    },
    {
      "epoch": 1.5274463007159904,
      "grad_norm": 4.177210807800293,
      "learning_rate": 2.5060876623376622e-05,
      "loss": 0.4096,
      "step": 1280
    },
    {
      "epoch": 1.539379474940334,
      "grad_norm": 7.058141231536865,
      "learning_rate": 2.4857954545454546e-05,
| "loss": 0.4575, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.539379474940334, | |
| "eval_accuracy": 0.7119047619047619, | |
| "eval_loss": 0.5913795828819275, | |
| "eval_runtime": 3.4982, | |
| "eval_samples_per_second": 120.063, | |
| "eval_steps_per_second": 60.031, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.5513126491646778, | |
| "grad_norm": 6.7890849113464355, | |
| "learning_rate": 2.4655032467532467e-05, | |
| "loss": 0.6381, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.5632458233890216, | |
| "grad_norm": 15.614151954650879, | |
| "learning_rate": 2.4452110389610392e-05, | |
| "loss": 0.5837, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.575178997613365, | |
| "grad_norm": 3.2906830310821533, | |
| "learning_rate": 2.4249188311688313e-05, | |
| "loss": 0.5163, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.575178997613365, | |
| "eval_accuracy": 0.7047619047619048, | |
| "eval_loss": 0.5870974659919739, | |
| "eval_runtime": 2.4203, | |
| "eval_samples_per_second": 173.531, | |
| "eval_steps_per_second": 86.766, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.5871121718377088, | |
| "grad_norm": 8.762399673461914, | |
| "learning_rate": 2.4046266233766234e-05, | |
| "loss": 0.423, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.5990453460620526, | |
| "grad_norm": 18.587299346923828, | |
| "learning_rate": 2.384334415584416e-05, | |
| "loss": 0.6604, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.6109785202863962, | |
| "grad_norm": 10.328272819519043, | |
| "learning_rate": 2.364042207792208e-05, | |
| "loss": 0.4858, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.6109785202863962, | |
| "eval_accuracy": 0.6976190476190476, | |
| "eval_loss": 0.5857155323028564, | |
| "eval_runtime": 2.5795, | |
| "eval_samples_per_second": 162.825, | |
| "eval_steps_per_second": 81.412, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.6229116945107398, | |
| "grad_norm": 31.050983428955078, | |
| "learning_rate": 2.34375e-05, | |
| "loss": 0.5269, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.6348448687350836, | |
| "grad_norm": 26.770954132080078, | |
| "learning_rate": 2.3234577922077925e-05, | |
| "loss": 0.7522, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.6467780429594272, | |
| "grad_norm": 16.199596405029297, | |
| "learning_rate": 2.3031655844155846e-05, | |
| "loss": 0.4981, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.6467780429594272, | |
| "eval_accuracy": 0.7071428571428572, | |
| "eval_loss": 0.5910340547561646, | |
| "eval_runtime": 2.4005, | |
| "eval_samples_per_second": 174.965, | |
| "eval_steps_per_second": 87.483, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.6587112171837708, | |
| "grad_norm": 8.824426651000977, | |
| "learning_rate": 2.2828733766233767e-05, | |
| "loss": 0.6491, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.6706443914081146, | |
| "grad_norm": 30.093074798583984, | |
| "learning_rate": 2.262581168831169e-05, | |
| "loss": 0.5058, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.6825775656324582, | |
| "grad_norm": 5.514789581298828, | |
| "learning_rate": 2.242288961038961e-05, | |
| "loss": 0.495, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.6825775656324582, | |
| "eval_accuracy": 0.6976190476190476, | |
| "eval_loss": 0.6180436015129089, | |
| "eval_runtime": 2.4224, | |
| "eval_samples_per_second": 173.381, | |
| "eval_steps_per_second": 86.691, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.6945107398568018, | |
| "grad_norm": 3.9637222290039062, | |
| "learning_rate": 2.2219967532467534e-05, | |
| "loss": 0.4785, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.7064439140811456, | |
| "grad_norm": 23.46233367919922, | |
| "learning_rate": 2.2017045454545458e-05, | |
| "loss": 0.9344, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.7183770883054894, | |
| "grad_norm": 7.6329731941223145, | |
| "learning_rate": 2.1814123376623376e-05, | |
| "loss": 0.6106, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.7183770883054894, | |
| "eval_accuracy": 0.6857142857142857, | |
| "eval_loss": 0.6573019623756409, | |
| "eval_runtime": 2.3562, | |
| "eval_samples_per_second": 178.255, | |
| "eval_steps_per_second": 89.127, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.7303102625298328, | |
| "grad_norm": 2.4250106811523438, | |
| "learning_rate": 2.16112012987013e-05, | |
| "loss": 0.812, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.7422434367541766, | |
| "grad_norm": 4.6379313468933105, | |
| "learning_rate": 2.140827922077922e-05, | |
| "loss": 0.4254, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.7541766109785204, | |
| "grad_norm": 2.969158411026001, | |
| "learning_rate": 2.1205357142857142e-05, | |
| "loss": 0.4755, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.7541766109785204, | |
| "eval_accuracy": 0.6904761904761905, | |
| "eval_loss": 0.6418641805648804, | |
| "eval_runtime": 2.8834, | |
| "eval_samples_per_second": 145.661, | |
| "eval_steps_per_second": 72.831, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.766109785202864, | |
| "grad_norm": 30.389511108398438, | |
| "learning_rate": 2.1002435064935067e-05, | |
| "loss": 0.6588, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.7780429594272076, | |
| "grad_norm": 10.750784873962402, | |
| "learning_rate": 2.0799512987012988e-05, | |
| "loss": 0.7834, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.7899761336515514, | |
| "grad_norm": 26.425033569335938, | |
| "learning_rate": 2.059659090909091e-05, | |
| "loss": 0.6807, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.7899761336515514, | |
| "eval_accuracy": 0.6976190476190476, | |
| "eval_loss": 0.6332749724388123, | |
| "eval_runtime": 2.3964, | |
| "eval_samples_per_second": 175.265, | |
| "eval_steps_per_second": 87.632, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.801909307875895, | |
| "grad_norm": 1.6902508735656738, | |
| "learning_rate": 2.0393668831168833e-05, | |
| "loss": 0.6456, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.8138424821002386, | |
| "grad_norm": 4.968941688537598, | |
| "learning_rate": 2.0190746753246754e-05, | |
| "loss": 0.6374, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.8257756563245824, | |
| "grad_norm": 14.968162536621094, | |
| "learning_rate": 1.9987824675324675e-05, | |
| "loss": 0.4483, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.8257756563245824, | |
| "eval_accuracy": 0.6976190476190476, | |
| "eval_loss": 0.6345822215080261, | |
| "eval_runtime": 2.3274, | |
| "eval_samples_per_second": 180.458, | |
| "eval_steps_per_second": 90.229, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.837708830548926, | |
| "grad_norm": 13.755841255187988, | |
| "learning_rate": 1.97849025974026e-05, | |
| "loss": 0.7141, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.8496420047732696, | |
| "grad_norm": 7.046818256378174, | |
| "learning_rate": 1.958198051948052e-05, | |
| "loss": 0.4692, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.8615751789976134, | |
| "grad_norm": 8.693527221679688, | |
| "learning_rate": 1.9379058441558442e-05, | |
| "loss": 0.618, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.8615751789976134, | |
| "eval_accuracy": 0.7023809523809523, | |
| "eval_loss": 0.57932448387146, | |
| "eval_runtime": 2.6212, | |
| "eval_samples_per_second": 160.233, | |
| "eval_steps_per_second": 80.117, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.8735083532219572, | |
| "grad_norm": 17.399871826171875, | |
| "learning_rate": 1.9176136363636366e-05, | |
| "loss": 0.414, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.8854415274463006, | |
| "grad_norm": 14.802628517150879, | |
| "learning_rate": 1.8973214285714284e-05, | |
| "loss": 0.47, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.8973747016706444, | |
| "grad_norm": 2.645390748977661, | |
| "learning_rate": 1.877029220779221e-05, | |
| "loss": 0.2105, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.8973747016706444, | |
| "eval_accuracy": 0.7166666666666667, | |
| "eval_loss": 0.6054596900939941, | |
| "eval_runtime": 2.5618, | |
| "eval_samples_per_second": 163.946, | |
| "eval_steps_per_second": 81.973, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.9093078758949882, | |
| "grad_norm": 19.810827255249023, | |
| "learning_rate": 1.8567370129870133e-05, | |
| "loss": 0.4894, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.9212410501193318, | |
| "grad_norm": 15.894791603088379, | |
| "learning_rate": 1.836444805194805e-05, | |
| "loss": 0.5804, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.9331742243436754, | |
| "grad_norm": 2.4785335063934326, | |
| "learning_rate": 1.8161525974025975e-05, | |
| "loss": 0.791, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.9331742243436754, | |
| "eval_accuracy": 0.6976190476190476, | |
| "eval_loss": 0.6101633906364441, | |
| "eval_runtime": 2.4503, | |
| "eval_samples_per_second": 171.407, | |
| "eval_steps_per_second": 85.704, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.9451073985680192, | |
| "grad_norm": 1.90475594997406, | |
| "learning_rate": 1.79586038961039e-05, | |
| "loss": 0.3093, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.9570405727923628, | |
| "grad_norm": 1.2917793989181519, | |
| "learning_rate": 1.7755681818181817e-05, | |
| "loss": 0.6421, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.9689737470167064, | |
| "grad_norm": 29.986438751220703, | |
| "learning_rate": 1.7552759740259742e-05, | |
| "loss": 0.5379, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.9689737470167064, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 0.6631202101707458, | |
| "eval_runtime": 3.6446, | |
| "eval_samples_per_second": 115.238, | |
| "eval_steps_per_second": 57.619, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.9809069212410502, | |
| "grad_norm": 7.635002136230469, | |
| "learning_rate": 1.7349837662337663e-05, | |
| "loss": 0.5357, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.9928400954653938, | |
| "grad_norm": 12.643198013305664, | |
| "learning_rate": 1.7146915584415584e-05, | |
| "loss": 0.3115, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.0047732696897373, | |
| "grad_norm": 23.04033088684082, | |
| "learning_rate": 1.694399350649351e-05, | |
| "loss": 0.5175, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.0047732696897373, | |
| "eval_accuracy": 0.7, | |
| "eval_loss": 0.7028768658638, | |
| "eval_runtime": 2.5925, | |
| "eval_samples_per_second": 162.009, | |
| "eval_steps_per_second": 81.004, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.016706443914081, | |
| "grad_norm": 7.8505048751831055, | |
| "learning_rate": 1.674107142857143e-05, | |
| "loss": 0.532, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.028639618138425, | |
| "grad_norm": 16.253393173217773, | |
| "learning_rate": 1.653814935064935e-05, | |
| "loss": 0.565, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.0405727923627683, | |
| "grad_norm": 12.83019733428955, | |
| "learning_rate": 1.6335227272727275e-05, | |
| "loss": 0.84, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.0405727923627683, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 0.6765835881233215, | |
| "eval_runtime": 2.7026, | |
| "eval_samples_per_second": 155.407, | |
| "eval_steps_per_second": 77.703, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.052505966587112, | |
| "grad_norm": 28.117868423461914, | |
| "learning_rate": 1.6132305194805196e-05, | |
| "loss": 0.8001, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.064439140811456, | |
| "grad_norm": 24.78237533569336, | |
| "learning_rate": 1.5929383116883117e-05, | |
| "loss": 0.517, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.0763723150357993, | |
| "grad_norm": 10.502264022827148, | |
| "learning_rate": 1.572646103896104e-05, | |
| "loss": 0.6738, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.0763723150357993, | |
| "eval_accuracy": 0.6976190476190476, | |
| "eval_loss": 0.6353161334991455, | |
| "eval_runtime": 3.0434, | |
| "eval_samples_per_second": 138.005, | |
| "eval_steps_per_second": 69.002, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.088305489260143, | |
| "grad_norm": 7.912200927734375, | |
| "learning_rate": 1.5523538961038963e-05, | |
| "loss": 0.4832, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.100238663484487, | |
| "grad_norm": 2.545396327972412, | |
| "learning_rate": 1.5320616883116884e-05, | |
| "loss": 0.4363, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.1121718377088303, | |
| "grad_norm": 15.671684265136719, | |
| "learning_rate": 1.5117694805194806e-05, | |
| "loss": 0.5378, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.1121718377088303, | |
| "eval_accuracy": 0.7023809523809523, | |
| "eval_loss": 0.6347052454948425, | |
| "eval_runtime": 4.5698, | |
| "eval_samples_per_second": 91.907, | |
| "eval_steps_per_second": 45.954, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.124105011933174, | |
| "grad_norm": 9.285810470581055, | |
| "learning_rate": 1.4914772727272727e-05, | |
| "loss": 0.5598, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.136038186157518, | |
| "grad_norm": 11.331174850463867, | |
| "learning_rate": 1.471185064935065e-05, | |
| "loss": 0.514, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.1479713603818618, | |
| "grad_norm": 5.6925950050354, | |
| "learning_rate": 1.4508928571428573e-05, | |
| "loss": 0.8433, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.1479713603818618, | |
| "eval_accuracy": 0.7047619047619048, | |
| "eval_loss": 0.6480055451393127, | |
| "eval_runtime": 2.2846, | |
| "eval_samples_per_second": 183.84, | |
| "eval_steps_per_second": 91.92, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.159904534606205, | |
| "grad_norm": 1.9029499292373657, | |
| "learning_rate": 1.4306006493506494e-05, | |
| "loss": 0.5241, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.171837708830549, | |
| "grad_norm": 1.1398459672927856, | |
| "learning_rate": 1.4103084415584417e-05, | |
| "loss": 0.4211, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.1837708830548928, | |
| "grad_norm": 47.07643508911133, | |
| "learning_rate": 1.390016233766234e-05, | |
| "loss": 0.5026, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.1837708830548928, | |
| "eval_accuracy": 0.7119047619047619, | |
| "eval_loss": 0.6571480631828308, | |
| "eval_runtime": 2.2313, | |
| "eval_samples_per_second": 188.234, | |
| "eval_steps_per_second": 94.117, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.195704057279236, | |
| "grad_norm": 23.394567489624023, | |
| "learning_rate": 1.369724025974026e-05, | |
| "loss": 0.5248, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.20763723150358, | |
| "grad_norm": 1.195875883102417, | |
| "learning_rate": 1.3494318181818183e-05, | |
| "loss": 0.6541, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.2195704057279237, | |
| "grad_norm": 50.090850830078125, | |
| "learning_rate": 1.3291396103896103e-05, | |
| "loss": 0.5707, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.2195704057279237, | |
| "eval_accuracy": 0.7261904761904762, | |
| "eval_loss": 0.6893291473388672, | |
| "eval_runtime": 2.3215, | |
| "eval_samples_per_second": 180.916, | |
| "eval_steps_per_second": 90.458, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.231503579952267, | |
| "grad_norm": 10.432644844055176, | |
| "learning_rate": 1.3088474025974025e-05, | |
| "loss": 0.5073, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.243436754176611, | |
| "grad_norm": 1.8977324962615967, | |
| "learning_rate": 1.288555194805195e-05, | |
| "loss": 0.4755, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.2553699284009547, | |
| "grad_norm": 37.40773010253906, | |
| "learning_rate": 1.268262987012987e-05, | |
| "loss": 0.5586, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.2553699284009547, | |
| "eval_accuracy": 0.7214285714285714, | |
| "eval_loss": 0.7031128406524658, | |
| "eval_runtime": 2.3304, | |
| "eval_samples_per_second": 180.23, | |
| "eval_steps_per_second": 90.115, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.2673031026252985, | |
| "grad_norm": 0.6878061890602112, | |
| "learning_rate": 1.2479707792207792e-05, | |
| "loss": 0.4098, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.279236276849642, | |
| "grad_norm": 39.32148361206055, | |
| "learning_rate": 1.2276785714285715e-05, | |
| "loss": 0.5517, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.2911694510739857, | |
| "grad_norm": 22.189918518066406, | |
| "learning_rate": 1.2073863636363638e-05, | |
| "loss": 0.6187, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.2911694510739857, | |
| "eval_accuracy": 0.719047619047619, | |
| "eval_loss": 0.7172130942344666, | |
| "eval_runtime": 2.1357, | |
| "eval_samples_per_second": 196.656, | |
| "eval_steps_per_second": 98.328, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.3031026252983295, | |
| "grad_norm": 2.2258763313293457, | |
| "learning_rate": 1.1870941558441559e-05, | |
| "loss": 0.9511, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.315035799522673, | |
| "grad_norm": 9.942549705505371, | |
| "learning_rate": 1.1668019480519481e-05, | |
| "loss": 1.1122, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.3269689737470167, | |
| "grad_norm": 31.399171829223633, | |
| "learning_rate": 1.1465097402597404e-05, | |
| "loss": 0.2809, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.3269689737470167, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 0.676558792591095, | |
| "eval_runtime": 2.3986, | |
| "eval_samples_per_second": 175.103, | |
| "eval_steps_per_second": 87.552, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.3389021479713605, | |
| "grad_norm": 2.8062965869903564, | |
| "learning_rate": 1.1262175324675325e-05, | |
| "loss": 0.2739, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.350835322195704, | |
| "grad_norm": 3.7827553749084473, | |
| "learning_rate": 1.1059253246753246e-05, | |
| "loss": 0.2032, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.3627684964200477, | |
| "grad_norm": 5.71705961227417, | |
| "learning_rate": 1.085633116883117e-05, | |
| "loss": 0.7962, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.3627684964200477, | |
| "eval_accuracy": 0.7214285714285714, | |
| "eval_loss": 0.70233154296875, | |
| "eval_runtime": 2.5312, | |
| "eval_samples_per_second": 165.93, | |
| "eval_steps_per_second": 82.965, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.3747016706443915, | |
| "grad_norm": 44.944766998291016, | |
| "learning_rate": 1.0653409090909092e-05, | |
| "loss": 0.3817, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.386634844868735, | |
| "grad_norm": 4.525506019592285, | |
| "learning_rate": 1.0450487012987013e-05, | |
| "loss": 0.6826, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.3985680190930787, | |
| "grad_norm": 55.05386734008789, | |
| "learning_rate": 1.0247564935064936e-05, | |
| "loss": 0.5505, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.3985680190930787, | |
| "eval_accuracy": 0.7142857142857143, | |
| "eval_loss": 0.6966074109077454, | |
| "eval_runtime": 2.3118, | |
| "eval_samples_per_second": 181.676, | |
| "eval_steps_per_second": 90.838, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.4105011933174225, | |
| "grad_norm": 31.68248748779297, | |
| "learning_rate": 1.0044642857142858e-05, | |
| "loss": 0.7979, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.422434367541766, | |
| "grad_norm": 12.703028678894043, | |
| "learning_rate": 9.84172077922078e-06, | |
| "loss": 0.3713, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.4343675417661097, | |
| "grad_norm": 32.34121322631836, | |
| "learning_rate": 9.638798701298702e-06, | |
| "loss": 0.6046, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.4343675417661097, | |
| "eval_accuracy": 0.7119047619047619, | |
| "eval_loss": 0.7015347480773926, | |
| "eval_runtime": 2.7625, | |
| "eval_samples_per_second": 152.038, | |
| "eval_steps_per_second": 76.019, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.4463007159904535, | |
| "grad_norm": 7.550265789031982, | |
| "learning_rate": 9.435876623376625e-06, | |
| "loss": 0.5042, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.4582338902147973, | |
| "grad_norm": 25.04802703857422, | |
| "learning_rate": 9.232954545454546e-06, | |
| "loss": 0.7355, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.4701670644391407, | |
| "grad_norm": 28.636362075805664, | |
| "learning_rate": 9.030032467532467e-06, | |
| "loss": 0.5901, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.4701670644391407, | |
| "eval_accuracy": 0.7095238095238096, | |
| "eval_loss": 0.6904149651527405, | |
| "eval_runtime": 2.6961, | |
| "eval_samples_per_second": 155.78, | |
| "eval_steps_per_second": 77.89, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.4821002386634845, | |
| "grad_norm": 0.8589219450950623, | |
| "learning_rate": 8.827110389610391e-06, | |
| "loss": 0.4257, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.4940334128878283, | |
| "grad_norm": 19.250465393066406, | |
| "learning_rate": 8.624188311688313e-06, | |
| "loss": 0.4201, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.5059665871121717, | |
| "grad_norm": 0.9123141765594482, | |
| "learning_rate": 8.421266233766234e-06, | |
| "loss": 0.3045, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.5059665871121717, | |
| "eval_accuracy": 0.7095238095238096, | |
| "eval_loss": 0.6907983422279358, | |
| "eval_runtime": 4.0186, | |
| "eval_samples_per_second": 104.514, | |
| "eval_steps_per_second": 52.257, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.5178997613365155, | |
| "grad_norm": 11.066191673278809, | |
| "learning_rate": 8.218344155844156e-06, | |
| "loss": 0.8202, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.5298329355608593, | |
| "grad_norm": 36.017974853515625, | |
| "learning_rate": 8.015422077922079e-06, | |
| "loss": 0.5932, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.541766109785203, | |
| "grad_norm": 52.19096374511719, | |
| "learning_rate": 7.8125e-06, | |
| "loss": 0.6463, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.541766109785203, | |
| "eval_accuracy": 0.7238095238095238, | |
| "eval_loss": 0.690886914730072, | |
| "eval_runtime": 2.3106, | |
| "eval_samples_per_second": 181.768, | |
| "eval_steps_per_second": 90.884, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.5536992840095465, | |
| "grad_norm": 3.9326083660125732, | |
| "learning_rate": 7.609577922077922e-06, | |
| "loss": 0.555, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.5656324582338903, | |
| "grad_norm": 0.7392826676368713, | |
| "learning_rate": 7.406655844155845e-06, | |
| "loss": 0.4957, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.577565632458234, | |
| "grad_norm": 56.22297668457031, | |
| "learning_rate": 7.203733766233767e-06, | |
| "loss": 0.8132, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.577565632458234, | |
| "eval_accuracy": 0.7261904761904762, | |
| "eval_loss": 0.6950424909591675, | |
| "eval_runtime": 2.2785, | |
| "eval_samples_per_second": 184.335, | |
| "eval_steps_per_second": 92.167, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.5894988066825775, | |
| "grad_norm": 35.17913818359375, | |
| "learning_rate": 7.000811688311689e-06, | |
| "loss": 0.1858, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.6014319809069213, | |
| "grad_norm": 14.962797164916992, | |
| "learning_rate": 6.79788961038961e-06, | |
| "loss": 0.7816, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.613365155131265, | |
| "grad_norm": 12.208561897277832, | |
| "learning_rate": 6.594967532467533e-06, | |
| "loss": 0.2369, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.613365155131265, | |
| "eval_accuracy": 0.7071428571428572, | |
| "eval_loss": 0.7078821063041687, | |
| "eval_runtime": 3.8631, | |
| "eval_samples_per_second": 108.721, | |
| "eval_steps_per_second": 54.36, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.6252983293556085, | |
| "grad_norm": 24.15699005126953, | |
| "learning_rate": 6.392045454545454e-06, | |
| "loss": 0.4773, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.6372315035799523, | |
| "grad_norm": 39.538246154785156, | |
| "learning_rate": 6.189123376623377e-06, | |
| "loss": 1.0734, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.649164677804296, | |
| "grad_norm": 4.4763078689575195, | |
| "learning_rate": 5.986201298701299e-06, | |
| "loss": 0.9035, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.649164677804296, | |
| "eval_accuracy": 0.7095238095238096, | |
| "eval_loss": 0.7065214514732361, | |
| "eval_runtime": 2.3446, | |
| "eval_samples_per_second": 179.131, | |
| "eval_steps_per_second": 89.566, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.6610978520286395, | |
| "grad_norm": 77.67285919189453, | |
| "learning_rate": 5.783279220779221e-06, | |
| "loss": 0.441, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.6730310262529833, | |
| "grad_norm": 33.727378845214844, | |
| "learning_rate": 5.580357142857144e-06, | |
| "loss": 0.8442, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.684964200477327, | |
| "grad_norm": 35.148902893066406, | |
| "learning_rate": 5.377435064935065e-06, | |
| "loss": 0.7039, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.684964200477327, | |
| "eval_accuracy": 0.7119047619047619, | |
| "eval_loss": 0.7021663188934326, | |
| "eval_runtime": 2.3192, | |
| "eval_samples_per_second": 181.1, | |
| "eval_steps_per_second": 90.55, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.6968973747016705, | |
| "grad_norm": 5.653315544128418, | |
| "learning_rate": 5.1745129870129875e-06, | |
| "loss": 0.4918, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.7088305489260143, | |
| "grad_norm": 8.575183868408203, | |
| "learning_rate": 4.9715909090909094e-06, | |
| "loss": 0.7543, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.720763723150358, | |
| "grad_norm": 2.2537381649017334, | |
| "learning_rate": 4.768668831168831e-06, | |
| "loss": 0.5493, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.720763723150358, | |
| "eval_accuracy": 0.7071428571428572, | |
| "eval_loss": 0.7015241384506226, | |
| "eval_runtime": 2.3982, | |
| "eval_samples_per_second": 175.129, | |
| "eval_steps_per_second": 87.564, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.7326968973747015, | |
| "grad_norm": 1.549895167350769, | |
| "learning_rate": 4.565746753246754e-06, | |
| "loss": 0.5912, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.7446300715990453, | |
| "grad_norm": 12.03734016418457, | |
| "learning_rate": 4.362824675324675e-06, | |
| "loss": 0.737, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.756563245823389, | |
| "grad_norm": 41.12443923950195, | |
| "learning_rate": 4.159902597402598e-06, | |
| "loss": 0.7036, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.756563245823389, | |
| "eval_accuracy": 0.7071428571428572, | |
| "eval_loss": 0.7000806331634521, | |
| "eval_runtime": 2.3423, | |
| "eval_samples_per_second": 179.309, | |
| "eval_steps_per_second": 89.655, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.7684964200477324, | |
| "grad_norm": 8.583761215209961, | |
| "learning_rate": 3.95698051948052e-06, | |
| "loss": 0.5139, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.7804295942720763, | |
| "grad_norm": 10.022917747497559, | |
| "learning_rate": 3.7540584415584417e-06, | |
| "loss": 0.6057, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.79236276849642, | |
| "grad_norm": 3.399099588394165, | |
| "learning_rate": 3.551136363636364e-06, | |
| "loss": 0.2663, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.79236276849642, | |
| "eval_accuracy": 0.7095238095238096, | |
| "eval_loss": 0.700248658657074, | |
| "eval_runtime": 2.2158, | |
| "eval_samples_per_second": 189.552, | |
| "eval_steps_per_second": 94.776, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.804295942720764, | |
| "grad_norm": 7.17982292175293, | |
| "learning_rate": 3.348214285714286e-06, | |
| "loss": 0.3956, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.8162291169451072, | |
| "grad_norm": 16.822969436645508, | |
| "learning_rate": 3.1452922077922083e-06, | |
| "loss": 0.7688, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.828162291169451, | |
| "grad_norm": 34.44697952270508, | |
| "learning_rate": 2.94237012987013e-06, | |
| "loss": 0.7124, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.828162291169451, | |
| "eval_accuracy": 0.7071428571428572, | |
| "eval_loss": 0.704397439956665, | |
| "eval_runtime": 2.2782, | |
| "eval_samples_per_second": 184.357, | |
| "eval_steps_per_second": 92.178, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.840095465393795, | |
| "grad_norm": 45.26344680786133, | |
| "learning_rate": 2.739448051948052e-06, | |
| "loss": 0.5108, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.8520286396181387, | |
| "grad_norm": 1.225951075553894, | |
| "learning_rate": 2.536525974025974e-06, | |
| "loss": 0.2401, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.863961813842482, | |
| "grad_norm": 41.542396545410156, | |
| "learning_rate": 2.333603896103896e-06, | |
| "loss": 0.6387, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.863961813842482, | |
| "eval_accuracy": 0.7071428571428572, | |
| "eval_loss": 0.7078101634979248, | |
| "eval_runtime": 2.3861, | |
| "eval_samples_per_second": 176.023, | |
| "eval_steps_per_second": 88.011, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.875894988066826, | |
| "grad_norm": 1.9556139707565308, | |
| "learning_rate": 2.1306818181818183e-06, | |
| "loss": 0.2796, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.8878281622911697, | |
| "grad_norm": 0.7640268802642822, | |
| "learning_rate": 1.9277597402597406e-06, | |
| "loss": 0.6271, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.899761336515513, | |
| "grad_norm": 0.8632296919822693, | |
| "learning_rate": 1.7248376623376625e-06, | |
| "loss": 0.4763, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.899761336515513, | |
| "eval_accuracy": 0.7095238095238096, | |
| "eval_loss": 0.708879292011261, | |
| "eval_runtime": 2.5831, | |
| "eval_samples_per_second": 162.593, | |
| "eval_steps_per_second": 81.297, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.911694510739857, | |
| "grad_norm": 32.62404251098633, | |
| "learning_rate": 1.5219155844155844e-06, | |
| "loss": 0.596, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.9236276849642007, | |
| "grad_norm": 29.240238189697266, | |
| "learning_rate": 1.3189935064935065e-06, | |
| "loss": 0.8641, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.935560859188544, | |
| "grad_norm": 37.549339294433594, | |
| "learning_rate": 1.1160714285714287e-06, | |
| "loss": 1.1935, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.935560859188544, | |
| "eval_accuracy": 0.7119047619047619, | |
| "eval_loss": 0.710150420665741, | |
| "eval_runtime": 2.4165, | |
| "eval_samples_per_second": 173.806, | |
| "eval_steps_per_second": 86.903, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.947494033412888, | |
| "grad_norm": 46.25457763671875, | |
| "learning_rate": 9.131493506493507e-07, | |
| "loss": 0.7683, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.9594272076372317, | |
| "grad_norm": 16.143142700195312, | |
| "learning_rate": 7.102272727272728e-07, | |
| "loss": 0.2425, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.971360381861575, | |
| "grad_norm": 1.7046856880187988, | |
| "learning_rate": 5.073051948051948e-07, | |
| "loss": 0.5129, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.971360381861575, | |
| "eval_accuracy": 0.7119047619047619, | |
| "eval_loss": 0.7110002636909485, | |
| "eval_runtime": 2.7123, | |
| "eval_samples_per_second": 154.848, | |
| "eval_steps_per_second": 77.424, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.983293556085919, | |
| "grad_norm": 0.6385570168495178, | |
| "learning_rate": 3.043831168831169e-07, | |
| "loss": 0.3595, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.9952267303102627, | |
| "grad_norm": 8.002150535583496, | |
| "learning_rate": 1.0146103896103895e-07, | |
| "loss": 0.4435, | |
| "step": 2510 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2514, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 90, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 798501104640.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |