{ "best_global_step": 1860, "best_metric": 0.7261904761904762, "best_model_checkpoint": "/www/wwwroot/ai_project/model/checkpoint-1260", "epoch": 3.0, "eval_steps": 30, "global_step": 2514, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011933174224343675, "grad_norm": 7.105306148529053, "learning_rate": 9e-06, "loss": 0.6856, "step": 10 }, { "epoch": 0.02386634844868735, "grad_norm": 3.5623154640197754, "learning_rate": 1.9e-05, "loss": 0.6737, "step": 20 }, { "epoch": 0.03579952267303103, "grad_norm": 8.588749885559082, "learning_rate": 2.9e-05, "loss": 0.7264, "step": 30 }, { "epoch": 0.03579952267303103, "eval_accuracy": 0.6285714285714286, "eval_loss": 0.680474042892456, "eval_runtime": 2.9486, "eval_samples_per_second": 142.438, "eval_steps_per_second": 71.219, "step": 30 }, { "epoch": 0.0477326968973747, "grad_norm": 7.816359996795654, "learning_rate": 3.9000000000000006e-05, "loss": 0.7096, "step": 40 }, { "epoch": 0.059665871121718374, "grad_norm": 7.00941276550293, "learning_rate": 4.9e-05, "loss": 0.669, "step": 50 }, { "epoch": 0.07159904534606205, "grad_norm": 5.085658073425293, "learning_rate": 4.9817370129870134e-05, "loss": 0.6139, "step": 60 }, { "epoch": 0.07159904534606205, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6627190113067627, "eval_runtime": 2.8882, "eval_samples_per_second": 145.422, "eval_steps_per_second": 72.711, "step": 60 }, { "epoch": 0.08353221957040573, "grad_norm": 4.888000011444092, "learning_rate": 4.961444805194805e-05, "loss": 0.5252, "step": 70 }, { "epoch": 0.0954653937947494, "grad_norm": 9.089728355407715, "learning_rate": 4.9411525974025976e-05, "loss": 0.7837, "step": 80 }, { "epoch": 0.10739856801909307, "grad_norm": 3.44459867477417, "learning_rate": 4.92086038961039e-05, "loss": 0.6985, "step": 90 }, { "epoch": 0.10739856801909307, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6658735275268555, "eval_runtime": 2.5125, "eval_samples_per_second": 167.162, "eval_steps_per_second": 83.581, "step": 90 }, { "epoch": 0.11933174224343675, "grad_norm": 3.5227367877960205, "learning_rate": 4.900568181818182e-05, "loss": 0.6999, "step": 100 }, { "epoch": 0.13126491646778043, "grad_norm": 8.138216972351074, "learning_rate": 4.880275974025974e-05, "loss": 0.8203, "step": 110 }, { "epoch": 0.1431980906921241, "grad_norm": 3.1432907581329346, "learning_rate": 4.859983766233767e-05, "loss": 0.6327, "step": 120 }, { "epoch": 0.1431980906921241, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6644813418388367, "eval_runtime": 2.9572, "eval_samples_per_second": 142.025, "eval_steps_per_second": 71.013, "step": 120 }, { "epoch": 0.15513126491646778, "grad_norm": 2.8620526790618896, "learning_rate": 4.8396915584415585e-05, "loss": 0.6161, "step": 130 }, { "epoch": 0.16706443914081145, "grad_norm": 3.9090893268585205, "learning_rate": 4.819399350649351e-05, "loss": 0.6733, "step": 140 }, { "epoch": 0.17899761336515513, "grad_norm": 8.569962501525879, "learning_rate": 4.7991071428571433e-05, "loss": 0.6391, "step": 150 }, { "epoch": 0.17899761336515513, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6607769131660461, "eval_runtime": 3.3316, "eval_samples_per_second": 126.065, "eval_steps_per_second": 63.033, "step": 150 }, { "epoch": 0.1909307875894988, "grad_norm": 5.559383392333984, "learning_rate": 4.778814935064935e-05, "loss": 0.6183, "step": 160 }, { "epoch": 0.20286396181384247, "grad_norm": 3.5374624729156494, "learning_rate": 4.7585227272727276e-05, "loss": 0.6068, "step": 170 }, { "epoch": 0.21479713603818615, "grad_norm": 5.332399368286133, "learning_rate": 4.73823051948052e-05, "loss": 0.5847, "step": 180 }, { "epoch": 0.21479713603818615, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6628832817077637, "eval_runtime": 3.3183, "eval_samples_per_second": 126.57, "eval_steps_per_second": 63.285, "step": 180 }, { "epoch": 0.22673031026252982, "grad_norm": 4.872141361236572, "learning_rate": 4.717938311688312e-05, "loss": 0.6055, "step": 190 }, { "epoch": 0.2386634844868735, "grad_norm": 4.306164264678955, "learning_rate": 4.697646103896104e-05, "loss": 0.6895, "step": 200 }, { "epoch": 0.25059665871121717, "grad_norm": 3.5854930877685547, "learning_rate": 4.6773538961038967e-05, "loss": 0.693, "step": 210 }, { "epoch": 0.25059665871121717, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6654404401779175, "eval_runtime": 2.7449, "eval_samples_per_second": 153.01, "eval_steps_per_second": 76.505, "step": 210 }, { "epoch": 0.26252983293556087, "grad_norm": 5.1247782707214355, "learning_rate": 4.6570616883116884e-05, "loss": 0.7442, "step": 220 }, { "epoch": 0.2744630071599045, "grad_norm": 3.291513681411743, "learning_rate": 4.636769480519481e-05, "loss": 0.6717, "step": 230 }, { "epoch": 0.2863961813842482, "grad_norm": 4.713298797607422, "learning_rate": 4.616477272727273e-05, "loss": 0.65, "step": 240 }, { "epoch": 0.2863961813842482, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.659385621547699, "eval_runtime": 2.8781, "eval_samples_per_second": 145.928, "eval_steps_per_second": 72.964, "step": 240 }, { "epoch": 0.29832935560859186, "grad_norm": 2.7979941368103027, "learning_rate": 4.596185064935065e-05, "loss": 0.65, "step": 250 }, { "epoch": 0.31026252983293556, "grad_norm": 2.7257330417633057, "learning_rate": 4.5758928571428575e-05, "loss": 0.5902, "step": 260 }, { "epoch": 0.3221957040572792, "grad_norm": 3.3188984394073486, "learning_rate": 4.55560064935065e-05, "loss": 0.5494, "step": 270 }, { "epoch": 0.3221957040572792, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6594758629798889, "eval_runtime": 3.3768, "eval_samples_per_second": 124.379, "eval_steps_per_second": 62.189, "step": 270 }, { "epoch": 0.3341288782816229, "grad_norm": 5.446512222290039, "learning_rate": 4.535308441558442e-05, "loss": 0.7618, "step": 280 }, { "epoch": 0.3460620525059666, "grad_norm": 5.833828926086426, "learning_rate": 4.5150162337662335e-05, "loss": 0.6402, "step": 290 }, { "epoch": 0.35799522673031026, "grad_norm": 3.265965223312378, "learning_rate": 4.4947240259740266e-05, "loss": 0.5794, "step": 300 }, { "epoch": 0.35799522673031026, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6596417427062988, "eval_runtime": 2.8478, "eval_samples_per_second": 147.481, "eval_steps_per_second": 73.741, "step": 300 }, { "epoch": 0.36992840095465396, "grad_norm": 3.3324408531188965, "learning_rate": 4.4744318181818184e-05, "loss": 0.6289, "step": 310 }, { "epoch": 0.3818615751789976, "grad_norm": 3.396897315979004, "learning_rate": 4.45413961038961e-05, "loss": 0.7469, "step": 320 }, { "epoch": 0.3937947494033413, "grad_norm": 5.416648864746094, "learning_rate": 4.433847402597403e-05, "loss": 0.6168, "step": 330 }, { "epoch": 0.3937947494033413, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.660291314125061, "eval_runtime": 2.7128, "eval_samples_per_second": 154.824, "eval_steps_per_second": 77.412, "step": 330 }, { "epoch": 0.40572792362768495, "grad_norm": 3.854402542114258, "learning_rate": 4.413555194805195e-05, "loss": 0.5962, "step": 340 }, { "epoch": 0.41766109785202865, "grad_norm": 3.4200260639190674, "learning_rate": 4.393262987012987e-05, "loss": 0.6299, "step": 350 }, { "epoch": 0.4295942720763723, "grad_norm": 4.237409591674805, "learning_rate": 4.37297077922078e-05, "loss": 0.7804, "step": 360 }, { "epoch": 0.4295942720763723, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6622642874717712, "eval_runtime": 2.8864, "eval_samples_per_second": 145.51, "eval_steps_per_second": 72.755, "step": 360 }, { "epoch": 0.441527446300716, "grad_norm": 4.508576393127441, "learning_rate": 4.352678571428572e-05, "loss": 0.7417, "step": 370 }, { "epoch": 0.45346062052505964, "grad_norm": 2.8814594745635986, "learning_rate": 4.3323863636363635e-05, "loss": 0.6707, "step": 380 }, { "epoch": 0.46539379474940334, "grad_norm": 6.597533702850342, "learning_rate": 4.3120941558441566e-05, "loss": 0.5901, "step": 390 }, { "epoch": 0.46539379474940334, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6637502312660217, "eval_runtime": 3.1072, "eval_samples_per_second": 135.172, "eval_steps_per_second": 67.586, "step": 390 }, { "epoch": 0.477326968973747, "grad_norm": 5.29579496383667, "learning_rate": 4.2918019480519484e-05, "loss": 0.701, "step": 400 }, { "epoch": 0.4892601431980907, "grad_norm": 3.737311601638794, "learning_rate": 4.27150974025974e-05, "loss": 0.5864, "step": 410 }, { "epoch": 0.5011933174224343, "grad_norm": 4.80122184753418, "learning_rate": 4.2512175324675326e-05, "loss": 0.6265, "step": 420 }, { "epoch": 0.5011933174224343, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.663756251335144, "eval_runtime": 3.0077, "eval_samples_per_second": 139.639, "eval_steps_per_second": 69.82, "step": 420 }, { "epoch": 0.513126491646778, "grad_norm": 3.392441749572754, "learning_rate": 4.230925324675325e-05, "loss": 0.7258, "step": 430 }, { "epoch": 0.5250596658711217, "grad_norm": 2.653325319290161, "learning_rate": 4.210633116883117e-05, "loss": 0.7126, "step": 440 }, { "epoch": 0.5369928400954654, "grad_norm": 2.605800151824951, "learning_rate": 4.190340909090909e-05, "loss": 0.6815, "step": 450 }, { "epoch": 0.5369928400954654, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6631113886833191, "eval_runtime": 3.0366, "eval_samples_per_second": 138.313, "eval_steps_per_second": 69.157, "step": 450 }, { "epoch": 0.548926014319809, "grad_norm": 5.945620536804199, "learning_rate": 4.170048701298702e-05, "loss": 0.6262, "step": 460 }, { "epoch": 0.5608591885441527, "grad_norm": 4.985898494720459, "learning_rate": 4.1497564935064935e-05, "loss": 0.6651, "step": 470 }, { "epoch": 0.5727923627684964, "grad_norm": 2.658013343811035, "learning_rate": 4.129464285714286e-05, "loss": 0.6781, "step": 480 }, { "epoch": 0.5727923627684964, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6566287875175476, "eval_runtime": 3.8874, "eval_samples_per_second": 108.042, "eval_steps_per_second": 54.021, "step": 480 }, { "epoch": 0.5847255369928401, "grad_norm": 7.678948879241943, "learning_rate": 4.1091720779220783e-05, "loss": 0.7041, "step": 490 }, { "epoch": 0.5966587112171837, "grad_norm": 2.540349006652832, "learning_rate": 4.08887987012987e-05, "loss": 0.6513, "step": 500 }, { "epoch": 0.6085918854415274, "grad_norm": 2.527493715286255, "learning_rate": 4.0685876623376626e-05, "loss": 0.6985, "step": 510 }, { "epoch": 0.6085918854415274, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6595126986503601, "eval_runtime": 3.3607, "eval_samples_per_second": 124.974, "eval_steps_per_second": 62.487, "step": 510 }, { "epoch": 0.6205250596658711, "grad_norm": 5.219762802124023, "learning_rate": 4.048295454545455e-05, "loss": 0.6298, "step": 520 }, { "epoch": 0.6324582338902148, "grad_norm": 5.685338020324707, "learning_rate": 4.028003246753247e-05, "loss": 0.6654, "step": 530 }, { "epoch": 0.6443914081145584, "grad_norm": 5.298613548278809, "learning_rate": 4.007711038961039e-05, "loss": 0.6984, "step": 540 }, { "epoch": 0.6443914081145584, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6536137461662292, "eval_runtime": 4.863, "eval_samples_per_second": 86.366, "eval_steps_per_second": 43.183, "step": 540 }, { "epoch": 0.6563245823389021, "grad_norm": 4.052762508392334, "learning_rate": 3.9874188311688317e-05, "loss": 0.6984, "step": 550 }, { "epoch": 0.6682577565632458, "grad_norm": 8.290848731994629, "learning_rate": 3.9671266233766234e-05, "loss": 0.6689, "step": 560 }, { "epoch": 0.6801909307875895, "grad_norm": 2.341036081314087, "learning_rate": 3.946834415584416e-05, "loss": 0.661, "step": 570 }, { "epoch": 0.6801909307875895, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6538602709770203, "eval_runtime": 2.9233, "eval_samples_per_second": 143.672, "eval_steps_per_second": 71.836, "step": 570 }, { "epoch": 0.6921241050119332, "grad_norm": 4.4188737869262695, "learning_rate": 3.926542207792208e-05, "loss": 0.6026, "step": 580 }, { "epoch": 0.7040572792362768, "grad_norm": 4.814696788787842, "learning_rate": 3.90625e-05, "loss": 0.6565, "step": 590 }, { "epoch": 0.7159904534606205, "grad_norm": 5.101158142089844, "learning_rate": 3.8859577922077925e-05, "loss": 0.5595, "step": 600 }, { "epoch": 0.7159904534606205, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6548095941543579, "eval_runtime": 3.2011, "eval_samples_per_second": 131.206, "eval_steps_per_second": 65.603, "step": 600 }, { "epoch": 0.7279236276849642, "grad_norm": 3.732052803039551, "learning_rate": 3.865665584415585e-05, "loss": 0.7697, "step": 610 }, { "epoch": 0.7398568019093079, "grad_norm": 6.07219934463501, "learning_rate": 3.845373376623377e-05, "loss": 0.674, "step": 620 }, { "epoch": 0.7517899761336515, "grad_norm": 3.234180212020874, "learning_rate": 3.825081168831169e-05, "loss": 0.638, "step": 630 }, { "epoch": 0.7517899761336515, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.647255003452301, "eval_runtime": 3.0145, "eval_samples_per_second": 139.325, "eval_steps_per_second": 69.663, "step": 630 }, { "epoch": 0.7637231503579952, "grad_norm": 4.247595310211182, "learning_rate": 3.8047889610389616e-05, "loss": 0.6979, "step": 640 }, { "epoch": 0.7756563245823389, "grad_norm": 6.84116268157959, "learning_rate": 3.7844967532467534e-05, "loss": 0.6084, "step": 650 }, { "epoch": 0.7875894988066826, "grad_norm": 5.435266017913818, "learning_rate": 3.764204545454545e-05, "loss": 0.6514, "step": 660 }, { "epoch": 0.7875894988066826, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6437537670135498, "eval_runtime": 2.3127, "eval_samples_per_second": 181.608, "eval_steps_per_second": 90.804, "step": 660 }, { "epoch": 0.7995226730310262, "grad_norm": 2.817640781402588, "learning_rate": 3.743912337662338e-05, "loss": 0.696, "step": 670 }, { "epoch": 0.8114558472553699, "grad_norm": 2.5305979251861572, "learning_rate": 3.72362012987013e-05, "loss": 0.723, "step": 680 }, { "epoch": 0.8233890214797136, "grad_norm": 8.556059837341309, "learning_rate": 3.703327922077922e-05, "loss": 0.661, "step": 690 }, { "epoch": 0.8233890214797136, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.644755482673645, "eval_runtime": 2.2808, "eval_samples_per_second": 184.148, "eval_steps_per_second": 92.074, "step": 690 }, { "epoch": 0.8353221957040573, "grad_norm": 2.903482675552368, "learning_rate": 3.683035714285715e-05, "loss": 0.6023, "step": 700 }, { "epoch": 0.847255369928401, "grad_norm": 4.76421594619751, "learning_rate": 3.662743506493507e-05, "loss": 0.628, "step": 710 }, { "epoch": 0.8591885441527446, "grad_norm": 4.305855751037598, "learning_rate": 3.6424512987012985e-05, "loss": 0.6648, "step": 720 }, { "epoch": 0.8591885441527446, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6386857032775879, "eval_runtime": 2.3714, "eval_samples_per_second": 177.11, "eval_steps_per_second": 88.555, "step": 720 }, { "epoch": 0.8711217183770883, "grad_norm": 6.615411758422852, "learning_rate": 3.6221590909090916e-05, "loss": 0.6341, "step": 730 }, { "epoch": 0.883054892601432, "grad_norm": 6.593195915222168, "learning_rate": 3.6018668831168834e-05, "loss": 0.6311, "step": 740 }, { "epoch": 0.8949880668257757, "grad_norm": 4.931580543518066, "learning_rate": 3.581574675324675e-05, "loss": 0.56, "step": 750 }, { "epoch": 0.8949880668257757, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6398793458938599, "eval_runtime": 2.3361, "eval_samples_per_second": 179.79, "eval_steps_per_second": 89.895, "step": 750 }, { "epoch": 0.9069212410501193, "grad_norm": 3.87886118888855, "learning_rate": 3.561282467532468e-05, "loss": 0.6329, "step": 760 }, { "epoch": 0.918854415274463, "grad_norm": 3.161126136779785, "learning_rate": 3.54099025974026e-05, "loss": 0.7029, "step": 770 }, { "epoch": 0.9307875894988067, "grad_norm": 7.052578449249268, "learning_rate": 3.520698051948052e-05, "loss": 0.6108, "step": 780 }, { "epoch": 0.9307875894988067, "eval_accuracy": 0.6523809523809524, "eval_loss": 0.6286986470222473, "eval_runtime": 2.6937, "eval_samples_per_second": 155.92, "eval_steps_per_second": 77.96, "step": 780 }, { "epoch": 0.9427207637231504, "grad_norm": 14.41779899597168, "learning_rate": 3.500405844155844e-05, "loss": 0.6523, "step": 790 }, { "epoch": 0.954653937947494, "grad_norm": 4.530862331390381, "learning_rate": 3.480113636363637e-05, "loss": 0.6565, "step": 800 }, { "epoch": 0.9665871121718377, "grad_norm": 3.9421231746673584, "learning_rate": 3.4598214285714284e-05, "loss": 0.5472, "step": 810 }, { "epoch": 0.9665871121718377, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6442738771438599, "eval_runtime": 2.525, "eval_samples_per_second": 166.334, "eval_steps_per_second": 83.167, "step": 810 }, { "epoch": 0.9785202863961814, "grad_norm": 3.9132273197174072, "learning_rate": 3.439529220779221e-05, "loss": 0.5797, "step": 820 }, { "epoch": 0.9904534606205251, "grad_norm": 4.5334086418151855, "learning_rate": 3.4192370129870133e-05, "loss": 0.7948, "step": 830 }, { "epoch": 1.0023866348448687, "grad_norm": 3.76124906539917, "learning_rate": 3.398944805194805e-05, "loss": 0.5284, "step": 840 }, { "epoch": 1.0023866348448687, "eval_accuracy": 0.6238095238095238, "eval_loss": 0.6406013369560242, "eval_runtime": 2.4016, "eval_samples_per_second": 174.884, "eval_steps_per_second": 87.442, "step": 840 }, { "epoch": 1.0143198090692125, "grad_norm": 10.1142578125, "learning_rate": 3.3786525974025976e-05, "loss": 0.8632, "step": 850 }, { "epoch": 1.026252983293556, "grad_norm": 5.389456272125244, "learning_rate": 3.35836038961039e-05, "loss": 0.5965, "step": 860 }, { "epoch": 1.0381861575178997, "grad_norm": 5.022064685821533, "learning_rate": 3.338068181818182e-05, "loss": 0.5518, "step": 870 }, { "epoch": 1.0381861575178997, "eval_accuracy": 0.6214285714285714, "eval_loss": 0.6387470960617065, "eval_runtime": 2.4702, "eval_samples_per_second": 170.027, "eval_steps_per_second": 85.013, "step": 870 }, { "epoch": 1.0501193317422435, "grad_norm": 3.8693058490753174, "learning_rate": 3.317775974025974e-05, "loss": 0.6257, "step": 880 }, { "epoch": 1.062052505966587, "grad_norm": 6.848055839538574, "learning_rate": 3.2974837662337667e-05, "loss": 0.7654, "step": 890 }, { "epoch": 1.0739856801909309, "grad_norm": 6.271612644195557, "learning_rate": 3.2771915584415584e-05, "loss": 0.5958, "step": 900 }, { "epoch": 1.0739856801909309, "eval_accuracy": 0.6619047619047619, "eval_loss": 0.6196722388267517, "eval_runtime": 2.5711, "eval_samples_per_second": 163.357, "eval_steps_per_second": 81.678, "step": 900 }, { "epoch": 1.0859188544152745, "grad_norm": 5.1813764572143555, "learning_rate": 3.256899350649351e-05, "loss": 0.6688, "step": 910 }, { "epoch": 1.097852028639618, "grad_norm": 8.188650131225586, "learning_rate": 3.236607142857143e-05, "loss": 0.5075, "step": 920 }, { "epoch": 1.1097852028639619, "grad_norm": 5.9253249168396, "learning_rate": 3.216314935064935e-05, "loss": 0.5268, "step": 930 }, { "epoch": 1.1097852028639619, "eval_accuracy": 0.6547619047619048, "eval_loss": 0.6201965808868408, "eval_runtime": 2.6071, "eval_samples_per_second": 161.097, "eval_steps_per_second": 80.549, "step": 930 }, { "epoch": 1.1217183770883055, "grad_norm": 8.462143898010254, "learning_rate": 3.1960227272727275e-05, "loss": 0.6805, "step": 940 }, { "epoch": 1.1336515513126493, "grad_norm": 3.664438247680664, "learning_rate": 3.17573051948052e-05, "loss": 0.5489, "step": 950 }, { "epoch": 1.1455847255369929, "grad_norm": 3.6083319187164307, "learning_rate": 3.155438311688312e-05, "loss": 0.5184, "step": 960 }, { "epoch": 1.1455847255369929, "eval_accuracy": 0.638095238095238, "eval_loss": 0.6298591494560242, "eval_runtime": 2.2121, "eval_samples_per_second": 189.863, "eval_steps_per_second": 94.932, "step": 960 }, { "epoch": 1.1575178997613365, "grad_norm": 4.299609184265137, "learning_rate": 3.135146103896104e-05, "loss": 0.5752, "step": 970 }, { "epoch": 1.1694510739856803, "grad_norm": 4.7070183753967285, "learning_rate": 3.1148538961038966e-05, "loss": 0.6226, "step": 980 }, { "epoch": 1.1813842482100239, "grad_norm": 4.521914005279541, "learning_rate": 3.0945616883116884e-05, "loss": 0.5337, "step": 990 }, { "epoch": 1.1813842482100239, "eval_accuracy": 0.6428571428571429, "eval_loss": 0.6394156217575073, "eval_runtime": 2.1577, "eval_samples_per_second": 194.651, "eval_steps_per_second": 97.325, "step": 990 }, { "epoch": 1.1933174224343674, "grad_norm": 5.360909461975098, "learning_rate": 3.07426948051948e-05, "loss": 0.7913, "step": 1000 }, { "epoch": 1.2052505966587113, "grad_norm": 6.7155585289001465, "learning_rate": 3.053977272727273e-05, "loss": 0.6489, "step": 1010 }, { "epoch": 1.2171837708830548, "grad_norm": 7.668753147125244, "learning_rate": 3.033685064935065e-05, "loss": 0.5592, "step": 1020 }, { "epoch": 1.2171837708830548, "eval_accuracy": 0.6714285714285714, "eval_loss": 0.6178216934204102, "eval_runtime": 2.4819, "eval_samples_per_second": 169.226, "eval_steps_per_second": 84.613, "step": 1020 }, { "epoch": 1.2291169451073987, "grad_norm": 3.813314914703369, "learning_rate": 3.013392857142857e-05, "loss": 0.5319, "step": 1030 }, { "epoch": 1.2410501193317423, "grad_norm": 8.704251289367676, "learning_rate": 2.9931006493506496e-05, "loss": 0.6233, "step": 1040 }, { "epoch": 1.2529832935560858, "grad_norm": 10.739197731018066, "learning_rate": 2.9728084415584417e-05, "loss": 0.6285, "step": 1050 }, { "epoch": 1.2529832935560858, "eval_accuracy": 0.6761904761904762, "eval_loss": 0.6063261032104492, "eval_runtime": 2.5831, "eval_samples_per_second": 162.595, "eval_steps_per_second": 81.298, "step": 1050 }, { "epoch": 1.2649164677804297, "grad_norm": 8.07175064086914, "learning_rate": 2.9525162337662338e-05, "loss": 0.5288, "step": 1060 }, { "epoch": 1.2768496420047732, "grad_norm": 11.14047622680664, "learning_rate": 2.9322240259740263e-05, "loss": 0.6402, "step": 1070 }, { "epoch": 1.288782816229117, "grad_norm": 17.40794563293457, "learning_rate": 2.9119318181818184e-05, "loss": 0.6681, "step": 1080 }, { "epoch": 1.288782816229117, "eval_accuracy": 0.6738095238095239, "eval_loss": 0.6034413576126099, "eval_runtime": 2.1352, "eval_samples_per_second": 196.701, "eval_steps_per_second": 98.351, "step": 1080 }, { "epoch": 1.3007159904534606, "grad_norm": 6.053748607635498, "learning_rate": 2.8916396103896105e-05, "loss": 0.6375, "step": 1090 }, { "epoch": 1.3126491646778042, "grad_norm": 3.6662399768829346, "learning_rate": 2.871347402597403e-05, "loss": 0.4657, "step": 1100 }, { "epoch": 1.324582338902148, "grad_norm": 7.520589351654053, "learning_rate": 2.851055194805195e-05, "loss": 0.6052, "step": 1110 }, { "epoch": 1.324582338902148, "eval_accuracy": 0.680952380952381, "eval_loss": 0.589663028717041, "eval_runtime": 2.3348, "eval_samples_per_second": 179.884, "eval_steps_per_second": 89.942, "step": 1110 }, { "epoch": 1.3365155131264916, "grad_norm": 5.6362409591674805, "learning_rate": 2.830762987012987e-05, "loss": 0.4708, "step": 1120 }, { "epoch": 1.3484486873508352, "grad_norm": 6.884702205657959, "learning_rate": 2.8104707792207796e-05, "loss": 0.6569, "step": 1130 }, { "epoch": 1.360381861575179, "grad_norm": 6.990943431854248, "learning_rate": 2.7901785714285717e-05, "loss": 0.5346, "step": 1140 }, { "epoch": 1.360381861575179, "eval_accuracy": 0.6857142857142857, "eval_loss": 0.574230432510376, "eval_runtime": 2.5972, "eval_samples_per_second": 161.712, "eval_steps_per_second": 80.856, "step": 1140 }, { "epoch": 1.3723150357995226, "grad_norm": 4.231634616851807, "learning_rate": 2.7698863636363638e-05, "loss": 0.6154, "step": 1150 }, { "epoch": 1.3842482100238662, "grad_norm": 7.739737510681152, "learning_rate": 2.7495941558441562e-05, "loss": 0.5676, "step": 1160 }, { "epoch": 1.39618138424821, "grad_norm": 9.585612297058105, "learning_rate": 2.7293019480519483e-05, "loss": 0.6113, "step": 1170 }, { "epoch": 1.39618138424821, "eval_accuracy": 0.6785714285714286, "eval_loss": 0.5705173015594482, "eval_runtime": 2.4049, "eval_samples_per_second": 174.642, "eval_steps_per_second": 87.321, "step": 1170 }, { "epoch": 1.4081145584725536, "grad_norm": 10.238170623779297, "learning_rate": 2.7090097402597404e-05, "loss": 0.5694, "step": 1180 }, { "epoch": 1.4200477326968974, "grad_norm": 5.449009895324707, "learning_rate": 2.6887175324675322e-05, "loss": 0.5336, "step": 1190 }, { "epoch": 1.431980906921241, "grad_norm": 6.410137176513672, "learning_rate": 2.668425324675325e-05, "loss": 0.709, "step": 1200 }, { "epoch": 1.431980906921241, "eval_accuracy": 0.6928571428571428, "eval_loss": 0.5699592232704163, "eval_runtime": 2.6398, "eval_samples_per_second": 159.102, "eval_steps_per_second": 79.551, "step": 1200 }, { "epoch": 1.4439140811455848, "grad_norm": 17.243120193481445, "learning_rate": 2.648133116883117e-05, "loss": 0.6117, "step": 1210 }, { "epoch": 1.4558472553699284, "grad_norm": 2.6029751300811768, "learning_rate": 2.627840909090909e-05, "loss": 0.5584, "step": 1220 }, { "epoch": 1.467780429594272, "grad_norm": 7.715820789337158, "learning_rate": 2.6075487012987017e-05, "loss": 0.4975, "step": 1230 }, { "epoch": 1.467780429594272, "eval_accuracy": 0.680952380952381, "eval_loss": 0.5647696256637573, "eval_runtime": 2.6005, "eval_samples_per_second": 161.51, "eval_steps_per_second": 80.755, "step": 1230 }, { "epoch": 1.4797136038186158, "grad_norm": 4.275643348693848, "learning_rate": 2.5872564935064934e-05, "loss": 0.4998, "step": 1240 }, { "epoch": 1.4916467780429594, "grad_norm": 5.787468433380127, "learning_rate": 2.5669642857142855e-05, "loss": 0.5196, "step": 1250 }, { "epoch": 1.503579952267303, "grad_norm": 8.201250076293945, "learning_rate": 2.5466720779220783e-05, "loss": 0.4744, "step": 1260 }, { "epoch": 1.503579952267303, "eval_accuracy": 0.7047619047619048, "eval_loss": 0.5737091302871704, "eval_runtime": 2.7806, "eval_samples_per_second": 151.045, "eval_steps_per_second": 75.522, "step": 1260 }, { "epoch": 1.5155131264916468, "grad_norm": 16.53989601135254, "learning_rate": 2.52637987012987e-05, "loss": 0.5824, "step": 1270 }, { "epoch": 1.5274463007159904, "grad_norm": 4.177210807800293, "learning_rate": 2.5060876623376622e-05, "loss": 0.4096, "step": 1280 }, { "epoch": 1.539379474940334, "grad_norm": 7.058141231536865, "learning_rate": 2.4857954545454546e-05, "loss": 0.4575, "step": 1290 }, { "epoch": 1.539379474940334, "eval_accuracy": 0.7119047619047619, "eval_loss": 0.5913795828819275, "eval_runtime": 3.4982, "eval_samples_per_second": 120.063, "eval_steps_per_second": 60.031, "step": 1290 }, { "epoch": 1.5513126491646778, "grad_norm": 6.7890849113464355, "learning_rate": 2.4655032467532467e-05, "loss": 0.6381, "step": 1300 }, { "epoch": 1.5632458233890216, "grad_norm": 15.614151954650879, "learning_rate": 2.4452110389610392e-05, "loss": 0.5837, "step": 1310 }, { "epoch": 1.575178997613365, "grad_norm": 3.2906830310821533, "learning_rate": 2.4249188311688313e-05, "loss": 0.5163, "step": 1320 }, { "epoch": 1.575178997613365, "eval_accuracy": 0.7047619047619048, "eval_loss": 0.5870974659919739, "eval_runtime": 2.4203, "eval_samples_per_second": 173.531, "eval_steps_per_second": 86.766, "step": 1320 }, { "epoch": 1.5871121718377088, "grad_norm": 8.762399673461914, "learning_rate": 2.4046266233766234e-05, "loss": 0.423, "step": 1330 }, { "epoch": 1.5990453460620526, "grad_norm": 18.587299346923828, "learning_rate": 2.384334415584416e-05, "loss": 0.6604, "step": 1340 }, { "epoch": 1.6109785202863962, "grad_norm": 10.328272819519043, "learning_rate": 2.364042207792208e-05, "loss": 0.4858, "step": 1350 }, { "epoch": 1.6109785202863962, "eval_accuracy": 0.6976190476190476, "eval_loss": 0.5857155323028564, "eval_runtime": 2.5795, "eval_samples_per_second": 162.825, "eval_steps_per_second": 81.412, "step": 1350 }, { "epoch": 1.6229116945107398, "grad_norm": 31.050983428955078, "learning_rate": 2.34375e-05, "loss": 0.5269, "step": 1360 }, { "epoch": 1.6348448687350836, "grad_norm": 26.770954132080078, "learning_rate": 2.3234577922077925e-05, "loss": 0.7522, "step": 1370 }, { "epoch": 1.6467780429594272, "grad_norm": 16.199596405029297, "learning_rate": 2.3031655844155846e-05, "loss": 0.4981, "step": 1380 }, { "epoch": 1.6467780429594272, "eval_accuracy": 0.7071428571428572, "eval_loss": 0.5910340547561646, "eval_runtime": 2.4005, "eval_samples_per_second": 174.965, "eval_steps_per_second": 87.483, "step": 1380 }, { "epoch": 1.6587112171837708, "grad_norm": 8.824426651000977, "learning_rate": 2.2828733766233767e-05, "loss": 0.6491, "step": 1390 }, { "epoch": 1.6706443914081146, "grad_norm": 30.093074798583984, "learning_rate": 2.262581168831169e-05, "loss": 0.5058, "step": 1400 }, { "epoch": 1.6825775656324582, "grad_norm": 5.514789581298828, "learning_rate": 2.242288961038961e-05, "loss": 0.495, "step": 1410 }, { "epoch": 1.6825775656324582, "eval_accuracy": 0.6976190476190476, "eval_loss": 0.6180436015129089, "eval_runtime": 2.4224, "eval_samples_per_second": 173.381, "eval_steps_per_second": 86.691, "step": 1410 }, { "epoch": 1.6945107398568018, "grad_norm": 3.9637222290039062, "learning_rate": 2.2219967532467534e-05, "loss": 0.4785, "step": 1420 }, { "epoch": 1.7064439140811456, "grad_norm": 23.46233367919922, "learning_rate": 2.2017045454545458e-05, "loss": 0.9344, "step": 1430 }, { "epoch": 1.7183770883054894, "grad_norm": 7.6329731941223145, "learning_rate": 2.1814123376623376e-05, "loss": 0.6106, "step": 1440 }, { "epoch": 1.7183770883054894, "eval_accuracy": 0.6857142857142857, "eval_loss": 0.6573019623756409, "eval_runtime": 2.3562, "eval_samples_per_second": 178.255, "eval_steps_per_second": 89.127, "step": 1440 }, { "epoch": 1.7303102625298328, "grad_norm": 2.4250106811523438, "learning_rate": 2.16112012987013e-05, "loss": 0.812, "step": 1450 }, { "epoch": 1.7422434367541766, "grad_norm": 4.6379313468933105, "learning_rate": 2.140827922077922e-05, "loss": 0.4254, "step": 1460 }, { "epoch": 1.7541766109785204, "grad_norm": 2.969158411026001, "learning_rate": 2.1205357142857142e-05, "loss": 0.4755, "step": 1470 }, { "epoch": 1.7541766109785204, "eval_accuracy": 0.6904761904761905, "eval_loss": 0.6418641805648804, "eval_runtime": 2.8834, "eval_samples_per_second": 145.661, "eval_steps_per_second": 72.831, "step": 1470 }, { "epoch": 1.766109785202864, "grad_norm": 30.389511108398438, "learning_rate": 2.1002435064935067e-05, "loss": 0.6588, "step": 1480 }, { "epoch": 1.7780429594272076, "grad_norm": 10.750784873962402, "learning_rate": 2.0799512987012988e-05, "loss": 0.7834, "step": 1490 }, { "epoch": 1.7899761336515514, "grad_norm": 26.425033569335938, "learning_rate": 2.059659090909091e-05, "loss": 0.6807, "step": 1500 }, { "epoch": 1.7899761336515514, "eval_accuracy": 0.6976190476190476, "eval_loss": 0.6332749724388123, "eval_runtime": 2.3964, "eval_samples_per_second": 175.265, "eval_steps_per_second": 87.632, "step": 1500 }, { "epoch": 1.801909307875895, "grad_norm": 1.6902508735656738, "learning_rate": 2.0393668831168833e-05, "loss": 0.6456, "step": 1510 }, { "epoch": 1.8138424821002386, "grad_norm": 4.968941688537598, "learning_rate": 2.0190746753246754e-05, "loss": 0.6374, "step": 1520 }, { "epoch": 1.8257756563245824, "grad_norm": 14.968162536621094, "learning_rate": 1.9987824675324675e-05, "loss": 0.4483, "step": 1530 }, { "epoch": 1.8257756563245824, "eval_accuracy": 0.6976190476190476, "eval_loss": 0.6345822215080261, "eval_runtime": 2.3274, "eval_samples_per_second": 180.458, "eval_steps_per_second": 90.229, "step": 1530 }, { "epoch": 1.837708830548926, "grad_norm": 13.755841255187988, "learning_rate": 1.97849025974026e-05, "loss": 0.7141, "step": 1540 }, { "epoch": 1.8496420047732696, "grad_norm": 7.046818256378174, "learning_rate": 1.958198051948052e-05, "loss": 0.4692, "step": 1550 }, { "epoch": 1.8615751789976134, "grad_norm": 8.693527221679688, "learning_rate": 1.9379058441558442e-05, "loss": 0.618, "step": 1560 }, { "epoch": 1.8615751789976134, "eval_accuracy": 0.7023809523809523, "eval_loss": 0.57932448387146, "eval_runtime": 2.6212, "eval_samples_per_second": 160.233, "eval_steps_per_second": 80.117, "step": 1560 }, { "epoch": 1.8735083532219572, "grad_norm": 17.399871826171875, "learning_rate": 1.9176136363636366e-05, "loss": 0.414, "step": 1570 }, { "epoch": 1.8854415274463006, "grad_norm": 14.802628517150879, "learning_rate": 1.8973214285714284e-05, "loss": 0.47, "step": 1580 }, { "epoch": 1.8973747016706444, "grad_norm": 2.645390748977661, "learning_rate": 1.877029220779221e-05, "loss": 0.2105, "step": 1590 }, { "epoch": 1.8973747016706444, "eval_accuracy": 0.7166666666666667, "eval_loss": 0.6054596900939941, "eval_runtime": 2.5618, "eval_samples_per_second": 163.946, "eval_steps_per_second": 81.973, "step": 1590 }, { "epoch": 1.9093078758949882, "grad_norm": 19.810827255249023, "learning_rate": 1.8567370129870133e-05, "loss": 0.4894, "step": 1600 }, { "epoch": 1.9212410501193318, "grad_norm": 15.894791603088379, "learning_rate": 1.836444805194805e-05, "loss": 0.5804, "step": 1610 }, { "epoch": 1.9331742243436754, "grad_norm": 2.4785335063934326, "learning_rate": 1.8161525974025975e-05, "loss": 0.791, "step": 1620 }, { "epoch": 1.9331742243436754, "eval_accuracy": 0.6976190476190476, "eval_loss": 0.6101633906364441, "eval_runtime": 2.4503, "eval_samples_per_second": 171.407, "eval_steps_per_second": 85.704, "step": 1620 }, { "epoch": 1.9451073985680192, "grad_norm": 1.90475594997406, "learning_rate": 1.79586038961039e-05, "loss": 0.3093, "step": 1630 }, { "epoch": 1.9570405727923628, "grad_norm": 1.2917793989181519, "learning_rate": 1.7755681818181817e-05, "loss": 0.6421, "step": 1640 }, { "epoch": 1.9689737470167064, "grad_norm": 29.986438751220703, "learning_rate": 1.7552759740259742e-05, "loss": 0.5379, "step": 1650 }, { "epoch": 1.9689737470167064, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.6631202101707458, "eval_runtime": 3.6446, "eval_samples_per_second": 115.238, "eval_steps_per_second": 57.619, "step": 1650 }, { "epoch": 1.9809069212410502, "grad_norm": 7.635002136230469, "learning_rate": 1.7349837662337663e-05, "loss": 0.5357, "step": 1660 }, { "epoch": 1.9928400954653938, "grad_norm": 12.643198013305664, "learning_rate": 1.7146915584415584e-05, "loss": 0.3115, "step": 1670 }, { "epoch": 2.0047732696897373, "grad_norm": 23.04033088684082, "learning_rate": 1.694399350649351e-05, "loss": 0.5175, "step": 1680 }, { "epoch": 2.0047732696897373, "eval_accuracy": 0.7, "eval_loss": 0.7028768658638, "eval_runtime": 2.5925, "eval_samples_per_second": 162.009, "eval_steps_per_second": 81.004, "step": 1680 }, { "epoch": 2.016706443914081, "grad_norm": 7.8505048751831055, "learning_rate": 1.674107142857143e-05, "loss": 0.532, "step": 1690 }, { "epoch": 2.028639618138425, "grad_norm": 16.253393173217773, "learning_rate": 1.653814935064935e-05, "loss": 0.565, "step": 1700 }, { "epoch": 2.0405727923627683, "grad_norm": 12.83019733428955, "learning_rate": 1.6335227272727275e-05, "loss": 0.84, "step": 1710 }, { "epoch": 2.0405727923627683, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.6765835881233215, "eval_runtime": 2.7026, "eval_samples_per_second": 155.407, "eval_steps_per_second": 77.703, "step": 1710 }, { "epoch": 2.052505966587112, "grad_norm": 28.117868423461914, "learning_rate": 1.6132305194805196e-05, "loss": 0.8001, "step": 1720 }, { "epoch": 2.064439140811456, "grad_norm": 24.78237533569336, "learning_rate": 1.5929383116883117e-05, "loss": 0.517, "step": 1730 }, { "epoch": 2.0763723150357993, "grad_norm": 10.502264022827148, "learning_rate": 1.572646103896104e-05, "loss": 0.6738, "step": 1740 }, { "epoch": 2.0763723150357993, "eval_accuracy": 0.6976190476190476, "eval_loss": 0.6353161334991455, "eval_runtime": 3.0434, "eval_samples_per_second": 138.005, "eval_steps_per_second": 69.002, "step": 1740 }, { "epoch": 2.088305489260143, "grad_norm": 7.912200927734375, "learning_rate": 1.5523538961038963e-05, "loss": 0.4832, "step": 1750 }, { "epoch": 2.100238663484487, "grad_norm": 2.545396327972412, "learning_rate": 1.5320616883116884e-05, "loss": 0.4363, "step": 1760 }, { "epoch": 2.1121718377088303, "grad_norm": 15.671684265136719, "learning_rate": 1.5117694805194806e-05, "loss": 0.5378, "step": 1770 }, { "epoch": 2.1121718377088303, "eval_accuracy": 0.7023809523809523, "eval_loss": 0.6347052454948425, "eval_runtime": 4.5698, "eval_samples_per_second": 91.907, "eval_steps_per_second": 45.954, "step": 1770 }, { "epoch": 2.124105011933174, "grad_norm": 9.285810470581055, "learning_rate": 1.4914772727272727e-05, "loss": 0.5598, "step": 1780 }, { "epoch": 2.136038186157518, "grad_norm": 11.331174850463867, "learning_rate": 1.471185064935065e-05, "loss": 0.514, "step": 1790 }, { "epoch": 2.1479713603818618, "grad_norm": 5.6925950050354, "learning_rate": 1.4508928571428573e-05, "loss": 0.8433, "step": 1800 }, { "epoch": 2.1479713603818618, "eval_accuracy": 0.7047619047619048, "eval_loss": 0.6480055451393127, "eval_runtime": 2.2846, "eval_samples_per_second": 183.84, "eval_steps_per_second": 91.92, "step": 1800 }, { "epoch": 2.159904534606205, "grad_norm": 1.9029499292373657, "learning_rate": 1.4306006493506494e-05, "loss": 0.5241, "step": 1810 }, { "epoch": 2.171837708830549, "grad_norm": 1.1398459672927856, "learning_rate": 1.4103084415584417e-05, "loss": 0.4211, "step": 1820 }, { "epoch": 2.1837708830548928, "grad_norm": 47.07643508911133, "learning_rate": 1.390016233766234e-05, "loss": 0.5026, "step": 1830 }, { "epoch": 2.1837708830548928, "eval_accuracy": 0.7119047619047619, "eval_loss": 0.6571480631828308, "eval_runtime": 2.2313, "eval_samples_per_second": 188.234, "eval_steps_per_second": 94.117, "step": 1830 }, { "epoch": 2.195704057279236, "grad_norm": 23.394567489624023, "learning_rate": 1.369724025974026e-05, "loss": 0.5248, "step": 1840 }, { "epoch": 2.20763723150358, "grad_norm": 1.195875883102417, "learning_rate": 1.3494318181818183e-05, "loss": 0.6541, "step": 1850 }, { "epoch": 2.2195704057279237, "grad_norm": 50.090850830078125, "learning_rate": 1.3291396103896103e-05, "loss": 0.5707, "step": 1860 }, { "epoch": 2.2195704057279237, "eval_accuracy": 0.7261904761904762, "eval_loss": 0.6893291473388672, "eval_runtime": 2.3215, "eval_samples_per_second": 180.916, "eval_steps_per_second": 90.458, "step": 1860 }, { "epoch": 2.231503579952267, "grad_norm": 10.432644844055176, "learning_rate": 1.3088474025974025e-05, "loss": 0.5073, "step": 1870 }, { "epoch": 2.243436754176611, "grad_norm": 1.8977324962615967, "learning_rate": 1.288555194805195e-05, "loss": 0.4755, "step": 1880 }, { "epoch": 2.2553699284009547, "grad_norm": 37.40773010253906, "learning_rate": 1.268262987012987e-05, "loss": 0.5586, "step": 1890 }, { "epoch": 2.2553699284009547, "eval_accuracy": 0.7214285714285714, "eval_loss": 0.7031128406524658, "eval_runtime": 2.3304, "eval_samples_per_second": 180.23, "eval_steps_per_second": 90.115, "step": 1890 }, { "epoch": 2.2673031026252985, "grad_norm": 0.6878061890602112, "learning_rate": 1.2479707792207792e-05, "loss": 0.4098, "step": 1900 }, { "epoch": 2.279236276849642, "grad_norm": 39.32148361206055, "learning_rate": 1.2276785714285715e-05, "loss": 0.5517, "step": 1910 }, { "epoch": 2.2911694510739857, "grad_norm": 22.189918518066406, "learning_rate": 1.2073863636363638e-05, "loss": 0.6187, "step": 1920 }, { "epoch": 2.2911694510739857, "eval_accuracy": 0.719047619047619, "eval_loss": 0.7172130942344666, "eval_runtime": 2.1357, "eval_samples_per_second": 196.656, "eval_steps_per_second": 98.328, "step": 1920 }, { "epoch": 2.3031026252983295, "grad_norm": 2.2258763313293457, "learning_rate": 1.1870941558441559e-05, "loss": 0.9511, "step": 1930 }, { "epoch": 2.315035799522673, "grad_norm": 9.942549705505371, "learning_rate": 1.1668019480519481e-05, "loss": 1.1122, "step": 1940 }, { "epoch": 2.3269689737470167, "grad_norm": 31.399171829223633, "learning_rate": 1.1465097402597404e-05, "loss": 0.2809, "step": 1950 }, { "epoch": 2.3269689737470167, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.676558792591095, "eval_runtime": 2.3986, "eval_samples_per_second": 175.103, "eval_steps_per_second": 87.552, "step": 1950 }, { "epoch": 2.3389021479713605, "grad_norm": 2.8062965869903564, "learning_rate": 1.1262175324675325e-05, "loss": 0.2739, "step": 1960 }, { "epoch": 2.350835322195704, "grad_norm": 3.7827553749084473, "learning_rate": 1.1059253246753246e-05, "loss": 0.2032, "step": 1970 }, { "epoch": 2.3627684964200477, "grad_norm": 5.71705961227417, "learning_rate": 1.085633116883117e-05, "loss": 0.7962, "step": 1980 }, { "epoch": 2.3627684964200477, "eval_accuracy": 0.7214285714285714, "eval_loss": 0.70233154296875, "eval_runtime": 2.5312, "eval_samples_per_second": 165.93, "eval_steps_per_second": 82.965, "step": 1980 }, { "epoch": 2.3747016706443915, "grad_norm": 44.944766998291016, "learning_rate": 1.0653409090909092e-05, "loss": 0.3817, "step": 1990 }, { "epoch": 2.386634844868735, "grad_norm": 4.525506019592285, "learning_rate": 1.0450487012987013e-05, "loss": 0.6826, "step": 2000 }, { "epoch": 2.3985680190930787, "grad_norm": 55.05386734008789, "learning_rate": 1.0247564935064936e-05, "loss": 0.5505, "step": 2010 }, { "epoch": 2.3985680190930787, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.6966074109077454, "eval_runtime": 2.3118, "eval_samples_per_second": 181.676, "eval_steps_per_second": 90.838, "step": 2010 }, { "epoch": 2.4105011933174225, "grad_norm": 31.68248748779297, "learning_rate": 1.0044642857142858e-05, "loss": 0.7979, "step": 2020 }, { "epoch": 2.422434367541766, "grad_norm": 12.703028678894043, "learning_rate": 9.84172077922078e-06, "loss": 0.3713, "step": 2030 }, { "epoch": 2.4343675417661097, "grad_norm": 32.34121322631836, "learning_rate": 9.638798701298702e-06, "loss": 0.6046, "step": 2040 }, { "epoch": 2.4343675417661097, "eval_accuracy": 0.7119047619047619, "eval_loss": 0.7015347480773926, "eval_runtime": 2.7625, "eval_samples_per_second": 152.038, "eval_steps_per_second": 76.019, "step": 2040 }, { "epoch": 2.4463007159904535, "grad_norm": 7.550265789031982, "learning_rate": 9.435876623376625e-06, "loss": 0.5042, "step": 2050 }, { "epoch": 2.4582338902147973, "grad_norm": 25.04802703857422, "learning_rate": 9.232954545454546e-06, "loss": 0.7355, "step": 2060 }, { "epoch": 2.4701670644391407, "grad_norm": 28.636362075805664, "learning_rate": 9.030032467532467e-06, "loss": 0.5901, "step": 2070 }, { "epoch": 2.4701670644391407, "eval_accuracy": 0.7095238095238096, "eval_loss": 0.6904149651527405, "eval_runtime": 2.6961, "eval_samples_per_second": 155.78, "eval_steps_per_second": 77.89, "step": 2070 }, { "epoch": 2.4821002386634845, "grad_norm": 0.8589219450950623, "learning_rate": 8.827110389610391e-06, "loss": 0.4257, "step": 2080 }, { "epoch": 2.4940334128878283, "grad_norm": 19.250465393066406, "learning_rate": 8.624188311688313e-06, "loss": 0.4201, "step": 2090 }, { "epoch": 2.5059665871121717, "grad_norm": 0.9123141765594482, "learning_rate": 8.421266233766234e-06, "loss": 0.3045, "step": 2100 }, { "epoch": 2.5059665871121717, "eval_accuracy": 0.7095238095238096, "eval_loss": 0.6907983422279358, "eval_runtime": 4.0186, "eval_samples_per_second": 104.514, "eval_steps_per_second": 52.257, "step": 2100 }, { "epoch": 2.5178997613365155, "grad_norm": 11.066191673278809, "learning_rate": 8.218344155844156e-06, "loss": 0.8202, "step": 2110 }, { "epoch": 2.5298329355608593, "grad_norm": 36.017974853515625, "learning_rate": 8.015422077922079e-06, "loss": 0.5932, "step": 2120 }, { "epoch": 2.541766109785203, "grad_norm": 52.19096374511719, "learning_rate": 7.8125e-06, "loss": 0.6463, "step": 2130 }, { "epoch": 2.541766109785203, "eval_accuracy": 0.7238095238095238, "eval_loss": 0.690886914730072, "eval_runtime": 2.3106, "eval_samples_per_second": 181.768, "eval_steps_per_second": 90.884, "step": 2130 }, { "epoch": 2.5536992840095465, "grad_norm": 3.9326083660125732, "learning_rate": 7.609577922077922e-06, "loss": 0.555, "step": 2140 }, { "epoch": 2.5656324582338903, "grad_norm": 0.7392826676368713, "learning_rate": 7.406655844155845e-06, "loss": 0.4957, "step": 2150 }, { "epoch": 2.577565632458234, "grad_norm": 56.22297668457031, "learning_rate": 7.203733766233767e-06, "loss": 0.8132, "step": 2160 }, { "epoch": 2.577565632458234, "eval_accuracy": 0.7261904761904762, "eval_loss": 0.6950424909591675, "eval_runtime": 2.2785, "eval_samples_per_second": 184.335, "eval_steps_per_second": 92.167, "step": 2160 }, { "epoch": 2.5894988066825775, "grad_norm": 35.17913818359375, "learning_rate": 7.000811688311689e-06, "loss": 0.1858, "step": 2170 }, { "epoch": 2.6014319809069213, "grad_norm": 14.962797164916992, "learning_rate": 6.79788961038961e-06, "loss": 0.7816, "step": 2180 }, { "epoch": 2.613365155131265, "grad_norm": 12.208561897277832, "learning_rate": 6.594967532467533e-06, "loss": 0.2369, "step": 2190 }, { "epoch": 2.613365155131265, "eval_accuracy": 0.7071428571428572, "eval_loss": 0.7078821063041687, "eval_runtime": 3.8631, "eval_samples_per_second": 108.721, "eval_steps_per_second": 54.36, "step": 2190 }, { "epoch": 2.6252983293556085, "grad_norm": 24.15699005126953, "learning_rate": 6.392045454545454e-06, "loss": 0.4773, "step": 2200 }, { "epoch": 2.6372315035799523, "grad_norm": 39.538246154785156, "learning_rate": 6.189123376623377e-06, "loss": 1.0734, "step": 2210 }, { "epoch": 2.649164677804296, "grad_norm": 4.4763078689575195, "learning_rate": 5.986201298701299e-06, "loss": 0.9035, "step": 2220 }, { "epoch": 2.649164677804296, "eval_accuracy": 0.7095238095238096, "eval_loss": 0.7065214514732361, "eval_runtime": 2.3446, "eval_samples_per_second": 179.131, "eval_steps_per_second": 89.566, "step": 2220 }, { "epoch": 2.6610978520286395, "grad_norm": 77.67285919189453, "learning_rate": 5.783279220779221e-06, "loss": 0.441, "step": 2230 }, { "epoch": 2.6730310262529833, "grad_norm": 33.727378845214844, "learning_rate": 5.580357142857144e-06, "loss": 0.8442, "step": 2240 }, { "epoch": 2.684964200477327, "grad_norm": 35.148902893066406, "learning_rate": 5.377435064935065e-06, "loss": 0.7039, "step": 2250 }, { "epoch": 2.684964200477327, "eval_accuracy": 0.7119047619047619, "eval_loss": 0.7021663188934326, "eval_runtime": 2.3192, "eval_samples_per_second": 181.1, "eval_steps_per_second": 90.55, "step": 2250 }, { "epoch": 2.6968973747016705, "grad_norm": 5.653315544128418, "learning_rate": 5.1745129870129875e-06, "loss": 0.4918, "step": 2260 }, { "epoch": 2.7088305489260143, "grad_norm": 8.575183868408203, "learning_rate": 4.9715909090909094e-06, "loss": 0.7543, "step": 2270 }, { "epoch": 2.720763723150358, "grad_norm": 2.2537381649017334, "learning_rate": 4.768668831168831e-06, "loss": 0.5493, "step": 2280 }, { "epoch": 2.720763723150358, "eval_accuracy": 0.7071428571428572, "eval_loss": 0.7015241384506226, "eval_runtime": 2.3982, "eval_samples_per_second": 175.129, "eval_steps_per_second": 87.564, "step": 2280 }, { "epoch": 2.7326968973747015, "grad_norm": 1.549895167350769, "learning_rate": 4.565746753246754e-06, "loss": 0.5912, "step": 2290 }, { "epoch": 2.7446300715990453, "grad_norm": 12.03734016418457, "learning_rate": 4.362824675324675e-06, "loss": 0.737, "step": 2300 }, { "epoch": 2.756563245823389, "grad_norm": 41.12443923950195, "learning_rate": 4.159902597402598e-06, "loss": 0.7036, "step": 2310 }, { "epoch": 2.756563245823389, "eval_accuracy": 0.7071428571428572, "eval_loss": 0.7000806331634521, "eval_runtime": 2.3423, "eval_samples_per_second": 179.309, "eval_steps_per_second": 89.655, "step": 2310 }, { "epoch": 2.7684964200477324, "grad_norm": 8.583761215209961, "learning_rate": 3.95698051948052e-06, "loss": 0.5139, "step": 2320 }, { "epoch": 2.7804295942720763, "grad_norm": 10.022917747497559, "learning_rate": 3.7540584415584417e-06, "loss": 0.6057, "step": 2330 }, { "epoch": 2.79236276849642, "grad_norm": 3.399099588394165, "learning_rate": 3.551136363636364e-06, "loss": 0.2663, "step": 2340 }, { "epoch": 2.79236276849642, "eval_accuracy": 0.7095238095238096, "eval_loss": 0.700248658657074, "eval_runtime": 2.2158, "eval_samples_per_second": 189.552, "eval_steps_per_second": 94.776, "step": 2340 }, { "epoch": 2.804295942720764, "grad_norm": 7.17982292175293, "learning_rate": 3.348214285714286e-06, "loss": 0.3956, "step": 2350 }, { "epoch": 2.8162291169451072, "grad_norm": 16.822969436645508, "learning_rate": 3.1452922077922083e-06, "loss": 0.7688, "step": 2360 }, { "epoch": 2.828162291169451, "grad_norm": 34.44697952270508, "learning_rate": 2.94237012987013e-06, "loss": 0.7124, "step": 2370 }, { "epoch": 2.828162291169451, "eval_accuracy": 0.7071428571428572, "eval_loss": 0.704397439956665, "eval_runtime": 2.2782, "eval_samples_per_second": 184.357, "eval_steps_per_second": 92.178, "step": 2370 }, { "epoch": 2.840095465393795, "grad_norm": 45.26344680786133, "learning_rate": 2.739448051948052e-06, "loss": 0.5108, "step": 2380 }, { "epoch": 2.8520286396181387, "grad_norm": 1.225951075553894, "learning_rate": 2.536525974025974e-06, "loss": 0.2401, "step": 2390 }, { "epoch": 2.863961813842482, "grad_norm": 41.542396545410156, "learning_rate": 2.333603896103896e-06, "loss": 0.6387, "step": 2400 }, { "epoch": 2.863961813842482, "eval_accuracy": 0.7071428571428572, "eval_loss": 0.7078101634979248, "eval_runtime": 2.3861, "eval_samples_per_second": 176.023, "eval_steps_per_second": 88.011, "step": 2400 }, { "epoch": 2.875894988066826, "grad_norm": 1.9556139707565308, "learning_rate": 2.1306818181818183e-06, "loss": 0.2796, "step": 2410 }, { "epoch": 2.8878281622911697, "grad_norm": 0.7640268802642822, "learning_rate": 1.9277597402597406e-06, "loss": 0.6271, "step": 2420 }, { "epoch": 2.899761336515513, "grad_norm": 0.8632296919822693, "learning_rate": 1.7248376623376625e-06, "loss": 0.4763, "step": 2430 }, { "epoch": 2.899761336515513, "eval_accuracy": 0.7095238095238096, "eval_loss": 0.708879292011261, "eval_runtime": 2.5831, "eval_samples_per_second": 162.593, "eval_steps_per_second": 81.297, "step": 2430 }, { "epoch": 2.911694510739857, "grad_norm": 32.62404251098633, "learning_rate": 1.5219155844155844e-06, "loss": 0.596, "step": 2440 }, { "epoch": 2.9236276849642007, "grad_norm": 29.240238189697266, "learning_rate": 1.3189935064935065e-06, "loss": 0.8641, "step": 2450 }, { "epoch": 2.935560859188544, "grad_norm": 37.549339294433594, "learning_rate": 1.1160714285714287e-06, "loss": 1.1935, "step": 2460 }, { "epoch": 2.935560859188544, "eval_accuracy": 0.7119047619047619, "eval_loss": 0.710150420665741, "eval_runtime": 2.4165, "eval_samples_per_second": 173.806, "eval_steps_per_second": 86.903, "step": 2460 }, { "epoch": 2.947494033412888, "grad_norm": 46.25457763671875, "learning_rate": 9.131493506493507e-07, "loss": 0.7683, "step": 2470 }, { "epoch": 2.9594272076372317, "grad_norm": 16.143142700195312, "learning_rate": 7.102272727272728e-07, "loss": 0.2425, "step": 2480 }, { "epoch": 2.971360381861575, "grad_norm": 1.7046856880187988, "learning_rate": 5.073051948051948e-07, "loss": 0.5129, "step": 2490 }, { "epoch": 2.971360381861575, "eval_accuracy": 0.7119047619047619, "eval_loss": 0.7110002636909485, "eval_runtime": 2.7123, "eval_samples_per_second": 154.848, "eval_steps_per_second": 77.424, "step": 2490 }, { "epoch": 2.983293556085919, "grad_norm": 0.6385570168495178, "learning_rate": 3.043831168831169e-07, "loss": 0.3595, "step": 2500 }, { "epoch": 2.9952267303102627, "grad_norm": 8.002150535583496, "learning_rate": 1.0146103896103895e-07, "loss": 0.4435, "step": 2510 } ], "logging_steps": 10, "max_steps": 2514, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 798501104640.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }