bert-tiny-zjb-sentiment / trainer_state.json
zjb522's picture
Upload folder using huggingface_hub
115047d verified
{
"best_global_step": 1860,
"best_metric": 0.7261904761904762,
"best_model_checkpoint": "/www/wwwroot/ai_project/model/checkpoint-1260",
"epoch": 3.0,
"eval_steps": 30,
"global_step": 2514,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011933174224343675,
"grad_norm": 7.105306148529053,
"learning_rate": 9e-06,
"loss": 0.6856,
"step": 10
},
{
"epoch": 0.02386634844868735,
"grad_norm": 3.5623154640197754,
"learning_rate": 1.9e-05,
"loss": 0.6737,
"step": 20
},
{
"epoch": 0.03579952267303103,
"grad_norm": 8.588749885559082,
"learning_rate": 2.9e-05,
"loss": 0.7264,
"step": 30
},
{
"epoch": 0.03579952267303103,
"eval_accuracy": 0.6285714285714286,
"eval_loss": 0.680474042892456,
"eval_runtime": 2.9486,
"eval_samples_per_second": 142.438,
"eval_steps_per_second": 71.219,
"step": 30
},
{
"epoch": 0.0477326968973747,
"grad_norm": 7.816359996795654,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.7096,
"step": 40
},
{
"epoch": 0.059665871121718374,
"grad_norm": 7.00941276550293,
"learning_rate": 4.9e-05,
"loss": 0.669,
"step": 50
},
{
"epoch": 0.07159904534606205,
"grad_norm": 5.085658073425293,
"learning_rate": 4.9817370129870134e-05,
"loss": 0.6139,
"step": 60
},
{
"epoch": 0.07159904534606205,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6627190113067627,
"eval_runtime": 2.8882,
"eval_samples_per_second": 145.422,
"eval_steps_per_second": 72.711,
"step": 60
},
{
"epoch": 0.08353221957040573,
"grad_norm": 4.888000011444092,
"learning_rate": 4.961444805194805e-05,
"loss": 0.5252,
"step": 70
},
{
"epoch": 0.0954653937947494,
"grad_norm": 9.089728355407715,
"learning_rate": 4.9411525974025976e-05,
"loss": 0.7837,
"step": 80
},
{
"epoch": 0.10739856801909307,
"grad_norm": 3.44459867477417,
"learning_rate": 4.92086038961039e-05,
"loss": 0.6985,
"step": 90
},
{
"epoch": 0.10739856801909307,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6658735275268555,
"eval_runtime": 2.5125,
"eval_samples_per_second": 167.162,
"eval_steps_per_second": 83.581,
"step": 90
},
{
"epoch": 0.11933174224343675,
"grad_norm": 3.5227367877960205,
"learning_rate": 4.900568181818182e-05,
"loss": 0.6999,
"step": 100
},
{
"epoch": 0.13126491646778043,
"grad_norm": 8.138216972351074,
"learning_rate": 4.880275974025974e-05,
"loss": 0.8203,
"step": 110
},
{
"epoch": 0.1431980906921241,
"grad_norm": 3.1432907581329346,
"learning_rate": 4.859983766233767e-05,
"loss": 0.6327,
"step": 120
},
{
"epoch": 0.1431980906921241,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6644813418388367,
"eval_runtime": 2.9572,
"eval_samples_per_second": 142.025,
"eval_steps_per_second": 71.013,
"step": 120
},
{
"epoch": 0.15513126491646778,
"grad_norm": 2.8620526790618896,
"learning_rate": 4.8396915584415585e-05,
"loss": 0.6161,
"step": 130
},
{
"epoch": 0.16706443914081145,
"grad_norm": 3.9090893268585205,
"learning_rate": 4.819399350649351e-05,
"loss": 0.6733,
"step": 140
},
{
"epoch": 0.17899761336515513,
"grad_norm": 8.569962501525879,
"learning_rate": 4.7991071428571433e-05,
"loss": 0.6391,
"step": 150
},
{
"epoch": 0.17899761336515513,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6607769131660461,
"eval_runtime": 3.3316,
"eval_samples_per_second": 126.065,
"eval_steps_per_second": 63.033,
"step": 150
},
{
"epoch": 0.1909307875894988,
"grad_norm": 5.559383392333984,
"learning_rate": 4.778814935064935e-05,
"loss": 0.6183,
"step": 160
},
{
"epoch": 0.20286396181384247,
"grad_norm": 3.5374624729156494,
"learning_rate": 4.7585227272727276e-05,
"loss": 0.6068,
"step": 170
},
{
"epoch": 0.21479713603818615,
"grad_norm": 5.332399368286133,
"learning_rate": 4.73823051948052e-05,
"loss": 0.5847,
"step": 180
},
{
"epoch": 0.21479713603818615,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6628832817077637,
"eval_runtime": 3.3183,
"eval_samples_per_second": 126.57,
"eval_steps_per_second": 63.285,
"step": 180
},
{
"epoch": 0.22673031026252982,
"grad_norm": 4.872141361236572,
"learning_rate": 4.717938311688312e-05,
"loss": 0.6055,
"step": 190
},
{
"epoch": 0.2386634844868735,
"grad_norm": 4.306164264678955,
"learning_rate": 4.697646103896104e-05,
"loss": 0.6895,
"step": 200
},
{
"epoch": 0.25059665871121717,
"grad_norm": 3.5854930877685547,
"learning_rate": 4.6773538961038967e-05,
"loss": 0.693,
"step": 210
},
{
"epoch": 0.25059665871121717,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6654404401779175,
"eval_runtime": 2.7449,
"eval_samples_per_second": 153.01,
"eval_steps_per_second": 76.505,
"step": 210
},
{
"epoch": 0.26252983293556087,
"grad_norm": 5.1247782707214355,
"learning_rate": 4.6570616883116884e-05,
"loss": 0.7442,
"step": 220
},
{
"epoch": 0.2744630071599045,
"grad_norm": 3.291513681411743,
"learning_rate": 4.636769480519481e-05,
"loss": 0.6717,
"step": 230
},
{
"epoch": 0.2863961813842482,
"grad_norm": 4.713298797607422,
"learning_rate": 4.616477272727273e-05,
"loss": 0.65,
"step": 240
},
{
"epoch": 0.2863961813842482,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.659385621547699,
"eval_runtime": 2.8781,
"eval_samples_per_second": 145.928,
"eval_steps_per_second": 72.964,
"step": 240
},
{
"epoch": 0.29832935560859186,
"grad_norm": 2.7979941368103027,
"learning_rate": 4.596185064935065e-05,
"loss": 0.65,
"step": 250
},
{
"epoch": 0.31026252983293556,
"grad_norm": 2.7257330417633057,
"learning_rate": 4.5758928571428575e-05,
"loss": 0.5902,
"step": 260
},
{
"epoch": 0.3221957040572792,
"grad_norm": 3.3188984394073486,
"learning_rate": 4.55560064935065e-05,
"loss": 0.5494,
"step": 270
},
{
"epoch": 0.3221957040572792,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6594758629798889,
"eval_runtime": 3.3768,
"eval_samples_per_second": 124.379,
"eval_steps_per_second": 62.189,
"step": 270
},
{
"epoch": 0.3341288782816229,
"grad_norm": 5.446512222290039,
"learning_rate": 4.535308441558442e-05,
"loss": 0.7618,
"step": 280
},
{
"epoch": 0.3460620525059666,
"grad_norm": 5.833828926086426,
"learning_rate": 4.5150162337662335e-05,
"loss": 0.6402,
"step": 290
},
{
"epoch": 0.35799522673031026,
"grad_norm": 3.265965223312378,
"learning_rate": 4.4947240259740266e-05,
"loss": 0.5794,
"step": 300
},
{
"epoch": 0.35799522673031026,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6596417427062988,
"eval_runtime": 2.8478,
"eval_samples_per_second": 147.481,
"eval_steps_per_second": 73.741,
"step": 300
},
{
"epoch": 0.36992840095465396,
"grad_norm": 3.3324408531188965,
"learning_rate": 4.4744318181818184e-05,
"loss": 0.6289,
"step": 310
},
{
"epoch": 0.3818615751789976,
"grad_norm": 3.396897315979004,
"learning_rate": 4.45413961038961e-05,
"loss": 0.7469,
"step": 320
},
{
"epoch": 0.3937947494033413,
"grad_norm": 5.416648864746094,
"learning_rate": 4.433847402597403e-05,
"loss": 0.6168,
"step": 330
},
{
"epoch": 0.3937947494033413,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.660291314125061,
"eval_runtime": 2.7128,
"eval_samples_per_second": 154.824,
"eval_steps_per_second": 77.412,
"step": 330
},
{
"epoch": 0.40572792362768495,
"grad_norm": 3.854402542114258,
"learning_rate": 4.413555194805195e-05,
"loss": 0.5962,
"step": 340
},
{
"epoch": 0.41766109785202865,
"grad_norm": 3.4200260639190674,
"learning_rate": 4.393262987012987e-05,
"loss": 0.6299,
"step": 350
},
{
"epoch": 0.4295942720763723,
"grad_norm": 4.237409591674805,
"learning_rate": 4.37297077922078e-05,
"loss": 0.7804,
"step": 360
},
{
"epoch": 0.4295942720763723,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6622642874717712,
"eval_runtime": 2.8864,
"eval_samples_per_second": 145.51,
"eval_steps_per_second": 72.755,
"step": 360
},
{
"epoch": 0.441527446300716,
"grad_norm": 4.508576393127441,
"learning_rate": 4.352678571428572e-05,
"loss": 0.7417,
"step": 370
},
{
"epoch": 0.45346062052505964,
"grad_norm": 2.8814594745635986,
"learning_rate": 4.3323863636363635e-05,
"loss": 0.6707,
"step": 380
},
{
"epoch": 0.46539379474940334,
"grad_norm": 6.597533702850342,
"learning_rate": 4.3120941558441566e-05,
"loss": 0.5901,
"step": 390
},
{
"epoch": 0.46539379474940334,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6637502312660217,
"eval_runtime": 3.1072,
"eval_samples_per_second": 135.172,
"eval_steps_per_second": 67.586,
"step": 390
},
{
"epoch": 0.477326968973747,
"grad_norm": 5.29579496383667,
"learning_rate": 4.2918019480519484e-05,
"loss": 0.701,
"step": 400
},
{
"epoch": 0.4892601431980907,
"grad_norm": 3.737311601638794,
"learning_rate": 4.27150974025974e-05,
"loss": 0.5864,
"step": 410
},
{
"epoch": 0.5011933174224343,
"grad_norm": 4.80122184753418,
"learning_rate": 4.2512175324675326e-05,
"loss": 0.6265,
"step": 420
},
{
"epoch": 0.5011933174224343,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.663756251335144,
"eval_runtime": 3.0077,
"eval_samples_per_second": 139.639,
"eval_steps_per_second": 69.82,
"step": 420
},
{
"epoch": 0.513126491646778,
"grad_norm": 3.392441749572754,
"learning_rate": 4.230925324675325e-05,
"loss": 0.7258,
"step": 430
},
{
"epoch": 0.5250596658711217,
"grad_norm": 2.653325319290161,
"learning_rate": 4.210633116883117e-05,
"loss": 0.7126,
"step": 440
},
{
"epoch": 0.5369928400954654,
"grad_norm": 2.605800151824951,
"learning_rate": 4.190340909090909e-05,
"loss": 0.6815,
"step": 450
},
{
"epoch": 0.5369928400954654,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6631113886833191,
"eval_runtime": 3.0366,
"eval_samples_per_second": 138.313,
"eval_steps_per_second": 69.157,
"step": 450
},
{
"epoch": 0.548926014319809,
"grad_norm": 5.945620536804199,
"learning_rate": 4.170048701298702e-05,
"loss": 0.6262,
"step": 460
},
{
"epoch": 0.5608591885441527,
"grad_norm": 4.985898494720459,
"learning_rate": 4.1497564935064935e-05,
"loss": 0.6651,
"step": 470
},
{
"epoch": 0.5727923627684964,
"grad_norm": 2.658013343811035,
"learning_rate": 4.129464285714286e-05,
"loss": 0.6781,
"step": 480
},
{
"epoch": 0.5727923627684964,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6566287875175476,
"eval_runtime": 3.8874,
"eval_samples_per_second": 108.042,
"eval_steps_per_second": 54.021,
"step": 480
},
{
"epoch": 0.5847255369928401,
"grad_norm": 7.678948879241943,
"learning_rate": 4.1091720779220783e-05,
"loss": 0.7041,
"step": 490
},
{
"epoch": 0.5966587112171837,
"grad_norm": 2.540349006652832,
"learning_rate": 4.08887987012987e-05,
"loss": 0.6513,
"step": 500
},
{
"epoch": 0.6085918854415274,
"grad_norm": 2.527493715286255,
"learning_rate": 4.0685876623376626e-05,
"loss": 0.6985,
"step": 510
},
{
"epoch": 0.6085918854415274,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6595126986503601,
"eval_runtime": 3.3607,
"eval_samples_per_second": 124.974,
"eval_steps_per_second": 62.487,
"step": 510
},
{
"epoch": 0.6205250596658711,
"grad_norm": 5.219762802124023,
"learning_rate": 4.048295454545455e-05,
"loss": 0.6298,
"step": 520
},
{
"epoch": 0.6324582338902148,
"grad_norm": 5.685338020324707,
"learning_rate": 4.028003246753247e-05,
"loss": 0.6654,
"step": 530
},
{
"epoch": 0.6443914081145584,
"grad_norm": 5.298613548278809,
"learning_rate": 4.007711038961039e-05,
"loss": 0.6984,
"step": 540
},
{
"epoch": 0.6443914081145584,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6536137461662292,
"eval_runtime": 4.863,
"eval_samples_per_second": 86.366,
"eval_steps_per_second": 43.183,
"step": 540
},
{
"epoch": 0.6563245823389021,
"grad_norm": 4.052762508392334,
"learning_rate": 3.9874188311688317e-05,
"loss": 0.6984,
"step": 550
},
{
"epoch": 0.6682577565632458,
"grad_norm": 8.290848731994629,
"learning_rate": 3.9671266233766234e-05,
"loss": 0.6689,
"step": 560
},
{
"epoch": 0.6801909307875895,
"grad_norm": 2.341036081314087,
"learning_rate": 3.946834415584416e-05,
"loss": 0.661,
"step": 570
},
{
"epoch": 0.6801909307875895,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6538602709770203,
"eval_runtime": 2.9233,
"eval_samples_per_second": 143.672,
"eval_steps_per_second": 71.836,
"step": 570
},
{
"epoch": 0.6921241050119332,
"grad_norm": 4.4188737869262695,
"learning_rate": 3.926542207792208e-05,
"loss": 0.6026,
"step": 580
},
{
"epoch": 0.7040572792362768,
"grad_norm": 4.814696788787842,
"learning_rate": 3.90625e-05,
"loss": 0.6565,
"step": 590
},
{
"epoch": 0.7159904534606205,
"grad_norm": 5.101158142089844,
"learning_rate": 3.8859577922077925e-05,
"loss": 0.5595,
"step": 600
},
{
"epoch": 0.7159904534606205,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6548095941543579,
"eval_runtime": 3.2011,
"eval_samples_per_second": 131.206,
"eval_steps_per_second": 65.603,
"step": 600
},
{
"epoch": 0.7279236276849642,
"grad_norm": 3.732052803039551,
"learning_rate": 3.865665584415585e-05,
"loss": 0.7697,
"step": 610
},
{
"epoch": 0.7398568019093079,
"grad_norm": 6.07219934463501,
"learning_rate": 3.845373376623377e-05,
"loss": 0.674,
"step": 620
},
{
"epoch": 0.7517899761336515,
"grad_norm": 3.234180212020874,
"learning_rate": 3.825081168831169e-05,
"loss": 0.638,
"step": 630
},
{
"epoch": 0.7517899761336515,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.647255003452301,
"eval_runtime": 3.0145,
"eval_samples_per_second": 139.325,
"eval_steps_per_second": 69.663,
"step": 630
},
{
"epoch": 0.7637231503579952,
"grad_norm": 4.247595310211182,
"learning_rate": 3.8047889610389616e-05,
"loss": 0.6979,
"step": 640
},
{
"epoch": 0.7756563245823389,
"grad_norm": 6.84116268157959,
"learning_rate": 3.7844967532467534e-05,
"loss": 0.6084,
"step": 650
},
{
"epoch": 0.7875894988066826,
"grad_norm": 5.435266017913818,
"learning_rate": 3.764204545454545e-05,
"loss": 0.6514,
"step": 660
},
{
"epoch": 0.7875894988066826,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6437537670135498,
"eval_runtime": 2.3127,
"eval_samples_per_second": 181.608,
"eval_steps_per_second": 90.804,
"step": 660
},
{
"epoch": 0.7995226730310262,
"grad_norm": 2.817640781402588,
"learning_rate": 3.743912337662338e-05,
"loss": 0.696,
"step": 670
},
{
"epoch": 0.8114558472553699,
"grad_norm": 2.5305979251861572,
"learning_rate": 3.72362012987013e-05,
"loss": 0.723,
"step": 680
},
{
"epoch": 0.8233890214797136,
"grad_norm": 8.556059837341309,
"learning_rate": 3.703327922077922e-05,
"loss": 0.661,
"step": 690
},
{
"epoch": 0.8233890214797136,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.644755482673645,
"eval_runtime": 2.2808,
"eval_samples_per_second": 184.148,
"eval_steps_per_second": 92.074,
"step": 690
},
{
"epoch": 0.8353221957040573,
"grad_norm": 2.903482675552368,
"learning_rate": 3.683035714285715e-05,
"loss": 0.6023,
"step": 700
},
{
"epoch": 0.847255369928401,
"grad_norm": 4.76421594619751,
"learning_rate": 3.662743506493507e-05,
"loss": 0.628,
"step": 710
},
{
"epoch": 0.8591885441527446,
"grad_norm": 4.305855751037598,
"learning_rate": 3.6424512987012985e-05,
"loss": 0.6648,
"step": 720
},
{
"epoch": 0.8591885441527446,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6386857032775879,
"eval_runtime": 2.3714,
"eval_samples_per_second": 177.11,
"eval_steps_per_second": 88.555,
"step": 720
},
{
"epoch": 0.8711217183770883,
"grad_norm": 6.615411758422852,
"learning_rate": 3.6221590909090916e-05,
"loss": 0.6341,
"step": 730
},
{
"epoch": 0.883054892601432,
"grad_norm": 6.593195915222168,
"learning_rate": 3.6018668831168834e-05,
"loss": 0.6311,
"step": 740
},
{
"epoch": 0.8949880668257757,
"grad_norm": 4.931580543518066,
"learning_rate": 3.581574675324675e-05,
"loss": 0.56,
"step": 750
},
{
"epoch": 0.8949880668257757,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6398793458938599,
"eval_runtime": 2.3361,
"eval_samples_per_second": 179.79,
"eval_steps_per_second": 89.895,
"step": 750
},
{
"epoch": 0.9069212410501193,
"grad_norm": 3.87886118888855,
"learning_rate": 3.561282467532468e-05,
"loss": 0.6329,
"step": 760
},
{
"epoch": 0.918854415274463,
"grad_norm": 3.161126136779785,
"learning_rate": 3.54099025974026e-05,
"loss": 0.7029,
"step": 770
},
{
"epoch": 0.9307875894988067,
"grad_norm": 7.052578449249268,
"learning_rate": 3.520698051948052e-05,
"loss": 0.6108,
"step": 780
},
{
"epoch": 0.9307875894988067,
"eval_accuracy": 0.6523809523809524,
"eval_loss": 0.6286986470222473,
"eval_runtime": 2.6937,
"eval_samples_per_second": 155.92,
"eval_steps_per_second": 77.96,
"step": 780
},
{
"epoch": 0.9427207637231504,
"grad_norm": 14.41779899597168,
"learning_rate": 3.500405844155844e-05,
"loss": 0.6523,
"step": 790
},
{
"epoch": 0.954653937947494,
"grad_norm": 4.530862331390381,
"learning_rate": 3.480113636363637e-05,
"loss": 0.6565,
"step": 800
},
{
"epoch": 0.9665871121718377,
"grad_norm": 3.9421231746673584,
"learning_rate": 3.4598214285714284e-05,
"loss": 0.5472,
"step": 810
},
{
"epoch": 0.9665871121718377,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6442738771438599,
"eval_runtime": 2.525,
"eval_samples_per_second": 166.334,
"eval_steps_per_second": 83.167,
"step": 810
},
{
"epoch": 0.9785202863961814,
"grad_norm": 3.9132273197174072,
"learning_rate": 3.439529220779221e-05,
"loss": 0.5797,
"step": 820
},
{
"epoch": 0.9904534606205251,
"grad_norm": 4.5334086418151855,
"learning_rate": 3.4192370129870133e-05,
"loss": 0.7948,
"step": 830
},
{
"epoch": 1.0023866348448687,
"grad_norm": 3.76124906539917,
"learning_rate": 3.398944805194805e-05,
"loss": 0.5284,
"step": 840
},
{
"epoch": 1.0023866348448687,
"eval_accuracy": 0.6238095238095238,
"eval_loss": 0.6406013369560242,
"eval_runtime": 2.4016,
"eval_samples_per_second": 174.884,
"eval_steps_per_second": 87.442,
"step": 840
},
{
"epoch": 1.0143198090692125,
"grad_norm": 10.1142578125,
"learning_rate": 3.3786525974025976e-05,
"loss": 0.8632,
"step": 850
},
{
"epoch": 1.026252983293556,
"grad_norm": 5.389456272125244,
"learning_rate": 3.35836038961039e-05,
"loss": 0.5965,
"step": 860
},
{
"epoch": 1.0381861575178997,
"grad_norm": 5.022064685821533,
"learning_rate": 3.338068181818182e-05,
"loss": 0.5518,
"step": 870
},
{
"epoch": 1.0381861575178997,
"eval_accuracy": 0.6214285714285714,
"eval_loss": 0.6387470960617065,
"eval_runtime": 2.4702,
"eval_samples_per_second": 170.027,
"eval_steps_per_second": 85.013,
"step": 870
},
{
"epoch": 1.0501193317422435,
"grad_norm": 3.8693058490753174,
"learning_rate": 3.317775974025974e-05,
"loss": 0.6257,
"step": 880
},
{
"epoch": 1.062052505966587,
"grad_norm": 6.848055839538574,
"learning_rate": 3.2974837662337667e-05,
"loss": 0.7654,
"step": 890
},
{
"epoch": 1.0739856801909309,
"grad_norm": 6.271612644195557,
"learning_rate": 3.2771915584415584e-05,
"loss": 0.5958,
"step": 900
},
{
"epoch": 1.0739856801909309,
"eval_accuracy": 0.6619047619047619,
"eval_loss": 0.6196722388267517,
"eval_runtime": 2.5711,
"eval_samples_per_second": 163.357,
"eval_steps_per_second": 81.678,
"step": 900
},
{
"epoch": 1.0859188544152745,
"grad_norm": 5.1813764572143555,
"learning_rate": 3.256899350649351e-05,
"loss": 0.6688,
"step": 910
},
{
"epoch": 1.097852028639618,
"grad_norm": 8.188650131225586,
"learning_rate": 3.236607142857143e-05,
"loss": 0.5075,
"step": 920
},
{
"epoch": 1.1097852028639619,
"grad_norm": 5.9253249168396,
"learning_rate": 3.216314935064935e-05,
"loss": 0.5268,
"step": 930
},
{
"epoch": 1.1097852028639619,
"eval_accuracy": 0.6547619047619048,
"eval_loss": 0.6201965808868408,
"eval_runtime": 2.6071,
"eval_samples_per_second": 161.097,
"eval_steps_per_second": 80.549,
"step": 930
},
{
"epoch": 1.1217183770883055,
"grad_norm": 8.462143898010254,
"learning_rate": 3.1960227272727275e-05,
"loss": 0.6805,
"step": 940
},
{
"epoch": 1.1336515513126493,
"grad_norm": 3.664438247680664,
"learning_rate": 3.17573051948052e-05,
"loss": 0.5489,
"step": 950
},
{
"epoch": 1.1455847255369929,
"grad_norm": 3.6083319187164307,
"learning_rate": 3.155438311688312e-05,
"loss": 0.5184,
"step": 960
},
{
"epoch": 1.1455847255369929,
"eval_accuracy": 0.638095238095238,
"eval_loss": 0.6298591494560242,
"eval_runtime": 2.2121,
"eval_samples_per_second": 189.863,
"eval_steps_per_second": 94.932,
"step": 960
},
{
"epoch": 1.1575178997613365,
"grad_norm": 4.299609184265137,
"learning_rate": 3.135146103896104e-05,
"loss": 0.5752,
"step": 970
},
{
"epoch": 1.1694510739856803,
"grad_norm": 4.7070183753967285,
"learning_rate": 3.1148538961038966e-05,
"loss": 0.6226,
"step": 980
},
{
"epoch": 1.1813842482100239,
"grad_norm": 4.521914005279541,
"learning_rate": 3.0945616883116884e-05,
"loss": 0.5337,
"step": 990
},
{
"epoch": 1.1813842482100239,
"eval_accuracy": 0.6428571428571429,
"eval_loss": 0.6394156217575073,
"eval_runtime": 2.1577,
"eval_samples_per_second": 194.651,
"eval_steps_per_second": 97.325,
"step": 990
},
{
"epoch": 1.1933174224343674,
"grad_norm": 5.360909461975098,
"learning_rate": 3.07426948051948e-05,
"loss": 0.7913,
"step": 1000
},
{
"epoch": 1.2052505966587113,
"grad_norm": 6.7155585289001465,
"learning_rate": 3.053977272727273e-05,
"loss": 0.6489,
"step": 1010
},
{
"epoch": 1.2171837708830548,
"grad_norm": 7.668753147125244,
"learning_rate": 3.033685064935065e-05,
"loss": 0.5592,
"step": 1020
},
{
"epoch": 1.2171837708830548,
"eval_accuracy": 0.6714285714285714,
"eval_loss": 0.6178216934204102,
"eval_runtime": 2.4819,
"eval_samples_per_second": 169.226,
"eval_steps_per_second": 84.613,
"step": 1020
},
{
"epoch": 1.2291169451073987,
"grad_norm": 3.813314914703369,
"learning_rate": 3.013392857142857e-05,
"loss": 0.5319,
"step": 1030
},
{
"epoch": 1.2410501193317423,
"grad_norm": 8.704251289367676,
"learning_rate": 2.9931006493506496e-05,
"loss": 0.6233,
"step": 1040
},
{
"epoch": 1.2529832935560858,
"grad_norm": 10.739197731018066,
"learning_rate": 2.9728084415584417e-05,
"loss": 0.6285,
"step": 1050
},
{
"epoch": 1.2529832935560858,
"eval_accuracy": 0.6761904761904762,
"eval_loss": 0.6063261032104492,
"eval_runtime": 2.5831,
"eval_samples_per_second": 162.595,
"eval_steps_per_second": 81.298,
"step": 1050
},
{
"epoch": 1.2649164677804297,
"grad_norm": 8.07175064086914,
"learning_rate": 2.9525162337662338e-05,
"loss": 0.5288,
"step": 1060
},
{
"epoch": 1.2768496420047732,
"grad_norm": 11.14047622680664,
"learning_rate": 2.9322240259740263e-05,
"loss": 0.6402,
"step": 1070
},
{
"epoch": 1.288782816229117,
"grad_norm": 17.40794563293457,
"learning_rate": 2.9119318181818184e-05,
"loss": 0.6681,
"step": 1080
},
{
"epoch": 1.288782816229117,
"eval_accuracy": 0.6738095238095239,
"eval_loss": 0.6034413576126099,
"eval_runtime": 2.1352,
"eval_samples_per_second": 196.701,
"eval_steps_per_second": 98.351,
"step": 1080
},
{
"epoch": 1.3007159904534606,
"grad_norm": 6.053748607635498,
"learning_rate": 2.8916396103896105e-05,
"loss": 0.6375,
"step": 1090
},
{
"epoch": 1.3126491646778042,
"grad_norm": 3.6662399768829346,
"learning_rate": 2.871347402597403e-05,
"loss": 0.4657,
"step": 1100
},
{
"epoch": 1.324582338902148,
"grad_norm": 7.520589351654053,
"learning_rate": 2.851055194805195e-05,
"loss": 0.6052,
"step": 1110
},
{
"epoch": 1.324582338902148,
"eval_accuracy": 0.680952380952381,
"eval_loss": 0.589663028717041,
"eval_runtime": 2.3348,
"eval_samples_per_second": 179.884,
"eval_steps_per_second": 89.942,
"step": 1110
},
{
"epoch": 1.3365155131264916,
"grad_norm": 5.6362409591674805,
"learning_rate": 2.830762987012987e-05,
"loss": 0.4708,
"step": 1120
},
{
"epoch": 1.3484486873508352,
"grad_norm": 6.884702205657959,
"learning_rate": 2.8104707792207796e-05,
"loss": 0.6569,
"step": 1130
},
{
"epoch": 1.360381861575179,
"grad_norm": 6.990943431854248,
"learning_rate": 2.7901785714285717e-05,
"loss": 0.5346,
"step": 1140
},
{
"epoch": 1.360381861575179,
"eval_accuracy": 0.6857142857142857,
"eval_loss": 0.574230432510376,
"eval_runtime": 2.5972,
"eval_samples_per_second": 161.712,
"eval_steps_per_second": 80.856,
"step": 1140
},
{
"epoch": 1.3723150357995226,
"grad_norm": 4.231634616851807,
"learning_rate": 2.7698863636363638e-05,
"loss": 0.6154,
"step": 1150
},
{
"epoch": 1.3842482100238662,
"grad_norm": 7.739737510681152,
"learning_rate": 2.7495941558441562e-05,
"loss": 0.5676,
"step": 1160
},
{
"epoch": 1.39618138424821,
"grad_norm": 9.585612297058105,
"learning_rate": 2.7293019480519483e-05,
"loss": 0.6113,
"step": 1170
},
{
"epoch": 1.39618138424821,
"eval_accuracy": 0.6785714285714286,
"eval_loss": 0.5705173015594482,
"eval_runtime": 2.4049,
"eval_samples_per_second": 174.642,
"eval_steps_per_second": 87.321,
"step": 1170
},
{
"epoch": 1.4081145584725536,
"grad_norm": 10.238170623779297,
"learning_rate": 2.7090097402597404e-05,
"loss": 0.5694,
"step": 1180
},
{
"epoch": 1.4200477326968974,
"grad_norm": 5.449009895324707,
"learning_rate": 2.6887175324675322e-05,
"loss": 0.5336,
"step": 1190
},
{
"epoch": 1.431980906921241,
"grad_norm": 6.410137176513672,
"learning_rate": 2.668425324675325e-05,
"loss": 0.709,
"step": 1200
},
{
"epoch": 1.431980906921241,
"eval_accuracy": 0.6928571428571428,
"eval_loss": 0.5699592232704163,
"eval_runtime": 2.6398,
"eval_samples_per_second": 159.102,
"eval_steps_per_second": 79.551,
"step": 1200
},
{
"epoch": 1.4439140811455848,
"grad_norm": 17.243120193481445,
"learning_rate": 2.648133116883117e-05,
"loss": 0.6117,
"step": 1210
},
{
"epoch": 1.4558472553699284,
"grad_norm": 2.6029751300811768,
"learning_rate": 2.627840909090909e-05,
"loss": 0.5584,
"step": 1220
},
{
"epoch": 1.467780429594272,
"grad_norm": 7.715820789337158,
"learning_rate": 2.6075487012987017e-05,
"loss": 0.4975,
"step": 1230
},
{
"epoch": 1.467780429594272,
"eval_accuracy": 0.680952380952381,
"eval_loss": 0.5647696256637573,
"eval_runtime": 2.6005,
"eval_samples_per_second": 161.51,
"eval_steps_per_second": 80.755,
"step": 1230
},
{
"epoch": 1.4797136038186158,
"grad_norm": 4.275643348693848,
"learning_rate": 2.5872564935064934e-05,
"loss": 0.4998,
"step": 1240
},
{
"epoch": 1.4916467780429594,
"grad_norm": 5.787468433380127,
"learning_rate": 2.5669642857142855e-05,
"loss": 0.5196,
"step": 1250
},
{
"epoch": 1.503579952267303,
"grad_norm": 8.201250076293945,
"learning_rate": 2.5466720779220783e-05,
"loss": 0.4744,
"step": 1260
},
{
"epoch": 1.503579952267303,
"eval_accuracy": 0.7047619047619048,
"eval_loss": 0.5737091302871704,
"eval_runtime": 2.7806,
"eval_samples_per_second": 151.045,
"eval_steps_per_second": 75.522,
"step": 1260
},
{
"epoch": 1.5155131264916468,
"grad_norm": 16.53989601135254,
"learning_rate": 2.52637987012987e-05,
"loss": 0.5824,
"step": 1270
},
{
"epoch": 1.5274463007159904,
"grad_norm": 4.177210807800293,
"learning_rate": 2.5060876623376622e-05,
"loss": 0.4096,
"step": 1280
},
{
"epoch": 1.539379474940334,
"grad_norm": 7.058141231536865,
"learning_rate": 2.4857954545454546e-05,
"loss": 0.4575,
"step": 1290
},
{
"epoch": 1.539379474940334,
"eval_accuracy": 0.7119047619047619,
"eval_loss": 0.5913795828819275,
"eval_runtime": 3.4982,
"eval_samples_per_second": 120.063,
"eval_steps_per_second": 60.031,
"step": 1290
},
{
"epoch": 1.5513126491646778,
"grad_norm": 6.7890849113464355,
"learning_rate": 2.4655032467532467e-05,
"loss": 0.6381,
"step": 1300
},
{
"epoch": 1.5632458233890216,
"grad_norm": 15.614151954650879,
"learning_rate": 2.4452110389610392e-05,
"loss": 0.5837,
"step": 1310
},
{
"epoch": 1.575178997613365,
"grad_norm": 3.2906830310821533,
"learning_rate": 2.4249188311688313e-05,
"loss": 0.5163,
"step": 1320
},
{
"epoch": 1.575178997613365,
"eval_accuracy": 0.7047619047619048,
"eval_loss": 0.5870974659919739,
"eval_runtime": 2.4203,
"eval_samples_per_second": 173.531,
"eval_steps_per_second": 86.766,
"step": 1320
},
{
"epoch": 1.5871121718377088,
"grad_norm": 8.762399673461914,
"learning_rate": 2.4046266233766234e-05,
"loss": 0.423,
"step": 1330
},
{
"epoch": 1.5990453460620526,
"grad_norm": 18.587299346923828,
"learning_rate": 2.384334415584416e-05,
"loss": 0.6604,
"step": 1340
},
{
"epoch": 1.6109785202863962,
"grad_norm": 10.328272819519043,
"learning_rate": 2.364042207792208e-05,
"loss": 0.4858,
"step": 1350
},
{
"epoch": 1.6109785202863962,
"eval_accuracy": 0.6976190476190476,
"eval_loss": 0.5857155323028564,
"eval_runtime": 2.5795,
"eval_samples_per_second": 162.825,
"eval_steps_per_second": 81.412,
"step": 1350
},
{
"epoch": 1.6229116945107398,
"grad_norm": 31.050983428955078,
"learning_rate": 2.34375e-05,
"loss": 0.5269,
"step": 1360
},
{
"epoch": 1.6348448687350836,
"grad_norm": 26.770954132080078,
"learning_rate": 2.3234577922077925e-05,
"loss": 0.7522,
"step": 1370
},
{
"epoch": 1.6467780429594272,
"grad_norm": 16.199596405029297,
"learning_rate": 2.3031655844155846e-05,
"loss": 0.4981,
"step": 1380
},
{
"epoch": 1.6467780429594272,
"eval_accuracy": 0.7071428571428572,
"eval_loss": 0.5910340547561646,
"eval_runtime": 2.4005,
"eval_samples_per_second": 174.965,
"eval_steps_per_second": 87.483,
"step": 1380
},
{
"epoch": 1.6587112171837708,
"grad_norm": 8.824426651000977,
"learning_rate": 2.2828733766233767e-05,
"loss": 0.6491,
"step": 1390
},
{
"epoch": 1.6706443914081146,
"grad_norm": 30.093074798583984,
"learning_rate": 2.262581168831169e-05,
"loss": 0.5058,
"step": 1400
},
{
"epoch": 1.6825775656324582,
"grad_norm": 5.514789581298828,
"learning_rate": 2.242288961038961e-05,
"loss": 0.495,
"step": 1410
},
{
"epoch": 1.6825775656324582,
"eval_accuracy": 0.6976190476190476,
"eval_loss": 0.6180436015129089,
"eval_runtime": 2.4224,
"eval_samples_per_second": 173.381,
"eval_steps_per_second": 86.691,
"step": 1410
},
{
"epoch": 1.6945107398568018,
"grad_norm": 3.9637222290039062,
"learning_rate": 2.2219967532467534e-05,
"loss": 0.4785,
"step": 1420
},
{
"epoch": 1.7064439140811456,
"grad_norm": 23.46233367919922,
"learning_rate": 2.2017045454545458e-05,
"loss": 0.9344,
"step": 1430
},
{
"epoch": 1.7183770883054894,
"grad_norm": 7.6329731941223145,
"learning_rate": 2.1814123376623376e-05,
"loss": 0.6106,
"step": 1440
},
{
"epoch": 1.7183770883054894,
"eval_accuracy": 0.6857142857142857,
"eval_loss": 0.6573019623756409,
"eval_runtime": 2.3562,
"eval_samples_per_second": 178.255,
"eval_steps_per_second": 89.127,
"step": 1440
},
{
"epoch": 1.7303102625298328,
"grad_norm": 2.4250106811523438,
"learning_rate": 2.16112012987013e-05,
"loss": 0.812,
"step": 1450
},
{
"epoch": 1.7422434367541766,
"grad_norm": 4.6379313468933105,
"learning_rate": 2.140827922077922e-05,
"loss": 0.4254,
"step": 1460
},
{
"epoch": 1.7541766109785204,
"grad_norm": 2.969158411026001,
"learning_rate": 2.1205357142857142e-05,
"loss": 0.4755,
"step": 1470
},
{
"epoch": 1.7541766109785204,
"eval_accuracy": 0.6904761904761905,
"eval_loss": 0.6418641805648804,
"eval_runtime": 2.8834,
"eval_samples_per_second": 145.661,
"eval_steps_per_second": 72.831,
"step": 1470
},
{
"epoch": 1.766109785202864,
"grad_norm": 30.389511108398438,
"learning_rate": 2.1002435064935067e-05,
"loss": 0.6588,
"step": 1480
},
{
"epoch": 1.7780429594272076,
"grad_norm": 10.750784873962402,
"learning_rate": 2.0799512987012988e-05,
"loss": 0.7834,
"step": 1490
},
{
"epoch": 1.7899761336515514,
"grad_norm": 26.425033569335938,
"learning_rate": 2.059659090909091e-05,
"loss": 0.6807,
"step": 1500
},
{
"epoch": 1.7899761336515514,
"eval_accuracy": 0.6976190476190476,
"eval_loss": 0.6332749724388123,
"eval_runtime": 2.3964,
"eval_samples_per_second": 175.265,
"eval_steps_per_second": 87.632,
"step": 1500
},
{
"epoch": 1.801909307875895,
"grad_norm": 1.6902508735656738,
"learning_rate": 2.0393668831168833e-05,
"loss": 0.6456,
"step": 1510
},
{
"epoch": 1.8138424821002386,
"grad_norm": 4.968941688537598,
"learning_rate": 2.0190746753246754e-05,
"loss": 0.6374,
"step": 1520
},
{
"epoch": 1.8257756563245824,
"grad_norm": 14.968162536621094,
"learning_rate": 1.9987824675324675e-05,
"loss": 0.4483,
"step": 1530
},
{
"epoch": 1.8257756563245824,
"eval_accuracy": 0.6976190476190476,
"eval_loss": 0.6345822215080261,
"eval_runtime": 2.3274,
"eval_samples_per_second": 180.458,
"eval_steps_per_second": 90.229,
"step": 1530
},
{
"epoch": 1.837708830548926,
"grad_norm": 13.755841255187988,
"learning_rate": 1.97849025974026e-05,
"loss": 0.7141,
"step": 1540
},
{
"epoch": 1.8496420047732696,
"grad_norm": 7.046818256378174,
"learning_rate": 1.958198051948052e-05,
"loss": 0.4692,
"step": 1550
},
{
"epoch": 1.8615751789976134,
"grad_norm": 8.693527221679688,
"learning_rate": 1.9379058441558442e-05,
"loss": 0.618,
"step": 1560
},
{
"epoch": 1.8615751789976134,
"eval_accuracy": 0.7023809523809523,
"eval_loss": 0.57932448387146,
"eval_runtime": 2.6212,
"eval_samples_per_second": 160.233,
"eval_steps_per_second": 80.117,
"step": 1560
},
{
"epoch": 1.8735083532219572,
"grad_norm": 17.399871826171875,
"learning_rate": 1.9176136363636366e-05,
"loss": 0.414,
"step": 1570
},
{
"epoch": 1.8854415274463006,
"grad_norm": 14.802628517150879,
"learning_rate": 1.8973214285714284e-05,
"loss": 0.47,
"step": 1580
},
{
"epoch": 1.8973747016706444,
"grad_norm": 2.645390748977661,
"learning_rate": 1.877029220779221e-05,
"loss": 0.2105,
"step": 1590
},
{
"epoch": 1.8973747016706444,
"eval_accuracy": 0.7166666666666667,
"eval_loss": 0.6054596900939941,
"eval_runtime": 2.5618,
"eval_samples_per_second": 163.946,
"eval_steps_per_second": 81.973,
"step": 1590
},
{
"epoch": 1.9093078758949882,
"grad_norm": 19.810827255249023,
"learning_rate": 1.8567370129870133e-05,
"loss": 0.4894,
"step": 1600
},
{
"epoch": 1.9212410501193318,
"grad_norm": 15.894791603088379,
"learning_rate": 1.836444805194805e-05,
"loss": 0.5804,
"step": 1610
},
{
"epoch": 1.9331742243436754,
"grad_norm": 2.4785335063934326,
"learning_rate": 1.8161525974025975e-05,
"loss": 0.791,
"step": 1620
},
{
"epoch": 1.9331742243436754,
"eval_accuracy": 0.6976190476190476,
"eval_loss": 0.6101633906364441,
"eval_runtime": 2.4503,
"eval_samples_per_second": 171.407,
"eval_steps_per_second": 85.704,
"step": 1620
},
{
"epoch": 1.9451073985680192,
"grad_norm": 1.90475594997406,
"learning_rate": 1.79586038961039e-05,
"loss": 0.3093,
"step": 1630
},
{
"epoch": 1.9570405727923628,
"grad_norm": 1.2917793989181519,
"learning_rate": 1.7755681818181817e-05,
"loss": 0.6421,
"step": 1640
},
{
"epoch": 1.9689737470167064,
"grad_norm": 29.986438751220703,
"learning_rate": 1.7552759740259742e-05,
"loss": 0.5379,
"step": 1650
},
{
"epoch": 1.9689737470167064,
"eval_accuracy": 0.7142857142857143,
"eval_loss": 0.6631202101707458,
"eval_runtime": 3.6446,
"eval_samples_per_second": 115.238,
"eval_steps_per_second": 57.619,
"step": 1650
},
{
"epoch": 1.9809069212410502,
"grad_norm": 7.635002136230469,
"learning_rate": 1.7349837662337663e-05,
"loss": 0.5357,
"step": 1660
},
{
"epoch": 1.9928400954653938,
"grad_norm": 12.643198013305664,
"learning_rate": 1.7146915584415584e-05,
"loss": 0.3115,
"step": 1670
},
{
"epoch": 2.0047732696897373,
"grad_norm": 23.04033088684082,
"learning_rate": 1.694399350649351e-05,
"loss": 0.5175,
"step": 1680
},
{
"epoch": 2.0047732696897373,
"eval_accuracy": 0.7,
"eval_loss": 0.7028768658638,
"eval_runtime": 2.5925,
"eval_samples_per_second": 162.009,
"eval_steps_per_second": 81.004,
"step": 1680
},
{
"epoch": 2.016706443914081,
"grad_norm": 7.8505048751831055,
"learning_rate": 1.674107142857143e-05,
"loss": 0.532,
"step": 1690
},
{
"epoch": 2.028639618138425,
"grad_norm": 16.253393173217773,
"learning_rate": 1.653814935064935e-05,
"loss": 0.565,
"step": 1700
},
{
"epoch": 2.0405727923627683,
"grad_norm": 12.83019733428955,
"learning_rate": 1.6335227272727275e-05,
"loss": 0.84,
"step": 1710
},
{
"epoch": 2.0405727923627683,
"eval_accuracy": 0.7142857142857143,
"eval_loss": 0.6765835881233215,
"eval_runtime": 2.7026,
"eval_samples_per_second": 155.407,
"eval_steps_per_second": 77.703,
"step": 1710
},
{
"epoch": 2.052505966587112,
"grad_norm": 28.117868423461914,
"learning_rate": 1.6132305194805196e-05,
"loss": 0.8001,
"step": 1720
},
{
"epoch": 2.064439140811456,
"grad_norm": 24.78237533569336,
"learning_rate": 1.5929383116883117e-05,
"loss": 0.517,
"step": 1730
},
{
"epoch": 2.0763723150357993,
"grad_norm": 10.502264022827148,
"learning_rate": 1.572646103896104e-05,
"loss": 0.6738,
"step": 1740
},
{
"epoch": 2.0763723150357993,
"eval_accuracy": 0.6976190476190476,
"eval_loss": 0.6353161334991455,
"eval_runtime": 3.0434,
"eval_samples_per_second": 138.005,
"eval_steps_per_second": 69.002,
"step": 1740
},
{
"epoch": 2.088305489260143,
"grad_norm": 7.912200927734375,
"learning_rate": 1.5523538961038963e-05,
"loss": 0.4832,
"step": 1750
},
{
"epoch": 2.100238663484487,
"grad_norm": 2.545396327972412,
"learning_rate": 1.5320616883116884e-05,
"loss": 0.4363,
"step": 1760
},
{
"epoch": 2.1121718377088303,
"grad_norm": 15.671684265136719,
"learning_rate": 1.5117694805194806e-05,
"loss": 0.5378,
"step": 1770
},
{
"epoch": 2.1121718377088303,
"eval_accuracy": 0.7023809523809523,
"eval_loss": 0.6347052454948425,
"eval_runtime": 4.5698,
"eval_samples_per_second": 91.907,
"eval_steps_per_second": 45.954,
"step": 1770
},
{
"epoch": 2.124105011933174,
"grad_norm": 9.285810470581055,
"learning_rate": 1.4914772727272727e-05,
"loss": 0.5598,
"step": 1780
},
{
"epoch": 2.136038186157518,
"grad_norm": 11.331174850463867,
"learning_rate": 1.471185064935065e-05,
"loss": 0.514,
"step": 1790
},
{
"epoch": 2.1479713603818618,
"grad_norm": 5.6925950050354,
"learning_rate": 1.4508928571428573e-05,
"loss": 0.8433,
"step": 1800
},
{
"epoch": 2.1479713603818618,
"eval_accuracy": 0.7047619047619048,
"eval_loss": 0.6480055451393127,
"eval_runtime": 2.2846,
"eval_samples_per_second": 183.84,
"eval_steps_per_second": 91.92,
"step": 1800
},
{
"epoch": 2.159904534606205,
"grad_norm": 1.9029499292373657,
"learning_rate": 1.4306006493506494e-05,
"loss": 0.5241,
"step": 1810
},
{
"epoch": 2.171837708830549,
"grad_norm": 1.1398459672927856,
"learning_rate": 1.4103084415584417e-05,
"loss": 0.4211,
"step": 1820
},
{
"epoch": 2.1837708830548928,
"grad_norm": 47.07643508911133,
"learning_rate": 1.390016233766234e-05,
"loss": 0.5026,
"step": 1830
},
{
"epoch": 2.1837708830548928,
"eval_accuracy": 0.7119047619047619,
"eval_loss": 0.6571480631828308,
"eval_runtime": 2.2313,
"eval_samples_per_second": 188.234,
"eval_steps_per_second": 94.117,
"step": 1830
},
{
"epoch": 2.195704057279236,
"grad_norm": 23.394567489624023,
"learning_rate": 1.369724025974026e-05,
"loss": 0.5248,
"step": 1840
},
{
"epoch": 2.20763723150358,
"grad_norm": 1.195875883102417,
"learning_rate": 1.3494318181818183e-05,
"loss": 0.6541,
"step": 1850
},
{
"epoch": 2.2195704057279237,
"grad_norm": 50.090850830078125,
"learning_rate": 1.3291396103896103e-05,
"loss": 0.5707,
"step": 1860
},
{
"epoch": 2.2195704057279237,
"eval_accuracy": 0.7261904761904762,
"eval_loss": 0.6893291473388672,
"eval_runtime": 2.3215,
"eval_samples_per_second": 180.916,
"eval_steps_per_second": 90.458,
"step": 1860
},
{
"epoch": 2.231503579952267,
"grad_norm": 10.432644844055176,
"learning_rate": 1.3088474025974025e-05,
"loss": 0.5073,
"step": 1870
},
{
"epoch": 2.243436754176611,
"grad_norm": 1.8977324962615967,
"learning_rate": 1.288555194805195e-05,
"loss": 0.4755,
"step": 1880
},
{
"epoch": 2.2553699284009547,
"grad_norm": 37.40773010253906,
"learning_rate": 1.268262987012987e-05,
"loss": 0.5586,
"step": 1890
},
{
"epoch": 2.2553699284009547,
"eval_accuracy": 0.7214285714285714,
"eval_loss": 0.7031128406524658,
"eval_runtime": 2.3304,
"eval_samples_per_second": 180.23,
"eval_steps_per_second": 90.115,
"step": 1890
},
{
"epoch": 2.2673031026252985,
"grad_norm": 0.6878061890602112,
"learning_rate": 1.2479707792207792e-05,
"loss": 0.4098,
"step": 1900
},
{
"epoch": 2.279236276849642,
"grad_norm": 39.32148361206055,
"learning_rate": 1.2276785714285715e-05,
"loss": 0.5517,
"step": 1910
},
{
"epoch": 2.2911694510739857,
"grad_norm": 22.189918518066406,
"learning_rate": 1.2073863636363638e-05,
"loss": 0.6187,
"step": 1920
},
{
"epoch": 2.2911694510739857,
"eval_accuracy": 0.719047619047619,
"eval_loss": 0.7172130942344666,
"eval_runtime": 2.1357,
"eval_samples_per_second": 196.656,
"eval_steps_per_second": 98.328,
"step": 1920
},
{
"epoch": 2.3031026252983295,
"grad_norm": 2.2258763313293457,
"learning_rate": 1.1870941558441559e-05,
"loss": 0.9511,
"step": 1930
},
{
"epoch": 2.315035799522673,
"grad_norm": 9.942549705505371,
"learning_rate": 1.1668019480519481e-05,
"loss": 1.1122,
"step": 1940
},
{
"epoch": 2.3269689737470167,
"grad_norm": 31.399171829223633,
"learning_rate": 1.1465097402597404e-05,
"loss": 0.2809,
"step": 1950
},
{
"epoch": 2.3269689737470167,
"eval_accuracy": 0.7142857142857143,
"eval_loss": 0.676558792591095,
"eval_runtime": 2.3986,
"eval_samples_per_second": 175.103,
"eval_steps_per_second": 87.552,
"step": 1950
},
{
"epoch": 2.3389021479713605,
"grad_norm": 2.8062965869903564,
"learning_rate": 1.1262175324675325e-05,
"loss": 0.2739,
"step": 1960
},
{
"epoch": 2.350835322195704,
"grad_norm": 3.7827553749084473,
"learning_rate": 1.1059253246753246e-05,
"loss": 0.2032,
"step": 1970
},
{
"epoch": 2.3627684964200477,
"grad_norm": 5.71705961227417,
"learning_rate": 1.085633116883117e-05,
"loss": 0.7962,
"step": 1980
},
{
"epoch": 2.3627684964200477,
"eval_accuracy": 0.7214285714285714,
"eval_loss": 0.70233154296875,
"eval_runtime": 2.5312,
"eval_samples_per_second": 165.93,
"eval_steps_per_second": 82.965,
"step": 1980
},
{
"epoch": 2.3747016706443915,
"grad_norm": 44.944766998291016,
"learning_rate": 1.0653409090909092e-05,
"loss": 0.3817,
"step": 1990
},
{
"epoch": 2.386634844868735,
"grad_norm": 4.525506019592285,
"learning_rate": 1.0450487012987013e-05,
"loss": 0.6826,
"step": 2000
},
{
"epoch": 2.3985680190930787,
"grad_norm": 55.05386734008789,
"learning_rate": 1.0247564935064936e-05,
"loss": 0.5505,
"step": 2010
},
{
"epoch": 2.3985680190930787,
"eval_accuracy": 0.7142857142857143,
"eval_loss": 0.6966074109077454,
"eval_runtime": 2.3118,
"eval_samples_per_second": 181.676,
"eval_steps_per_second": 90.838,
"step": 2010
},
{
"epoch": 2.4105011933174225,
"grad_norm": 31.68248748779297,
"learning_rate": 1.0044642857142858e-05,
"loss": 0.7979,
"step": 2020
},
{
"epoch": 2.422434367541766,
"grad_norm": 12.703028678894043,
"learning_rate": 9.84172077922078e-06,
"loss": 0.3713,
"step": 2030
},
{
"epoch": 2.4343675417661097,
"grad_norm": 32.34121322631836,
"learning_rate": 9.638798701298702e-06,
"loss": 0.6046,
"step": 2040
},
{
"epoch": 2.4343675417661097,
"eval_accuracy": 0.7119047619047619,
"eval_loss": 0.7015347480773926,
"eval_runtime": 2.7625,
"eval_samples_per_second": 152.038,
"eval_steps_per_second": 76.019,
"step": 2040
},
{
"epoch": 2.4463007159904535,
"grad_norm": 7.550265789031982,
"learning_rate": 9.435876623376625e-06,
"loss": 0.5042,
"step": 2050
},
{
"epoch": 2.4582338902147973,
"grad_norm": 25.04802703857422,
"learning_rate": 9.232954545454546e-06,
"loss": 0.7355,
"step": 2060
},
{
"epoch": 2.4701670644391407,
"grad_norm": 28.636362075805664,
"learning_rate": 9.030032467532467e-06,
"loss": 0.5901,
"step": 2070
},
{
"epoch": 2.4701670644391407,
"eval_accuracy": 0.7095238095238096,
"eval_loss": 0.6904149651527405,
"eval_runtime": 2.6961,
"eval_samples_per_second": 155.78,
"eval_steps_per_second": 77.89,
"step": 2070
},
{
"epoch": 2.4821002386634845,
"grad_norm": 0.8589219450950623,
"learning_rate": 8.827110389610391e-06,
"loss": 0.4257,
"step": 2080
},
{
"epoch": 2.4940334128878283,
"grad_norm": 19.250465393066406,
"learning_rate": 8.624188311688313e-06,
"loss": 0.4201,
"step": 2090
},
{
"epoch": 2.5059665871121717,
"grad_norm": 0.9123141765594482,
"learning_rate": 8.421266233766234e-06,
"loss": 0.3045,
"step": 2100
},
{
"epoch": 2.5059665871121717,
"eval_accuracy": 0.7095238095238096,
"eval_loss": 0.6907983422279358,
"eval_runtime": 4.0186,
"eval_samples_per_second": 104.514,
"eval_steps_per_second": 52.257,
"step": 2100
},
{
"epoch": 2.5178997613365155,
"grad_norm": 11.066191673278809,
"learning_rate": 8.218344155844156e-06,
"loss": 0.8202,
"step": 2110
},
{
"epoch": 2.5298329355608593,
"grad_norm": 36.017974853515625,
"learning_rate": 8.015422077922079e-06,
"loss": 0.5932,
"step": 2120
},
{
"epoch": 2.541766109785203,
"grad_norm": 52.19096374511719,
"learning_rate": 7.8125e-06,
"loss": 0.6463,
"step": 2130
},
{
"epoch": 2.541766109785203,
"eval_accuracy": 0.7238095238095238,
"eval_loss": 0.690886914730072,
"eval_runtime": 2.3106,
"eval_samples_per_second": 181.768,
"eval_steps_per_second": 90.884,
"step": 2130
},
{
"epoch": 2.5536992840095465,
"grad_norm": 3.9326083660125732,
"learning_rate": 7.609577922077922e-06,
"loss": 0.555,
"step": 2140
},
{
"epoch": 2.5656324582338903,
"grad_norm": 0.7392826676368713,
"learning_rate": 7.406655844155845e-06,
"loss": 0.4957,
"step": 2150
},
{
"epoch": 2.577565632458234,
"grad_norm": 56.22297668457031,
"learning_rate": 7.203733766233767e-06,
"loss": 0.8132,
"step": 2160
},
{
"epoch": 2.577565632458234,
"eval_accuracy": 0.7261904761904762,
"eval_loss": 0.6950424909591675,
"eval_runtime": 2.2785,
"eval_samples_per_second": 184.335,
"eval_steps_per_second": 92.167,
"step": 2160
},
{
"epoch": 2.5894988066825775,
"grad_norm": 35.17913818359375,
"learning_rate": 7.000811688311689e-06,
"loss": 0.1858,
"step": 2170
},
{
"epoch": 2.6014319809069213,
"grad_norm": 14.962797164916992,
"learning_rate": 6.79788961038961e-06,
"loss": 0.7816,
"step": 2180
},
{
"epoch": 2.613365155131265,
"grad_norm": 12.208561897277832,
"learning_rate": 6.594967532467533e-06,
"loss": 0.2369,
"step": 2190
},
{
"epoch": 2.613365155131265,
"eval_accuracy": 0.7071428571428572,
"eval_loss": 0.7078821063041687,
"eval_runtime": 3.8631,
"eval_samples_per_second": 108.721,
"eval_steps_per_second": 54.36,
"step": 2190
},
{
"epoch": 2.6252983293556085,
"grad_norm": 24.15699005126953,
"learning_rate": 6.392045454545454e-06,
"loss": 0.4773,
"step": 2200
},
{
"epoch": 2.6372315035799523,
"grad_norm": 39.538246154785156,
"learning_rate": 6.189123376623377e-06,
"loss": 1.0734,
"step": 2210
},
{
"epoch": 2.649164677804296,
"grad_norm": 4.4763078689575195,
"learning_rate": 5.986201298701299e-06,
"loss": 0.9035,
"step": 2220
},
{
"epoch": 2.649164677804296,
"eval_accuracy": 0.7095238095238096,
"eval_loss": 0.7065214514732361,
"eval_runtime": 2.3446,
"eval_samples_per_second": 179.131,
"eval_steps_per_second": 89.566,
"step": 2220
},
{
"epoch": 2.6610978520286395,
"grad_norm": 77.67285919189453,
"learning_rate": 5.783279220779221e-06,
"loss": 0.441,
"step": 2230
},
{
"epoch": 2.6730310262529833,
"grad_norm": 33.727378845214844,
"learning_rate": 5.580357142857144e-06,
"loss": 0.8442,
"step": 2240
},
{
"epoch": 2.684964200477327,
"grad_norm": 35.148902893066406,
"learning_rate": 5.377435064935065e-06,
"loss": 0.7039,
"step": 2250
},
{
"epoch": 2.684964200477327,
"eval_accuracy": 0.7119047619047619,
"eval_loss": 0.7021663188934326,
"eval_runtime": 2.3192,
"eval_samples_per_second": 181.1,
"eval_steps_per_second": 90.55,
"step": 2250
},
{
"epoch": 2.6968973747016705,
"grad_norm": 5.653315544128418,
"learning_rate": 5.1745129870129875e-06,
"loss": 0.4918,
"step": 2260
},
{
"epoch": 2.7088305489260143,
"grad_norm": 8.575183868408203,
"learning_rate": 4.9715909090909094e-06,
"loss": 0.7543,
"step": 2270
},
{
"epoch": 2.720763723150358,
"grad_norm": 2.2537381649017334,
"learning_rate": 4.768668831168831e-06,
"loss": 0.5493,
"step": 2280
},
{
"epoch": 2.720763723150358,
"eval_accuracy": 0.7071428571428572,
"eval_loss": 0.7015241384506226,
"eval_runtime": 2.3982,
"eval_samples_per_second": 175.129,
"eval_steps_per_second": 87.564,
"step": 2280
},
{
"epoch": 2.7326968973747015,
"grad_norm": 1.549895167350769,
"learning_rate": 4.565746753246754e-06,
"loss": 0.5912,
"step": 2290
},
{
"epoch": 2.7446300715990453,
"grad_norm": 12.03734016418457,
"learning_rate": 4.362824675324675e-06,
"loss": 0.737,
"step": 2300
},
{
"epoch": 2.756563245823389,
"grad_norm": 41.12443923950195,
"learning_rate": 4.159902597402598e-06,
"loss": 0.7036,
"step": 2310
},
{
"epoch": 2.756563245823389,
"eval_accuracy": 0.7071428571428572,
"eval_loss": 0.7000806331634521,
"eval_runtime": 2.3423,
"eval_samples_per_second": 179.309,
"eval_steps_per_second": 89.655,
"step": 2310
},
{
"epoch": 2.7684964200477324,
"grad_norm": 8.583761215209961,
"learning_rate": 3.95698051948052e-06,
"loss": 0.5139,
"step": 2320
},
{
"epoch": 2.7804295942720763,
"grad_norm": 10.022917747497559,
"learning_rate": 3.7540584415584417e-06,
"loss": 0.6057,
"step": 2330
},
{
"epoch": 2.79236276849642,
"grad_norm": 3.399099588394165,
"learning_rate": 3.551136363636364e-06,
"loss": 0.2663,
"step": 2340
},
{
"epoch": 2.79236276849642,
"eval_accuracy": 0.7095238095238096,
"eval_loss": 0.700248658657074,
"eval_runtime": 2.2158,
"eval_samples_per_second": 189.552,
"eval_steps_per_second": 94.776,
"step": 2340
},
{
"epoch": 2.804295942720764,
"grad_norm": 7.17982292175293,
"learning_rate": 3.348214285714286e-06,
"loss": 0.3956,
"step": 2350
},
{
"epoch": 2.8162291169451072,
"grad_norm": 16.822969436645508,
"learning_rate": 3.1452922077922083e-06,
"loss": 0.7688,
"step": 2360
},
{
"epoch": 2.828162291169451,
"grad_norm": 34.44697952270508,
"learning_rate": 2.94237012987013e-06,
"loss": 0.7124,
"step": 2370
},
{
"epoch": 2.828162291169451,
"eval_accuracy": 0.7071428571428572,
"eval_loss": 0.704397439956665,
"eval_runtime": 2.2782,
"eval_samples_per_second": 184.357,
"eval_steps_per_second": 92.178,
"step": 2370
},
{
"epoch": 2.840095465393795,
"grad_norm": 45.26344680786133,
"learning_rate": 2.739448051948052e-06,
"loss": 0.5108,
"step": 2380
},
{
"epoch": 2.8520286396181387,
"grad_norm": 1.225951075553894,
"learning_rate": 2.536525974025974e-06,
"loss": 0.2401,
"step": 2390
},
{
"epoch": 2.863961813842482,
"grad_norm": 41.542396545410156,
"learning_rate": 2.333603896103896e-06,
"loss": 0.6387,
"step": 2400
},
{
"epoch": 2.863961813842482,
"eval_accuracy": 0.7071428571428572,
"eval_loss": 0.7078101634979248,
"eval_runtime": 2.3861,
"eval_samples_per_second": 176.023,
"eval_steps_per_second": 88.011,
"step": 2400
},
{
"epoch": 2.875894988066826,
"grad_norm": 1.9556139707565308,
"learning_rate": 2.1306818181818183e-06,
"loss": 0.2796,
"step": 2410
},
{
"epoch": 2.8878281622911697,
"grad_norm": 0.7640268802642822,
"learning_rate": 1.9277597402597406e-06,
"loss": 0.6271,
"step": 2420
},
{
"epoch": 2.899761336515513,
"grad_norm": 0.8632296919822693,
"learning_rate": 1.7248376623376625e-06,
"loss": 0.4763,
"step": 2430
},
{
"epoch": 2.899761336515513,
"eval_accuracy": 0.7095238095238096,
"eval_loss": 0.708879292011261,
"eval_runtime": 2.5831,
"eval_samples_per_second": 162.593,
"eval_steps_per_second": 81.297,
"step": 2430
},
{
"epoch": 2.911694510739857,
"grad_norm": 32.62404251098633,
"learning_rate": 1.5219155844155844e-06,
"loss": 0.596,
"step": 2440
},
{
"epoch": 2.9236276849642007,
"grad_norm": 29.240238189697266,
"learning_rate": 1.3189935064935065e-06,
"loss": 0.8641,
"step": 2450
},
{
"epoch": 2.935560859188544,
"grad_norm": 37.549339294433594,
"learning_rate": 1.1160714285714287e-06,
"loss": 1.1935,
"step": 2460
},
{
"epoch": 2.935560859188544,
"eval_accuracy": 0.7119047619047619,
"eval_loss": 0.710150420665741,
"eval_runtime": 2.4165,
"eval_samples_per_second": 173.806,
"eval_steps_per_second": 86.903,
"step": 2460
},
{
"epoch": 2.947494033412888,
"grad_norm": 46.25457763671875,
"learning_rate": 9.131493506493507e-07,
"loss": 0.7683,
"step": 2470
},
{
"epoch": 2.9594272076372317,
"grad_norm": 16.143142700195312,
"learning_rate": 7.102272727272728e-07,
"loss": 0.2425,
"step": 2480
},
{
"epoch": 2.971360381861575,
"grad_norm": 1.7046856880187988,
"learning_rate": 5.073051948051948e-07,
"loss": 0.5129,
"step": 2490
},
{
"epoch": 2.971360381861575,
"eval_accuracy": 0.7119047619047619,
"eval_loss": 0.7110002636909485,
"eval_runtime": 2.7123,
"eval_samples_per_second": 154.848,
"eval_steps_per_second": 77.424,
"step": 2490
},
{
"epoch": 2.983293556085919,
"grad_norm": 0.6385570168495178,
"learning_rate": 3.043831168831169e-07,
"loss": 0.3595,
"step": 2500
},
{
"epoch": 2.9952267303102627,
"grad_norm": 8.002150535583496,
"learning_rate": 1.0146103896103895e-07,
"loss": 0.4435,
"step": 2510
}
],
"logging_steps": 10,
"max_steps": 2514,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 90,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 798501104640.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}