dzungpham's picture
upload checkpoints
34959f8 verified
{
"best_global_step": 1000,
"best_metric": 0.6724504812400831,
"best_model_checkpoint": "training/fourier-spectral-norm-classifier/checkpoint-1000",
"epoch": 1.5353121801432958,
"eval_steps": 500,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"SWA": "started",
"epoch": 0,
"step": 0
},
{
"epoch": 0.00511770726714432,
"grad_norm": 1.7937116622924805,
"learning_rate": 8.19672131147541e-09,
"loss": 0.8149,
"step": 5
},
{
"epoch": 0.01023541453428864,
"grad_norm": 1.8986879587173462,
"learning_rate": 1.844262295081967e-08,
"loss": 0.8145,
"step": 10
},
{
"epoch": 0.015353121801432957,
"grad_norm": 1.8692522048950195,
"learning_rate": 2.8688524590163933e-08,
"loss": 0.8031,
"step": 15
},
{
"epoch": 0.02047082906857728,
"grad_norm": 1.6589646339416504,
"learning_rate": 3.8934426229508196e-08,
"loss": 0.8208,
"step": 20
},
{
"epoch": 0.0255885363357216,
"grad_norm": 2.377978563308716,
"learning_rate": 4.918032786885246e-08,
"loss": 0.8054,
"step": 25
},
{
"epoch": 0.030706243602865915,
"grad_norm": 2.000364065170288,
"learning_rate": 5.9426229508196716e-08,
"loss": 0.8064,
"step": 30
},
{
"epoch": 0.03582395087001024,
"grad_norm": 1.8844542503356934,
"learning_rate": 6.967213114754098e-08,
"loss": 0.8047,
"step": 35
},
{
"epoch": 0.04094165813715456,
"grad_norm": 2.0933573246002197,
"learning_rate": 7.991803278688524e-08,
"loss": 0.8156,
"step": 40
},
{
"epoch": 0.04605936540429888,
"grad_norm": 1.8126033544540405,
"learning_rate": 9.01639344262295e-08,
"loss": 0.8074,
"step": 45
},
{
"epoch": 0.0511770726714432,
"grad_norm": 2.5709195137023926,
"learning_rate": 1.0040983606557377e-07,
"loss": 0.8124,
"step": 50
},
{
"epoch": 0.05629477993858751,
"grad_norm": 2.1875293254852295,
"learning_rate": 1.1065573770491803e-07,
"loss": 0.8143,
"step": 55
},
{
"epoch": 0.06141248720573183,
"grad_norm": 2.0810351371765137,
"learning_rate": 1.209016393442623e-07,
"loss": 0.8149,
"step": 60
},
{
"epoch": 0.06653019447287616,
"grad_norm": 1.7912037372589111,
"learning_rate": 1.3114754098360656e-07,
"loss": 0.8022,
"step": 65
},
{
"epoch": 0.07164790174002048,
"grad_norm": 1.7301534414291382,
"learning_rate": 1.413934426229508e-07,
"loss": 0.8149,
"step": 70
},
{
"epoch": 0.0767656090071648,
"grad_norm": 1.9520158767700195,
"learning_rate": 1.5163934426229508e-07,
"loss": 0.8201,
"step": 75
},
{
"epoch": 0.08188331627430911,
"grad_norm": 2.11938214302063,
"learning_rate": 1.6188524590163935e-07,
"loss": 0.8079,
"step": 80
},
{
"epoch": 0.08700102354145343,
"grad_norm": 2.1483607292175293,
"learning_rate": 1.7213114754098358e-07,
"loss": 0.8084,
"step": 85
},
{
"epoch": 0.09211873080859775,
"grad_norm": 2.1716372966766357,
"learning_rate": 1.8237704918032787e-07,
"loss": 0.8188,
"step": 90
},
{
"epoch": 0.09723643807574207,
"grad_norm": 2.3327996730804443,
"learning_rate": 1.926229508196721e-07,
"loss": 0.8153,
"step": 95
},
{
"epoch": 0.1023541453428864,
"grad_norm": 1.762168526649475,
"learning_rate": 2.028688524590164e-07,
"loss": 0.8064,
"step": 100
},
{
"epoch": 0.10747185261003071,
"grad_norm": 1.7200757265090942,
"learning_rate": 2.1311475409836064e-07,
"loss": 0.8063,
"step": 105
},
{
"epoch": 0.11258955987717502,
"grad_norm": 2.490513324737549,
"learning_rate": 2.233606557377049e-07,
"loss": 0.8192,
"step": 110
},
{
"epoch": 0.11770726714431934,
"grad_norm": 2.244020938873291,
"learning_rate": 2.336065573770492e-07,
"loss": 0.8153,
"step": 115
},
{
"epoch": 0.12282497441146366,
"grad_norm": 2.1315150260925293,
"learning_rate": 2.438524590163934e-07,
"loss": 0.807,
"step": 120
},
{
"epoch": 0.12794268167860798,
"grad_norm": 2.320936918258667,
"learning_rate": 2.540983606557377e-07,
"loss": 0.8163,
"step": 125
},
{
"epoch": 0.1330603889457523,
"grad_norm": 2.7143912315368652,
"learning_rate": 2.643442622950819e-07,
"loss": 0.8166,
"step": 130
},
{
"epoch": 0.13817809621289662,
"grad_norm": 1.649880290031433,
"learning_rate": 2.7459016393442624e-07,
"loss": 0.8113,
"step": 135
},
{
"epoch": 0.14329580348004095,
"grad_norm": 2.171790361404419,
"learning_rate": 2.848360655737705e-07,
"loss": 0.805,
"step": 140
},
{
"epoch": 0.14841351074718526,
"grad_norm": 2.093440294265747,
"learning_rate": 2.950819672131147e-07,
"loss": 0.8118,
"step": 145
},
{
"epoch": 0.1535312180143296,
"grad_norm": 1.9067059755325317,
"learning_rate": 3.05327868852459e-07,
"loss": 0.8047,
"step": 150
},
{
"epoch": 0.1586489252814739,
"grad_norm": 1.9988980293273926,
"learning_rate": 3.155737704918033e-07,
"loss": 0.8091,
"step": 155
},
{
"epoch": 0.16376663254861823,
"grad_norm": 1.696977972984314,
"learning_rate": 3.258196721311475e-07,
"loss": 0.8101,
"step": 160
},
{
"epoch": 0.16888433981576254,
"grad_norm": 2.098017454147339,
"learning_rate": 3.3606557377049177e-07,
"loss": 0.81,
"step": 165
},
{
"epoch": 0.17400204708290687,
"grad_norm": 2.0255584716796875,
"learning_rate": 3.463114754098361e-07,
"loss": 0.814,
"step": 170
},
{
"epoch": 0.17911975435005117,
"grad_norm": 1.8376339673995972,
"learning_rate": 3.565573770491803e-07,
"loss": 0.8053,
"step": 175
},
{
"epoch": 0.1842374616171955,
"grad_norm": 1.9230207204818726,
"learning_rate": 3.6680327868852456e-07,
"loss": 0.8022,
"step": 180
},
{
"epoch": 0.18935516888433981,
"grad_norm": 1.939705729484558,
"learning_rate": 3.770491803278688e-07,
"loss": 0.8075,
"step": 185
},
{
"epoch": 0.19447287615148415,
"grad_norm": 1.6276813745498657,
"learning_rate": 3.8729508196721314e-07,
"loss": 0.8097,
"step": 190
},
{
"epoch": 0.19959058341862845,
"grad_norm": 1.7544569969177246,
"learning_rate": 3.9754098360655735e-07,
"loss": 0.8046,
"step": 195
},
{
"epoch": 0.2047082906857728,
"grad_norm": 1.7406467199325562,
"learning_rate": 4.077868852459016e-07,
"loss": 0.8149,
"step": 200
},
{
"epoch": 0.2098259979529171,
"grad_norm": 1.7330560684204102,
"learning_rate": 4.180327868852459e-07,
"loss": 0.8077,
"step": 205
},
{
"epoch": 0.21494370522006143,
"grad_norm": 1.417546033859253,
"learning_rate": 4.2827868852459014e-07,
"loss": 0.807,
"step": 210
},
{
"epoch": 0.22006141248720573,
"grad_norm": 2.1064000129699707,
"learning_rate": 4.385245901639344e-07,
"loss": 0.8041,
"step": 215
},
{
"epoch": 0.22517911975435004,
"grad_norm": 1.637609601020813,
"learning_rate": 4.487704918032787e-07,
"loss": 0.7992,
"step": 220
},
{
"epoch": 0.23029682702149437,
"grad_norm": 1.659397840499878,
"learning_rate": 4.590163934426229e-07,
"loss": 0.802,
"step": 225
},
{
"epoch": 0.23541453428863868,
"grad_norm": 1.6912051439285278,
"learning_rate": 4.692622950819672e-07,
"loss": 0.8005,
"step": 230
},
{
"epoch": 0.240532241555783,
"grad_norm": 1.9433246850967407,
"learning_rate": 4.795081967213115e-07,
"loss": 0.8079,
"step": 235
},
{
"epoch": 0.24564994882292732,
"grad_norm": 1.9640270471572876,
"learning_rate": 4.897540983606557e-07,
"loss": 0.8127,
"step": 240
},
{
"epoch": 0.2507676560900716,
"grad_norm": 2.3167271614074707,
"learning_rate": 5e-07,
"loss": 0.8058,
"step": 245
},
{
"epoch": 0.25588536335721596,
"grad_norm": 1.6469106674194336,
"learning_rate": 5.102459016393442e-07,
"loss": 0.8011,
"step": 250
},
{
"epoch": 0.2610030706243603,
"grad_norm": 1.5691314935684204,
"learning_rate": 5.204918032786885e-07,
"loss": 0.7968,
"step": 255
},
{
"epoch": 0.2661207778915046,
"grad_norm": 1.663665533065796,
"learning_rate": 5.307377049180327e-07,
"loss": 0.8018,
"step": 260
},
{
"epoch": 0.2712384851586489,
"grad_norm": 1.99347984790802,
"learning_rate": 5.40983606557377e-07,
"loss": 0.8006,
"step": 265
},
{
"epoch": 0.27635619242579323,
"grad_norm": 1.4906947612762451,
"learning_rate": 5.512295081967213e-07,
"loss": 0.7977,
"step": 270
},
{
"epoch": 0.28147389969293757,
"grad_norm": 1.786527395248413,
"learning_rate": 5.614754098360656e-07,
"loss": 0.8041,
"step": 275
},
{
"epoch": 0.2865916069600819,
"grad_norm": 1.9175364971160889,
"learning_rate": 5.717213114754098e-07,
"loss": 0.8079,
"step": 280
},
{
"epoch": 0.2917093142272262,
"grad_norm": 1.678741216659546,
"learning_rate": 5.819672131147541e-07,
"loss": 0.7974,
"step": 285
},
{
"epoch": 0.2968270214943705,
"grad_norm": 2.0347344875335693,
"learning_rate": 5.922131147540983e-07,
"loss": 0.8011,
"step": 290
},
{
"epoch": 0.30194472876151485,
"grad_norm": 1.8914201259613037,
"learning_rate": 6.024590163934425e-07,
"loss": 0.8026,
"step": 295
},
{
"epoch": 0.3070624360286592,
"grad_norm": 1.6236293315887451,
"learning_rate": 6.127049180327869e-07,
"loss": 0.7981,
"step": 300
},
{
"epoch": 0.31218014329580346,
"grad_norm": 1.4731358289718628,
"learning_rate": 6.229508196721311e-07,
"loss": 0.7972,
"step": 305
},
{
"epoch": 0.3172978505629478,
"grad_norm": 1.7494508028030396,
"learning_rate": 6.331967213114754e-07,
"loss": 0.797,
"step": 310
},
{
"epoch": 0.3224155578300921,
"grad_norm": 1.696869134902954,
"learning_rate": 6.434426229508197e-07,
"loss": 0.7972,
"step": 315
},
{
"epoch": 0.32753326509723646,
"grad_norm": 1.5431866645812988,
"learning_rate": 6.536885245901639e-07,
"loss": 0.7919,
"step": 320
},
{
"epoch": 0.33265097236438074,
"grad_norm": 1.6396448612213135,
"learning_rate": 6.639344262295081e-07,
"loss": 0.7986,
"step": 325
},
{
"epoch": 0.33776867963152507,
"grad_norm": 1.7315205335617065,
"learning_rate": 6.741803278688525e-07,
"loss": 0.7966,
"step": 330
},
{
"epoch": 0.3428863868986694,
"grad_norm": 1.6142867803573608,
"learning_rate": 6.844262295081967e-07,
"loss": 0.7964,
"step": 335
},
{
"epoch": 0.34800409416581374,
"grad_norm": 1.332783818244934,
"learning_rate": 6.94672131147541e-07,
"loss": 0.7969,
"step": 340
},
{
"epoch": 0.353121801432958,
"grad_norm": 1.434688687324524,
"learning_rate": 7.049180327868852e-07,
"loss": 0.8015,
"step": 345
},
{
"epoch": 0.35823950870010235,
"grad_norm": 1.7243021726608276,
"learning_rate": 7.151639344262295e-07,
"loss": 0.791,
"step": 350
},
{
"epoch": 0.3633572159672467,
"grad_norm": 1.603244662284851,
"learning_rate": 7.254098360655737e-07,
"loss": 0.7926,
"step": 355
},
{
"epoch": 0.368474923234391,
"grad_norm": 1.645308256149292,
"learning_rate": 7.356557377049179e-07,
"loss": 0.7988,
"step": 360
},
{
"epoch": 0.3735926305015353,
"grad_norm": 1.3321951627731323,
"learning_rate": 7.459016393442623e-07,
"loss": 0.7923,
"step": 365
},
{
"epoch": 0.37871033776867963,
"grad_norm": 2.1083521842956543,
"learning_rate": 7.561475409836066e-07,
"loss": 0.7935,
"step": 370
},
{
"epoch": 0.38382804503582396,
"grad_norm": 1.3414019346237183,
"learning_rate": 7.663934426229508e-07,
"loss": 0.7894,
"step": 375
},
{
"epoch": 0.3889457523029683,
"grad_norm": 1.8279671669006348,
"learning_rate": 7.766393442622951e-07,
"loss": 0.7916,
"step": 380
},
{
"epoch": 0.3940634595701126,
"grad_norm": 1.6233114004135132,
"learning_rate": 7.868852459016393e-07,
"loss": 0.7886,
"step": 385
},
{
"epoch": 0.3991811668372569,
"grad_norm": 1.4336532354354858,
"learning_rate": 7.971311475409835e-07,
"loss": 0.7884,
"step": 390
},
{
"epoch": 0.40429887410440124,
"grad_norm": 1.597020149230957,
"learning_rate": 8.073770491803278e-07,
"loss": 0.7904,
"step": 395
},
{
"epoch": 0.4094165813715456,
"grad_norm": 1.3191157579421997,
"learning_rate": 8.176229508196721e-07,
"loss": 0.787,
"step": 400
},
{
"epoch": 0.41453428863868985,
"grad_norm": 1.6425617933273315,
"learning_rate": 8.278688524590164e-07,
"loss": 0.7887,
"step": 405
},
{
"epoch": 0.4196519959058342,
"grad_norm": 1.3924281597137451,
"learning_rate": 8.381147540983607e-07,
"loss": 0.7976,
"step": 410
},
{
"epoch": 0.4247697031729785,
"grad_norm": 1.2975757122039795,
"learning_rate": 8.483606557377049e-07,
"loss": 0.7895,
"step": 415
},
{
"epoch": 0.42988741044012285,
"grad_norm": 1.3045737743377686,
"learning_rate": 8.586065573770491e-07,
"loss": 0.7894,
"step": 420
},
{
"epoch": 0.43500511770726713,
"grad_norm": 1.9618183374404907,
"learning_rate": 8.688524590163933e-07,
"loss": 0.7865,
"step": 425
},
{
"epoch": 0.44012282497441146,
"grad_norm": 1.3976588249206543,
"learning_rate": 8.790983606557376e-07,
"loss": 0.7896,
"step": 430
},
{
"epoch": 0.4452405322415558,
"grad_norm": 1.1260899305343628,
"learning_rate": 8.89344262295082e-07,
"loss": 0.7861,
"step": 435
},
{
"epoch": 0.4503582395087001,
"grad_norm": 1.293816089630127,
"learning_rate": 8.995901639344262e-07,
"loss": 0.7826,
"step": 440
},
{
"epoch": 0.4554759467758444,
"grad_norm": 1.4861347675323486,
"learning_rate": 9.098360655737705e-07,
"loss": 0.7822,
"step": 445
},
{
"epoch": 0.46059365404298874,
"grad_norm": 1.378319501876831,
"learning_rate": 9.200819672131147e-07,
"loss": 0.778,
"step": 450
},
{
"epoch": 0.4657113613101331,
"grad_norm": 1.2947815656661987,
"learning_rate": 9.303278688524589e-07,
"loss": 0.7853,
"step": 455
},
{
"epoch": 0.47082906857727735,
"grad_norm": 0.9865773916244507,
"learning_rate": 9.405737704918032e-07,
"loss": 0.7797,
"step": 460
},
{
"epoch": 0.4759467758444217,
"grad_norm": 1.4883133172988892,
"learning_rate": 9.508196721311474e-07,
"loss": 0.7804,
"step": 465
},
{
"epoch": 0.481064483111566,
"grad_norm": 1.1394942998886108,
"learning_rate": 9.610655737704918e-07,
"loss": 0.7818,
"step": 470
},
{
"epoch": 0.48618219037871035,
"grad_norm": 1.104995846748352,
"learning_rate": 9.71311475409836e-07,
"loss": 0.7775,
"step": 475
},
{
"epoch": 0.49129989764585463,
"grad_norm": 1.258623719215393,
"learning_rate": 9.815573770491803e-07,
"loss": 0.7731,
"step": 480
},
{
"epoch": 0.49641760491299897,
"grad_norm": 1.4409220218658447,
"learning_rate": 9.918032786885245e-07,
"loss": 0.7811,
"step": 485
},
{
"epoch": 0.5015353121801432,
"grad_norm": 0.9952474236488342,
"learning_rate": 9.999994895105863e-07,
"loss": 0.7821,
"step": 490
},
{
"epoch": 0.5066530194472876,
"grad_norm": 1.2250083684921265,
"learning_rate": 9.99981622490561e-07,
"loss": 0.7822,
"step": 495
},
{
"epoch": 0.5117707267144319,
"grad_norm": 1.1539254188537598,
"learning_rate": 9.999382320422427e-07,
"loss": 0.776,
"step": 500
},
{
"epoch": 0.5117707267144319,
"eval_accuracy": 0.59523,
"eval_loss": 0.6936843991279602,
"eval_macro_f1": 0.5690192634397302,
"eval_precision": 0.6518208624514151,
"eval_recall": 0.6078906162164894,
"eval_runtime": 73.7478,
"eval_samples_per_second": 1355.972,
"eval_steps_per_second": 1.329,
"step": 500
},
{
"epoch": 0.5168884339815762,
"grad_norm": 1.2244267463684082,
"learning_rate": 9.998693203806588e-07,
"loss": 0.7771,
"step": 505
},
{
"epoch": 0.5220061412487206,
"grad_norm": 1.1900156736373901,
"learning_rate": 9.997748910236623e-07,
"loss": 0.7815,
"step": 510
},
{
"epoch": 0.5271238485158649,
"grad_norm": 1.2272601127624512,
"learning_rate": 9.996549487917522e-07,
"loss": 0.7829,
"step": 515
},
{
"epoch": 0.5322415557830092,
"grad_norm": 1.160675287246704,
"learning_rate": 9.995094998078276e-07,
"loss": 0.7785,
"step": 520
},
{
"epoch": 0.5373592630501536,
"grad_norm": 1.2759345769882202,
"learning_rate": 9.993385514968745e-07,
"loss": 0.7755,
"step": 525
},
{
"epoch": 0.5424769703172978,
"grad_norm": 1.0531632900238037,
"learning_rate": 9.99142112585588e-07,
"loss": 0.7781,
"step": 530
},
{
"epoch": 0.5475946775844421,
"grad_norm": 1.0040606260299683,
"learning_rate": 9.989201931019251e-07,
"loss": 0.7744,
"step": 535
},
{
"epoch": 0.5527123848515865,
"grad_norm": 1.2468197345733643,
"learning_rate": 9.98672804374595e-07,
"loss": 0.7712,
"step": 540
},
{
"epoch": 0.5578300921187308,
"grad_norm": 1.1564112901687622,
"learning_rate": 9.983999590324778e-07,
"loss": 0.7797,
"step": 545
},
{
"epoch": 0.5629477993858751,
"grad_norm": 0.8854450583457947,
"learning_rate": 9.981016710039832e-07,
"loss": 0.7723,
"step": 550
},
{
"epoch": 0.5680655066530195,
"grad_norm": 1.142919659614563,
"learning_rate": 9.977779555163369e-07,
"loss": 0.7739,
"step": 555
},
{
"epoch": 0.5731832139201638,
"grad_norm": 1.058153748512268,
"learning_rate": 9.974288290948042e-07,
"loss": 0.774,
"step": 560
},
{
"epoch": 0.5783009211873081,
"grad_norm": 1.1157392263412476,
"learning_rate": 9.970543095618468e-07,
"loss": 0.7742,
"step": 565
},
{
"epoch": 0.5834186284544524,
"grad_norm": 1.0850578546524048,
"learning_rate": 9.96654416036212e-07,
"loss": 0.7734,
"step": 570
},
{
"epoch": 0.5885363357215967,
"grad_norm": 0.9722121953964233,
"learning_rate": 9.96229168931958e-07,
"loss": 0.77,
"step": 575
},
{
"epoch": 0.593654042988741,
"grad_norm": 1.332795262336731,
"learning_rate": 9.957785899574102e-07,
"loss": 0.7725,
"step": 580
},
{
"epoch": 0.5987717502558854,
"grad_norm": 0.8639675378799438,
"learning_rate": 9.953027021140543e-07,
"loss": 0.7646,
"step": 585
},
{
"epoch": 0.6038894575230297,
"grad_norm": 0.9253244400024414,
"learning_rate": 9.948015296953623e-07,
"loss": 0.7743,
"step": 590
},
{
"epoch": 0.609007164790174,
"grad_norm": 0.8843643069267273,
"learning_rate": 9.942750982855503e-07,
"loss": 0.7717,
"step": 595
},
{
"epoch": 0.6141248720573184,
"grad_norm": 1.046048879623413,
"learning_rate": 9.937234347582753e-07,
"loss": 0.7721,
"step": 600
},
{
"epoch": 0.6192425793244627,
"grad_norm": 0.8906111717224121,
"learning_rate": 9.931465672752613e-07,
"loss": 0.7657,
"step": 605
},
{
"epoch": 0.6243602865916069,
"grad_norm": 0.9637787342071533,
"learning_rate": 9.925445252848621e-07,
"loss": 0.7666,
"step": 610
},
{
"epoch": 0.6294779938587513,
"grad_norm": 0.9004104733467102,
"learning_rate": 9.919173395205584e-07,
"loss": 0.7664,
"step": 615
},
{
"epoch": 0.6345957011258956,
"grad_norm": 1.4724570512771606,
"learning_rate": 9.912650419993893e-07,
"loss": 0.7679,
"step": 620
},
{
"epoch": 0.6397134083930399,
"grad_norm": 0.8644343614578247,
"learning_rate": 9.905876660203161e-07,
"loss": 0.7671,
"step": 625
},
{
"epoch": 0.6448311156601843,
"grad_norm": 0.8368955254554749,
"learning_rate": 9.898852461625245e-07,
"loss": 0.7717,
"step": 630
},
{
"epoch": 0.6499488229273286,
"grad_norm": 0.9413282871246338,
"learning_rate": 9.891578182836583e-07,
"loss": 0.7693,
"step": 635
},
{
"epoch": 0.6550665301944729,
"grad_norm": 0.9777762293815613,
"learning_rate": 9.884054195179886e-07,
"loss": 0.7656,
"step": 640
},
{
"epoch": 0.6601842374616171,
"grad_norm": 0.8983454704284668,
"learning_rate": 9.876280882745193e-07,
"loss": 0.7605,
"step": 645
},
{
"epoch": 0.6653019447287615,
"grad_norm": 0.8708799481391907,
"learning_rate": 9.868258642350254e-07,
"loss": 0.7673,
"step": 650
},
{
"epoch": 0.6704196519959058,
"grad_norm": 0.8354130387306213,
"learning_rate": 9.859987883520275e-07,
"loss": 0.767,
"step": 655
},
{
"epoch": 0.6755373592630501,
"grad_norm": 0.868485152721405,
"learning_rate": 9.851469028467015e-07,
"loss": 0.7647,
"step": 660
},
{
"epoch": 0.6806550665301945,
"grad_norm": 0.9445936679840088,
"learning_rate": 9.84270251206723e-07,
"loss": 0.7605,
"step": 665
},
{
"epoch": 0.6857727737973388,
"grad_norm": 0.7952156662940979,
"learning_rate": 9.833688781840475e-07,
"loss": 0.7664,
"step": 670
},
{
"epoch": 0.6908904810644831,
"grad_norm": 1.1992422342300415,
"learning_rate": 9.824428297926254e-07,
"loss": 0.7617,
"step": 675
},
{
"epoch": 0.6960081883316275,
"grad_norm": 0.8914986252784729,
"learning_rate": 9.81492153306054e-07,
"loss": 0.764,
"step": 680
},
{
"epoch": 0.7011258955987717,
"grad_norm": 0.7945632338523865,
"learning_rate": 9.80516897255163e-07,
"loss": 0.7617,
"step": 685
},
{
"epoch": 0.706243602865916,
"grad_norm": 0.7822641134262085,
"learning_rate": 9.795171114255384e-07,
"loss": 0.7613,
"step": 690
},
{
"epoch": 0.7113613101330604,
"grad_norm": 0.7989721298217773,
"learning_rate": 9.784928468549793e-07,
"loss": 0.7615,
"step": 695
},
{
"epoch": 0.7164790174002047,
"grad_norm": 0.7325178980827332,
"learning_rate": 9.77444155830895e-07,
"loss": 0.7572,
"step": 700
},
{
"epoch": 0.721596724667349,
"grad_norm": 0.8934036493301392,
"learning_rate": 9.763710918876329e-07,
"loss": 0.7589,
"step": 705
},
{
"epoch": 0.7267144319344934,
"grad_norm": 0.7769590616226196,
"learning_rate": 9.752737098037477e-07,
"loss": 0.7573,
"step": 710
},
{
"epoch": 0.7318321392016377,
"grad_norm": 1.0458475351333618,
"learning_rate": 9.741520655992047e-07,
"loss": 0.759,
"step": 715
},
{
"epoch": 0.736949846468782,
"grad_norm": 0.649872899055481,
"learning_rate": 9.730062165325185e-07,
"loss": 0.7607,
"step": 720
},
{
"epoch": 0.7420675537359263,
"grad_norm": 0.7517932057380676,
"learning_rate": 9.718362210978329e-07,
"loss": 0.7567,
"step": 725
},
{
"epoch": 0.7471852610030706,
"grad_norm": 0.9947759509086609,
"learning_rate": 9.706421390219315e-07,
"loss": 0.7593,
"step": 730
},
{
"epoch": 0.7523029682702149,
"grad_norm": 0.719109833240509,
"learning_rate": 9.694240312611917e-07,
"loss": 0.7615,
"step": 735
},
{
"epoch": 0.7574206755373593,
"grad_norm": 1.0175235271453857,
"learning_rate": 9.681819599984712e-07,
"loss": 0.7555,
"step": 740
},
{
"epoch": 0.7625383828045036,
"grad_norm": 0.8200032711029053,
"learning_rate": 9.66915988639934e-07,
"loss": 0.7565,
"step": 745
},
{
"epoch": 0.7676560900716479,
"grad_norm": 0.926680326461792,
"learning_rate": 9.656261818118139e-07,
"loss": 0.7628,
"step": 750
},
{
"epoch": 0.7727737973387923,
"grad_norm": 0.6904947757720947,
"learning_rate": 9.64312605357115e-07,
"loss": 0.7584,
"step": 755
},
{
"epoch": 0.7778915046059366,
"grad_norm": 0.7391018867492676,
"learning_rate": 9.62975326332251e-07,
"loss": 0.7582,
"step": 760
},
{
"epoch": 0.7830092118730808,
"grad_norm": 0.7193120121955872,
"learning_rate": 9.616144130036214e-07,
"loss": 0.7557,
"step": 765
},
{
"epoch": 0.7881269191402251,
"grad_norm": 0.8275336623191833,
"learning_rate": 9.602299348441277e-07,
"loss": 0.7575,
"step": 770
},
{
"epoch": 0.7932446264073695,
"grad_norm": 0.9943181276321411,
"learning_rate": 9.58821962529625e-07,
"loss": 0.7568,
"step": 775
},
{
"epoch": 0.7983623336745138,
"grad_norm": 0.7646188139915466,
"learning_rate": 9.573905679353166e-07,
"loss": 0.752,
"step": 780
},
{
"epoch": 0.8034800409416581,
"grad_norm": 0.7356329560279846,
"learning_rate": 9.55935824132082e-07,
"loss": 0.7552,
"step": 785
},
{
"epoch": 0.8085977482088025,
"grad_norm": 0.795838475227356,
"learning_rate": 9.544578053827495e-07,
"loss": 0.7543,
"step": 790
},
{
"epoch": 0.8137154554759468,
"grad_norm": 0.9953216314315796,
"learning_rate": 9.529565871383034e-07,
"loss": 0.7558,
"step": 795
},
{
"epoch": 0.8188331627430911,
"grad_norm": 0.797937273979187,
"learning_rate": 9.514322460340329e-07,
"loss": 0.7542,
"step": 800
},
{
"epoch": 0.8239508700102354,
"grad_norm": 0.7371375560760498,
"learning_rate": 9.498848598856198e-07,
"loss": 0.7532,
"step": 805
},
{
"epoch": 0.8290685772773797,
"grad_norm": 0.8336758613586426,
"learning_rate": 9.48314507685166e-07,
"loss": 0.756,
"step": 810
},
{
"epoch": 0.834186284544524,
"grad_norm": 0.7204869389533997,
"learning_rate": 9.467212695971619e-07,
"loss": 0.7564,
"step": 815
},
{
"epoch": 0.8393039918116684,
"grad_norm": 0.6758232712745667,
"learning_rate": 9.451052269543929e-07,
"loss": 0.7548,
"step": 820
},
{
"epoch": 0.8444216990788127,
"grad_norm": 0.7348074913024902,
"learning_rate": 9.434664622537883e-07,
"loss": 0.7535,
"step": 825
},
{
"epoch": 0.849539406345957,
"grad_norm": 0.747559130191803,
"learning_rate": 9.418050591522093e-07,
"loss": 0.752,
"step": 830
},
{
"epoch": 0.8546571136131014,
"grad_norm": 0.7392817735671997,
"learning_rate": 9.401211024621792e-07,
"loss": 0.7492,
"step": 835
},
{
"epoch": 0.8597748208802457,
"grad_norm": 0.6318978071212769,
"learning_rate": 9.384146781475533e-07,
"loss": 0.7577,
"step": 840
},
{
"epoch": 0.8648925281473899,
"grad_norm": 0.5832816362380981,
"learning_rate": 9.366858733191307e-07,
"loss": 0.7506,
"step": 845
},
{
"epoch": 0.8700102354145343,
"grad_norm": 0.6932022571563721,
"learning_rate": 9.349347762302071e-07,
"loss": 0.7523,
"step": 850
},
{
"epoch": 0.8751279426816786,
"grad_norm": 0.7047157287597656,
"learning_rate": 9.331614762720703e-07,
"loss": 0.7487,
"step": 855
},
{
"epoch": 0.8802456499488229,
"grad_norm": 0.6591235995292664,
"learning_rate": 9.313660639694358e-07,
"loss": 0.7538,
"step": 860
},
{
"epoch": 0.8853633572159673,
"grad_norm": 0.66665118932724,
"learning_rate": 9.295486309758269e-07,
"loss": 0.7518,
"step": 865
},
{
"epoch": 0.8904810644831116,
"grad_norm": 0.6165961027145386,
"learning_rate": 9.277092700688951e-07,
"loss": 0.7495,
"step": 870
},
{
"epoch": 0.8955987717502559,
"grad_norm": 0.7449588179588318,
"learning_rate": 9.258480751456838e-07,
"loss": 0.7515,
"step": 875
},
{
"epoch": 0.9007164790174002,
"grad_norm": 0.7553215622901917,
"learning_rate": 9.239651412178357e-07,
"loss": 0.7534,
"step": 880
},
{
"epoch": 0.9058341862845445,
"grad_norm": 0.747010350227356,
"learning_rate": 9.220605644067419e-07,
"loss": 0.7548,
"step": 885
},
{
"epoch": 0.9109518935516888,
"grad_norm": 0.7272236347198486,
"learning_rate": 9.20134441938635e-07,
"loss": 0.7531,
"step": 890
},
{
"epoch": 0.9160696008188332,
"grad_norm": 0.8726323246955872,
"learning_rate": 9.181868721396266e-07,
"loss": 0.7479,
"step": 895
},
{
"epoch": 0.9211873080859775,
"grad_norm": 0.7914009094238281,
"learning_rate": 9.16217954430687e-07,
"loss": 0.7522,
"step": 900
},
{
"epoch": 0.9263050153531218,
"grad_norm": 0.6367310285568237,
"learning_rate": 9.142277893225708e-07,
"loss": 0.7497,
"step": 905
},
{
"epoch": 0.9314227226202662,
"grad_norm": 0.8285405039787292,
"learning_rate": 9.122164784106842e-07,
"loss": 0.753,
"step": 910
},
{
"epoch": 0.9365404298874105,
"grad_norm": 0.7742036581039429,
"learning_rate": 9.101841243699015e-07,
"loss": 0.7534,
"step": 915
},
{
"epoch": 0.9416581371545547,
"grad_norm": 0.7512480020523071,
"learning_rate": 9.081308309493209e-07,
"loss": 0.747,
"step": 920
},
{
"epoch": 0.946775844421699,
"grad_norm": 0.5556691288948059,
"learning_rate": 9.060567029669699e-07,
"loss": 0.7465,
"step": 925
},
{
"epoch": 0.9518935516888434,
"grad_norm": 1.0232101678848267,
"learning_rate": 9.039618463044536e-07,
"loss": 0.7485,
"step": 930
},
{
"epoch": 0.9570112589559877,
"grad_norm": 0.8321600556373596,
"learning_rate": 9.018463679015505e-07,
"loss": 0.7488,
"step": 935
},
{
"epoch": 0.962128966223132,
"grad_norm": 0.7009038329124451,
"learning_rate": 8.997103757507521e-07,
"loss": 0.7483,
"step": 940
},
{
"epoch": 0.9672466734902764,
"grad_norm": 0.6939564347267151,
"learning_rate": 8.975539788917514e-07,
"loss": 0.7485,
"step": 945
},
{
"epoch": 0.9723643807574207,
"grad_norm": 0.7738851308822632,
"learning_rate": 8.953772874058757e-07,
"loss": 0.7479,
"step": 950
},
{
"epoch": 0.977482088024565,
"grad_norm": 0.5913597941398621,
"learning_rate": 8.931804124104672e-07,
"loss": 0.7473,
"step": 955
},
{
"epoch": 0.9825997952917093,
"grad_norm": 0.8486027717590332,
"learning_rate": 8.909634660532106e-07,
"loss": 0.7479,
"step": 960
},
{
"epoch": 0.9877175025588536,
"grad_norm": 0.6463382840156555,
"learning_rate": 8.887265615064083e-07,
"loss": 0.7486,
"step": 965
},
{
"epoch": 0.9928352098259979,
"grad_norm": 0.6264991164207458,
"learning_rate": 8.864698129612031e-07,
"loss": 0.7467,
"step": 970
},
{
"epoch": 0.9979529170931423,
"grad_norm": 0.7566510438919067,
"learning_rate": 8.841933356217488e-07,
"loss": 0.7463,
"step": 975
},
{
"epoch": 1.0030706243602865,
"grad_norm": 0.7290503978729248,
"learning_rate": 8.818972456993288e-07,
"loss": 0.7504,
"step": 980
},
{
"epoch": 1.008188331627431,
"grad_norm": 0.8277891874313354,
"learning_rate": 8.795816604064241e-07,
"loss": 0.7472,
"step": 985
},
{
"epoch": 1.0133060388945752,
"grad_norm": 0.6427952647209167,
"learning_rate": 8.772466979507302e-07,
"loss": 0.7487,
"step": 990
},
{
"epoch": 1.0184237461617196,
"grad_norm": 0.6775041818618774,
"learning_rate": 8.748924775291216e-07,
"loss": 0.745,
"step": 995
},
{
"epoch": 1.0235414534288638,
"grad_norm": 0.6815404891967773,
"learning_rate": 8.725191193215675e-07,
"loss": 0.7485,
"step": 1000
},
{
"epoch": 1.0235414534288638,
"eval_accuracy": 0.67557,
"eval_loss": 0.6936712265014648,
"eval_macro_f1": 0.6724504812400831,
"eval_precision": 0.6760463081581009,
"eval_recall": 0.6725003053739838,
"eval_runtime": 73.7408,
"eval_samples_per_second": 1356.102,
"eval_steps_per_second": 1.329,
"step": 1000
},
{
"epoch": 1.0286591606960083,
"grad_norm": 0.8586804866790771,
"learning_rate": 8.701267444849974e-07,
"loss": 0.7457,
"step": 1005
},
{
"epoch": 1.0337768679631525,
"grad_norm": 0.5989358425140381,
"learning_rate": 8.677154751471152e-07,
"loss": 0.7443,
"step": 1010
},
{
"epoch": 1.0388945752302967,
"grad_norm": 0.6888963580131531,
"learning_rate": 8.65285434400165e-07,
"loss": 0.7458,
"step": 1015
},
{
"epoch": 1.0440122824974412,
"grad_norm": 0.6407850384712219,
"learning_rate": 8.628367462946482e-07,
"loss": 0.7493,
"step": 1020
},
{
"epoch": 1.0491299897645854,
"grad_norm": 0.6202091574668884,
"learning_rate": 8.603695358329896e-07,
"loss": 0.7471,
"step": 1025
},
{
"epoch": 1.0542476970317298,
"grad_norm": 0.7456187605857849,
"learning_rate": 8.57883928963157e-07,
"loss": 0.7431,
"step": 1030
},
{
"epoch": 1.059365404298874,
"grad_norm": 0.6171067357063293,
"learning_rate": 8.553800525722317e-07,
"loss": 0.7435,
"step": 1035
},
{
"epoch": 1.0644831115660185,
"grad_norm": 0.8527712821960449,
"learning_rate": 8.528580344799305e-07,
"loss": 0.7453,
"step": 1040
},
{
"epoch": 1.0696008188331627,
"grad_norm": 0.6724162697792053,
"learning_rate": 8.503180034320816e-07,
"loss": 0.7467,
"step": 1045
},
{
"epoch": 1.0747185261003072,
"grad_norm": 0.581979513168335,
"learning_rate": 8.477600890940513e-07,
"loss": 0.7508,
"step": 1050
},
{
"epoch": 1.0798362333674514,
"grad_norm": 0.6551439166069031,
"learning_rate": 8.451844220441253e-07,
"loss": 0.7469,
"step": 1055
},
{
"epoch": 1.0849539406345956,
"grad_norm": 0.6437426209449768,
"learning_rate": 8.42591133766843e-07,
"loss": 0.7468,
"step": 1060
},
{
"epoch": 1.09007164790174,
"grad_norm": 0.5788704752922058,
"learning_rate": 8.39980356646285e-07,
"loss": 0.7424,
"step": 1065
},
{
"epoch": 1.0951893551688843,
"grad_norm": 0.5575606226921082,
"learning_rate": 8.373522239593149e-07,
"loss": 0.7396,
"step": 1070
},
{
"epoch": 1.1003070624360287,
"grad_norm": 0.737180769443512,
"learning_rate": 8.347068698687765e-07,
"loss": 0.744,
"step": 1075
},
{
"epoch": 1.105424769703173,
"grad_norm": 0.592766284942627,
"learning_rate": 8.320444294166439e-07,
"loss": 0.7469,
"step": 1080
},
{
"epoch": 1.1105424769703174,
"grad_norm": 0.63823401927948,
"learning_rate": 8.293650385171287e-07,
"loss": 0.7447,
"step": 1085
},
{
"epoch": 1.1156601842374616,
"grad_norm": 0.6114454865455627,
"learning_rate": 8.266688339497412e-07,
"loss": 0.7475,
"step": 1090
},
{
"epoch": 1.120777891504606,
"grad_norm": 0.53263258934021,
"learning_rate": 8.239559533523082e-07,
"loss": 0.7455,
"step": 1095
},
{
"epoch": 1.1258955987717503,
"grad_norm": 0.7016158699989319,
"learning_rate": 8.212265352139466e-07,
"loss": 0.742,
"step": 1100
},
{
"epoch": 1.1310133060388945,
"grad_norm": 0.6125472784042358,
"learning_rate": 8.184807188679939e-07,
"loss": 0.7383,
"step": 1105
},
{
"epoch": 1.136131013306039,
"grad_norm": 0.6008788347244263,
"learning_rate": 8.157186444848952e-07,
"loss": 0.7435,
"step": 1110
},
{
"epoch": 1.1412487205731832,
"grad_norm": 0.6357280015945435,
"learning_rate": 8.129404530650479e-07,
"loss": 0.7443,
"step": 1115
},
{
"epoch": 1.1463664278403276,
"grad_norm": 0.6422165036201477,
"learning_rate": 8.101462864316038e-07,
"loss": 0.7449,
"step": 1120
},
{
"epoch": 1.1514841351074718,
"grad_norm": 0.6852079629898071,
"learning_rate": 8.07336287223229e-07,
"loss": 0.7428,
"step": 1125
},
{
"epoch": 1.156601842374616,
"grad_norm": 0.5539452433586121,
"learning_rate": 8.045105988868224e-07,
"loss": 0.7455,
"step": 1130
},
{
"epoch": 1.1617195496417605,
"grad_norm": 0.5939313173294067,
"learning_rate": 8.016693656701931e-07,
"loss": 0.7376,
"step": 1135
},
{
"epoch": 1.1668372569089047,
"grad_norm": 0.7522106766700745,
"learning_rate": 7.98812732614697e-07,
"loss": 0.7464,
"step": 1140
},
{
"epoch": 1.1719549641760492,
"grad_norm": 0.6572809815406799,
"learning_rate": 7.959408455478313e-07,
"loss": 0.7448,
"step": 1145
},
{
"epoch": 1.1770726714431934,
"grad_norm": 0.5842403173446655,
"learning_rate": 7.93053851075792e-07,
"loss": 0.7396,
"step": 1150
},
{
"epoch": 1.1821903787103378,
"grad_norm": 0.5845000147819519,
"learning_rate": 7.901518965759888e-07,
"loss": 0.7438,
"step": 1155
},
{
"epoch": 1.187308085977482,
"grad_norm": 0.5873178839683533,
"learning_rate": 7.872351301895217e-07,
"loss": 0.7421,
"step": 1160
},
{
"epoch": 1.1924257932446265,
"grad_norm": 0.6385728120803833,
"learning_rate": 7.843037008136189e-07,
"loss": 0.7431,
"step": 1165
},
{
"epoch": 1.1975435005117707,
"grad_norm": 0.5818535685539246,
"learning_rate": 7.813577580940356e-07,
"loss": 0.7416,
"step": 1170
},
{
"epoch": 1.202661207778915,
"grad_norm": 0.5611526370048523,
"learning_rate": 7.783974524174149e-07,
"loss": 0.743,
"step": 1175
},
{
"epoch": 1.2077789150460594,
"grad_norm": 0.6002296805381775,
"learning_rate": 7.754229349036102e-07,
"loss": 0.7407,
"step": 1180
},
{
"epoch": 1.2128966223132036,
"grad_norm": 0.6006008982658386,
"learning_rate": 7.724343573979718e-07,
"loss": 0.7437,
"step": 1185
},
{
"epoch": 1.218014329580348,
"grad_norm": 0.6336845755577087,
"learning_rate": 7.694318724635945e-07,
"loss": 0.7405,
"step": 1190
},
{
"epoch": 1.2231320368474923,
"grad_norm": 0.6916839480400085,
"learning_rate": 7.664156333735293e-07,
"loss": 0.7468,
"step": 1195
},
{
"epoch": 1.2282497441146367,
"grad_norm": 0.5944891571998596,
"learning_rate": 7.633857941029602e-07,
"loss": 0.7485,
"step": 1200
},
{
"epoch": 1.233367451381781,
"grad_norm": 0.5755409598350525,
"learning_rate": 7.603425093213429e-07,
"loss": 0.7418,
"step": 1205
},
{
"epoch": 1.2384851586489254,
"grad_norm": 0.6128578186035156,
"learning_rate": 7.572859343845092e-07,
"loss": 0.7396,
"step": 1210
},
{
"epoch": 1.2436028659160696,
"grad_norm": 0.6123960614204407,
"learning_rate": 7.542162253267363e-07,
"loss": 0.7363,
"step": 1215
},
{
"epoch": 1.2487205731832138,
"grad_norm": 0.6969608664512634,
"learning_rate": 7.511335388527822e-07,
"loss": 0.7406,
"step": 1220
},
{
"epoch": 1.2538382804503583,
"grad_norm": 0.6491796970367432,
"learning_rate": 7.480380323298851e-07,
"loss": 0.7429,
"step": 1225
},
{
"epoch": 1.2589559877175025,
"grad_norm": 0.5883914828300476,
"learning_rate": 7.449298637797309e-07,
"loss": 0.7375,
"step": 1230
},
{
"epoch": 1.264073694984647,
"grad_norm": 0.6160842776298523,
"learning_rate": 7.418091918703854e-07,
"loss": 0.7393,
"step": 1235
},
{
"epoch": 1.2691914022517912,
"grad_norm": 0.5568389892578125,
"learning_rate": 7.386761759081954e-07,
"loss": 0.7387,
"step": 1240
},
{
"epoch": 1.2743091095189354,
"grad_norm": 0.532599151134491,
"learning_rate": 7.35530975829656e-07,
"loss": 0.741,
"step": 1245
},
{
"epoch": 1.2794268167860798,
"grad_norm": 0.5400995016098022,
"learning_rate": 7.323737521932457e-07,
"loss": 0.7367,
"step": 1250
},
{
"epoch": 1.2845445240532243,
"grad_norm": 0.5307775735855103,
"learning_rate": 7.292046661712307e-07,
"loss": 0.7399,
"step": 1255
},
{
"epoch": 1.2896622313203685,
"grad_norm": 0.5908007621765137,
"learning_rate": 7.260238795414366e-07,
"loss": 0.74,
"step": 1260
},
{
"epoch": 1.2947799385875127,
"grad_norm": 0.5410370826721191,
"learning_rate": 7.228315546789907e-07,
"loss": 0.7388,
"step": 1265
},
{
"epoch": 1.2998976458546572,
"grad_norm": 0.5406989455223083,
"learning_rate": 7.19627854548032e-07,
"loss": 0.7337,
"step": 1270
},
{
"epoch": 1.3050153531218014,
"grad_norm": 0.589767575263977,
"learning_rate": 7.164129426933927e-07,
"loss": 0.7426,
"step": 1275
},
{
"epoch": 1.3101330603889458,
"grad_norm": 0.5926154255867004,
"learning_rate": 7.131869832322496e-07,
"loss": 0.7374,
"step": 1280
},
{
"epoch": 1.31525076765609,
"grad_norm": 0.7507414817810059,
"learning_rate": 7.099501408457452e-07,
"loss": 0.7375,
"step": 1285
},
{
"epoch": 1.3203684749232343,
"grad_norm": 0.6162967681884766,
"learning_rate": 7.06702580770582e-07,
"loss": 0.7381,
"step": 1290
},
{
"epoch": 1.3254861821903787,
"grad_norm": 0.5118803977966309,
"learning_rate": 7.034444687905868e-07,
"loss": 0.7344,
"step": 1295
},
{
"epoch": 1.330603889457523,
"grad_norm": 0.5982370972633362,
"learning_rate": 7.001759712282478e-07,
"loss": 0.7382,
"step": 1300
},
{
"epoch": 1.3357215967246674,
"grad_norm": 0.6339845657348633,
"learning_rate": 6.968972549362238e-07,
"loss": 0.7386,
"step": 1305
},
{
"epoch": 1.3408393039918116,
"grad_norm": 0.5755071043968201,
"learning_rate": 6.936084872888271e-07,
"loss": 0.7349,
"step": 1310
},
{
"epoch": 1.345957011258956,
"grad_norm": 0.6089357137680054,
"learning_rate": 6.90309836173479e-07,
"loss": 0.7377,
"step": 1315
},
{
"epoch": 1.3510747185261003,
"grad_norm": 0.6137183308601379,
"learning_rate": 6.87001469982139e-07,
"loss": 0.7417,
"step": 1320
},
{
"epoch": 1.3561924257932447,
"grad_norm": 0.6864479184150696,
"learning_rate": 6.836835576027093e-07,
"loss": 0.7321,
"step": 1325
},
{
"epoch": 1.361310133060389,
"grad_norm": 0.5657494068145752,
"learning_rate": 6.803562684104125e-07,
"loss": 0.7411,
"step": 1330
},
{
"epoch": 1.3664278403275332,
"grad_norm": 0.6047109365463257,
"learning_rate": 6.770197722591456e-07,
"loss": 0.7399,
"step": 1335
},
{
"epoch": 1.3715455475946776,
"grad_norm": 0.5772355198860168,
"learning_rate": 6.736742394728097e-07,
"loss": 0.7374,
"step": 1340
},
{
"epoch": 1.3766632548618218,
"grad_norm": 0.7158586382865906,
"learning_rate": 6.703198408366142e-07,
"loss": 0.739,
"step": 1345
},
{
"epoch": 1.3817809621289663,
"grad_norm": 0.5718494057655334,
"learning_rate": 6.669567475883592e-07,
"loss": 0.7435,
"step": 1350
},
{
"epoch": 1.3868986693961105,
"grad_norm": 0.6494776606559753,
"learning_rate": 6.635851314096935e-07,
"loss": 0.7358,
"step": 1355
},
{
"epoch": 1.3920163766632547,
"grad_norm": 0.5958154201507568,
"learning_rate": 6.602051644173509e-07,
"loss": 0.7375,
"step": 1360
},
{
"epoch": 1.3971340839303992,
"grad_norm": 0.5509739518165588,
"learning_rate": 6.568170191543634e-07,
"loss": 0.7412,
"step": 1365
},
{
"epoch": 1.4022517911975436,
"grad_norm": 0.5368937253952026,
"learning_rate": 6.534208685812536e-07,
"loss": 0.7393,
"step": 1370
},
{
"epoch": 1.4073694984646878,
"grad_norm": 0.5369133353233337,
"learning_rate": 6.500168860672047e-07,
"loss": 0.7398,
"step": 1375
},
{
"epoch": 1.412487205731832,
"grad_norm": 0.5789251327514648,
"learning_rate": 6.466052453812111e-07,
"loss": 0.7371,
"step": 1380
},
{
"epoch": 1.4176049129989765,
"grad_norm": 0.5568552017211914,
"learning_rate": 6.431861206832069e-07,
"loss": 0.7363,
"step": 1385
},
{
"epoch": 1.4227226202661207,
"grad_norm": 0.5325226783752441,
"learning_rate": 6.397596865151752e-07,
"loss": 0.7348,
"step": 1390
},
{
"epoch": 1.4278403275332652,
"grad_norm": 0.5849957466125488,
"learning_rate": 6.363261177922388e-07,
"loss": 0.7363,
"step": 1395
},
{
"epoch": 1.4329580348004094,
"grad_norm": 0.6208518743515015,
"learning_rate": 6.328855897937303e-07,
"loss": 0.7365,
"step": 1400
},
{
"epoch": 1.4380757420675536,
"grad_norm": 0.5599240064620972,
"learning_rate": 6.294382781542445e-07,
"loss": 0.7371,
"step": 1405
},
{
"epoch": 1.443193449334698,
"grad_norm": 0.5623425841331482,
"learning_rate": 6.25984358854672e-07,
"loss": 0.74,
"step": 1410
},
{
"epoch": 1.4483111566018425,
"grad_norm": 0.6866716146469116,
"learning_rate": 6.225240082132172e-07,
"loss": 0.7383,
"step": 1415
},
{
"epoch": 1.4534288638689867,
"grad_norm": 0.5852178931236267,
"learning_rate": 6.190574028763952e-07,
"loss": 0.7381,
"step": 1420
},
{
"epoch": 1.458546571136131,
"grad_norm": 0.5319634079933167,
"learning_rate": 6.15584719810016e-07,
"loss": 0.7349,
"step": 1425
},
{
"epoch": 1.4636642784032754,
"grad_norm": 0.5798255205154419,
"learning_rate": 6.121061362901498e-07,
"loss": 0.7331,
"step": 1430
},
{
"epoch": 1.4687819856704196,
"grad_norm": 0.4803605079650879,
"learning_rate": 6.086218298940778e-07,
"loss": 0.7356,
"step": 1435
},
{
"epoch": 1.473899692937564,
"grad_norm": 0.7146285772323608,
"learning_rate": 6.051319784912261e-07,
"loss": 0.7384,
"step": 1440
},
{
"epoch": 1.4790174002047083,
"grad_norm": 0.47007301449775696,
"learning_rate": 6.016367602340868e-07,
"loss": 0.7332,
"step": 1445
},
{
"epoch": 1.4841351074718525,
"grad_norm": 0.6568506956100464,
"learning_rate": 5.981363535491233e-07,
"loss": 0.7378,
"step": 1450
},
{
"epoch": 1.489252814738997,
"grad_norm": 0.5178249478340149,
"learning_rate": 5.946309371276614e-07,
"loss": 0.7338,
"step": 1455
},
{
"epoch": 1.4943705220061412,
"grad_norm": 0.5785830616950989,
"learning_rate": 5.911206899167676e-07,
"loss": 0.7392,
"step": 1460
},
{
"epoch": 1.4994882292732856,
"grad_norm": 0.5021066665649414,
"learning_rate": 5.87605791110114e-07,
"loss": 0.7342,
"step": 1465
},
{
"epoch": 1.5046059365404298,
"grad_norm": 0.5594333410263062,
"learning_rate": 5.840864201388312e-07,
"loss": 0.7351,
"step": 1470
},
{
"epoch": 1.509723643807574,
"grad_norm": 0.5204704999923706,
"learning_rate": 5.805627566623475e-07,
"loss": 0.7375,
"step": 1475
},
{
"epoch": 1.5148413510747185,
"grad_norm": 0.6187242865562439,
"learning_rate": 5.770349805592185e-07,
"loss": 0.7351,
"step": 1480
},
{
"epoch": 1.519959058341863,
"grad_norm": 0.5294100046157837,
"learning_rate": 5.735032719179443e-07,
"loss": 0.7383,
"step": 1485
},
{
"epoch": 1.5250767656090072,
"grad_norm": 0.5450606942176819,
"learning_rate": 5.699678110277762e-07,
"loss": 0.7365,
"step": 1490
},
{
"epoch": 1.5301944728761514,
"grad_norm": 0.5091442465782166,
"learning_rate": 5.664287783695122e-07,
"loss": 0.7343,
"step": 1495
},
{
"epoch": 1.5353121801432958,
"grad_norm": 0.557119607925415,
"learning_rate": 5.628863546062856e-07,
"loss": 0.7298,
"step": 1500
},
{
"epoch": 1.5353121801432958,
"eval_accuracy": 0.67304,
"eval_loss": 0.6938837766647339,
"eval_macro_f1": 0.6609359830000188,
"eval_precision": 0.685850518502884,
"eval_recall": 0.6657447133221994,
"eval_runtime": 73.8645,
"eval_samples_per_second": 1353.83,
"eval_steps_per_second": 1.327,
"step": 1500
}
],
"logging_steps": 5,
"max_steps": 4885,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0201035364007936e+17,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}