phq8-classifier-koelectra-base / trainer_state.json
lkw2025's picture
Upload 3 files
cc6bce5 verified
Invalid JSON: Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_metric": 0.9310132935790502,
"best_model_checkpoint": "../models/phq_cls\\checkpoint-20500",
"epoch": 4.929577464788732,
"eval_steps": 500,
"global_step": 21000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.5858579277992249,
"learning_rate": 4.997652582159625e-05,
"loss": 0.6356,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 0.6038872599601746,
"learning_rate": 4.995305164319249e-05,
"loss": 0.526,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 3.9445767402648926,
"learning_rate": 4.992957746478874e-05,
"loss": 0.4502,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 0.5415595769882202,
"learning_rate": 4.990610328638498e-05,
"loss": 0.4029,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 0.5211673378944397,
"learning_rate": 4.988262910798122e-05,
"loss": 0.3697,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 0.3754492402076721,
"learning_rate": 4.9859154929577466e-05,
"loss": 0.3559,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 0.48204684257507324,
"learning_rate": 4.9835680751173713e-05,
"loss": 0.3499,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 0.3834236264228821,
"learning_rate": 4.9812206572769954e-05,
"loss": 0.3487,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 0.40969759225845337,
"learning_rate": 4.97887323943662e-05,
"loss": 0.3514,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 0.3595544695854187,
"learning_rate": 4.976525821596245e-05,
"loss": 0.3505,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 0.41510289907455444,
"learning_rate": 4.974178403755869e-05,
"loss": 0.3538,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 0.40279024839401245,
"learning_rate": 4.971830985915493e-05,
"loss": 0.3504,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 0.43963101506233215,
"learning_rate": 4.969483568075118e-05,
"loss": 0.3474,
"step": 130
},
{
"epoch": 0.03,
"grad_norm": 0.514215886592865,
"learning_rate": 4.967136150234742e-05,
"loss": 0.3488,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 0.49667927622795105,
"learning_rate": 4.9647887323943665e-05,
"loss": 0.343,
"step": 150
},
{
"epoch": 0.04,
"grad_norm": 0.8667863607406616,
"learning_rate": 4.962441314553991e-05,
"loss": 0.3427,
"step": 160
},
{
"epoch": 0.04,
"grad_norm": 0.5492832660675049,
"learning_rate": 4.960093896713615e-05,
"loss": 0.3432,
"step": 170
},
{
"epoch": 0.04,
"grad_norm": 0.45692935585975647,
"learning_rate": 4.95774647887324e-05,
"loss": 0.3328,
"step": 180
},
{
"epoch": 0.04,
"grad_norm": 0.5771167874336243,
"learning_rate": 4.955399061032864e-05,
"loss": 0.3274,
"step": 190
},
{
"epoch": 0.05,
"grad_norm": 0.586593747138977,
"learning_rate": 4.953051643192488e-05,
"loss": 0.3427,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 0.45685452222824097,
"learning_rate": 4.950704225352113e-05,
"loss": 0.3278,
"step": 210
},
{
"epoch": 0.05,
"grad_norm": 0.40281644463539124,
"learning_rate": 4.9483568075117376e-05,
"loss": 0.3412,
"step": 220
},
{
"epoch": 0.05,
"grad_norm": 0.6066292524337769,
"learning_rate": 4.946009389671362e-05,
"loss": 0.3256,
"step": 230
},
{
"epoch": 0.06,
"grad_norm": 1.0048617124557495,
"learning_rate": 4.9436619718309864e-05,
"loss": 0.3296,
"step": 240
},
{
"epoch": 0.06,
"grad_norm": 0.4742206633090973,
"learning_rate": 4.941314553990611e-05,
"loss": 0.3181,
"step": 250
},
{
"epoch": 0.06,
"grad_norm": 0.44790124893188477,
"learning_rate": 4.938967136150235e-05,
"loss": 0.3116,
"step": 260
},
{
"epoch": 0.06,
"grad_norm": 0.3388730585575104,
"learning_rate": 4.936619718309859e-05,
"loss": 0.3311,
"step": 270
},
{
"epoch": 0.07,
"grad_norm": 1.0921857357025146,
"learning_rate": 4.934272300469484e-05,
"loss": 0.3167,
"step": 280
},
{
"epoch": 0.07,
"grad_norm": 0.29578328132629395,
"learning_rate": 4.931924882629108e-05,
"loss": 0.3104,
"step": 290
},
{
"epoch": 0.07,
"grad_norm": 0.5786669254302979,
"learning_rate": 4.929577464788733e-05,
"loss": 0.3032,
"step": 300
},
{
"epoch": 0.07,
"grad_norm": 0.3513028919696808,
"learning_rate": 4.927230046948357e-05,
"loss": 0.3187,
"step": 310
},
{
"epoch": 0.08,
"grad_norm": 0.6156922578811646,
"learning_rate": 4.9248826291079816e-05,
"loss": 0.3145,
"step": 320
},
{
"epoch": 0.08,
"grad_norm": 0.43208229541778564,
"learning_rate": 4.9225352112676056e-05,
"loss": 0.3178,
"step": 330
},
{
"epoch": 0.08,
"grad_norm": 0.3528173863887787,
"learning_rate": 4.92018779342723e-05,
"loss": 0.3074,
"step": 340
},
{
"epoch": 0.08,
"grad_norm": 0.6173387765884399,
"learning_rate": 4.9178403755868544e-05,
"loss": 0.3121,
"step": 350
},
{
"epoch": 0.08,
"grad_norm": 1.8469752073287964,
"learning_rate": 4.915492957746479e-05,
"loss": 0.3007,
"step": 360
},
{
"epoch": 0.09,
"grad_norm": 0.5218201875686646,
"learning_rate": 4.913145539906103e-05,
"loss": 0.3022,
"step": 370
},
{
"epoch": 0.09,
"grad_norm": 0.46562883257865906,
"learning_rate": 4.910798122065728e-05,
"loss": 0.3074,
"step": 380
},
{
"epoch": 0.09,
"grad_norm": 0.3317212760448456,
"learning_rate": 4.908450704225353e-05,
"loss": 0.2938,
"step": 390
},
{
"epoch": 0.09,
"grad_norm": 0.5221473574638367,
"learning_rate": 4.906103286384977e-05,
"loss": 0.2981,
"step": 400
},
{
"epoch": 0.1,
"grad_norm": 0.4833567142486572,
"learning_rate": 4.903755868544601e-05,
"loss": 0.2911,
"step": 410
},
{
"epoch": 0.1,
"grad_norm": 0.3665942847728729,
"learning_rate": 4.9014084507042255e-05,
"loss": 0.2836,
"step": 420
},
{
"epoch": 0.1,
"grad_norm": 0.9034627079963684,
"learning_rate": 4.8990610328638496e-05,
"loss": 0.2948,
"step": 430
},
{
"epoch": 0.1,
"grad_norm": 0.40714317560195923,
"learning_rate": 4.896713615023474e-05,
"loss": 0.3087,
"step": 440
},
{
"epoch": 0.11,
"grad_norm": 0.4227840304374695,
"learning_rate": 4.894366197183099e-05,
"loss": 0.3115,
"step": 450
},
{
"epoch": 0.11,
"grad_norm": 0.471699982881546,
"learning_rate": 4.892018779342723e-05,
"loss": 0.2815,
"step": 460
},
{
"epoch": 0.11,
"grad_norm": 0.5239991545677185,
"learning_rate": 4.889671361502348e-05,
"loss": 0.2906,
"step": 470
},
{
"epoch": 0.11,
"grad_norm": 0.5088714361190796,
"learning_rate": 4.887323943661972e-05,
"loss": 0.2837,
"step": 480
},
{
"epoch": 0.12,
"grad_norm": 0.568097710609436,
"learning_rate": 4.884976525821596e-05,
"loss": 0.2848,
"step": 490
},
{
"epoch": 0.12,
"grad_norm": 0.5387840867042542,
"learning_rate": 4.882629107981221e-05,
"loss": 0.2885,
"step": 500
},
{
"epoch": 0.12,
"eval_loss": 0.27685049176216125,
"eval_macro/f1": 0.09752839011356046,
"eval_macro/precision": 0.10432297249017507,
"eval_macro/recall": 0.09156475384132957,
"eval_micro/f1": 0.21273753870438952,
"eval_micro/precision": 0.9389067524115756,
"eval_micro/recall": 0.11995891817870592,
"eval_runtime": 27.609,
"eval_samples/accuracy": 0.11995891817870592,
"eval_samples_per_second": 528.994,
"eval_steps_per_second": 16.553,
"step": 500
},
{
"epoch": 0.12,
"grad_norm": 1.117136001586914,
"learning_rate": 4.8802816901408454e-05,
"loss": 0.3016,
"step": 510
},
{
"epoch": 0.12,
"grad_norm": 0.5608183145523071,
"learning_rate": 4.8779342723004695e-05,
"loss": 0.2758,
"step": 520
},
{
"epoch": 0.12,
"grad_norm": 1.1747539043426514,
"learning_rate": 4.875586854460094e-05,
"loss": 0.2871,
"step": 530
},
{
"epoch": 0.13,
"grad_norm": 1.1754446029663086,
"learning_rate": 4.873239436619719e-05,
"loss": 0.27,
"step": 540
},
{
"epoch": 0.13,
"grad_norm": 0.45414578914642334,
"learning_rate": 4.870892018779343e-05,
"loss": 0.2702,
"step": 550
},
{
"epoch": 0.13,
"grad_norm": 0.651540219783783,
"learning_rate": 4.868544600938967e-05,
"loss": 0.265,
"step": 560
},
{
"epoch": 0.13,
"grad_norm": 0.6937834024429321,
"learning_rate": 4.866197183098592e-05,
"loss": 0.278,
"step": 570
},
{
"epoch": 0.14,
"grad_norm": 1.1861662864685059,
"learning_rate": 4.863849765258216e-05,
"loss": 0.26,
"step": 580
},
{
"epoch": 0.14,
"grad_norm": 0.8335360884666443,
"learning_rate": 4.8615023474178406e-05,
"loss": 0.2595,
"step": 590
},
{
"epoch": 0.14,
"grad_norm": 0.48560401797294617,
"learning_rate": 4.8591549295774653e-05,
"loss": 0.2769,
"step": 600
},
{
"epoch": 0.14,
"grad_norm": 1.0077592134475708,
"learning_rate": 4.8568075117370894e-05,
"loss": 0.2473,
"step": 610
},
{
"epoch": 0.15,
"grad_norm": 0.5798254013061523,
"learning_rate": 4.854460093896714e-05,
"loss": 0.2486,
"step": 620
},
{
"epoch": 0.15,
"grad_norm": 1.9621915817260742,
"learning_rate": 4.852112676056338e-05,
"loss": 0.264,
"step": 630
},
{
"epoch": 0.15,
"grad_norm": 0.764593780040741,
"learning_rate": 4.849765258215962e-05,
"loss": 0.2519,
"step": 640
},
{
"epoch": 0.15,
"grad_norm": 0.6093132495880127,
"learning_rate": 4.847417840375587e-05,
"loss": 0.2366,
"step": 650
},
{
"epoch": 0.15,
"grad_norm": 5.682877063751221,
"learning_rate": 4.845774647887324e-05,
"loss": 0.2781,
"step": 660
},
{
"epoch": 0.16,
"grad_norm": 1.4239857196807861,
"learning_rate": 4.843427230046948e-05,
"loss": 0.2548,
"step": 670
},
{
"epoch": 0.16,
"grad_norm": 1.2523099184036255,
"learning_rate": 4.841079812206573e-05,
"loss": 0.24,
"step": 680
},
{
"epoch": 0.16,
"grad_norm": 1.2140272855758667,
"learning_rate": 4.838732394366197e-05,
"loss": 0.2195,
"step": 690
},
{
"epoch": 0.16,
"grad_norm": 0.9842355847358704,
"learning_rate": 4.836384976525822e-05,
"loss": 0.2268,
"step": 700
},
{
"epoch": 0.17,
"grad_norm": 16.670461654663086,
"learning_rate": 4.8340375586854466e-05,
"loss": 0.2344,
"step": 710
},
{
"epoch": 0.17,
"grad_norm": 1.6551809310913086,
"learning_rate": 4.8316901408450706e-05,
"loss": 0.2304,
"step": 720
},
{
"epoch": 0.17,
"grad_norm": 1.0985769033432007,
"learning_rate": 4.8293427230046953e-05,
"loss": 0.2318,
"step": 730
},
{
"epoch": 0.17,
"grad_norm": 2.296762228012085,
"learning_rate": 4.8269953051643194e-05,
"loss": 0.233,
"step": 740
},
{
"epoch": 0.18,
"grad_norm": 1.6605547666549683,
"learning_rate": 4.8246478873239435e-05,
"loss": 0.2134,
"step": 750
},
{
"epoch": 0.18,
"grad_norm": 1.3795205354690552,
"learning_rate": 4.822300469483568e-05,
"loss": 0.2099,
"step": 760
},
{
"epoch": 0.18,
"grad_norm": 2.0488667488098145,
"learning_rate": 4.819953051643193e-05,
"loss": 0.2432,
"step": 770
},
{
"epoch": 0.18,
"grad_norm": 2.1046578884124756,
"learning_rate": 4.817605633802817e-05,
"loss": 0.2393,
"step": 780
},
{
"epoch": 0.19,
"grad_norm": 1.8910871744155884,
"learning_rate": 4.815258215962442e-05,
"loss": 0.2256,
"step": 790
},
{
"epoch": 0.19,
"grad_norm": 2.1071548461914062,
"learning_rate": 4.8129107981220665e-05,
"loss": 0.231,
"step": 800
},
{
"epoch": 0.19,
"grad_norm": 0.6893961429595947,
"learning_rate": 4.8105633802816905e-05,
"loss": 0.2251,
"step": 810
},
{
"epoch": 0.19,
"grad_norm": 0.9333511590957642,
"learning_rate": 4.8082159624413146e-05,
"loss": 0.2202,
"step": 820
},
{
"epoch": 0.19,
"grad_norm": 1.001318335533142,
"learning_rate": 4.805868544600939e-05,
"loss": 0.2055,
"step": 830
},
{
"epoch": 0.2,
"grad_norm": 1.0097473859786987,
"learning_rate": 4.8035211267605634e-05,
"loss": 0.2062,
"step": 840
},
{
"epoch": 0.2,
"grad_norm": 1.3152905702590942,
"learning_rate": 4.801173708920188e-05,
"loss": 0.193,
"step": 850
},
{
"epoch": 0.2,
"grad_norm": 2.033956527709961,
"learning_rate": 4.798826291079813e-05,
"loss": 0.2181,
"step": 860
},
{
"epoch": 0.2,
"grad_norm": 0.7312398552894592,
"learning_rate": 4.796478873239437e-05,
"loss": 0.1852,
"step": 870
},
{
"epoch": 0.21,
"grad_norm": 1.898751139640808,
"learning_rate": 4.794131455399061e-05,
"loss": 0.2077,
"step": 880
},
{
"epoch": 0.21,
"grad_norm": 0.6988235712051392,
"learning_rate": 4.791784037558686e-05,
"loss": 0.2234,
"step": 890
},
{
"epoch": 0.21,
"grad_norm": 0.8618746399879456,
"learning_rate": 4.78943661971831e-05,
"loss": 0.1829,
"step": 900
},
{
"epoch": 0.21,
"grad_norm": 2.307588577270508,
"learning_rate": 4.7870892018779345e-05,
"loss": 0.1791,
"step": 910
},
{
"epoch": 0.22,
"grad_norm": 0.8917971849441528,
"learning_rate": 4.784741784037559e-05,
"loss": 0.1817,
"step": 920
},
{
"epoch": 0.22,
"grad_norm": 1.6609673500061035,
"learning_rate": 4.782394366197183e-05,
"loss": 0.2039,
"step": 930
},
{
"epoch": 0.22,
"grad_norm": 1.1030490398406982,
"learning_rate": 4.780046948356808e-05,
"loss": 0.1924,
"step": 940
},
{
"epoch": 0.22,
"grad_norm": 0.8905339241027832,
"learning_rate": 4.777699530516432e-05,
"loss": 0.1902,
"step": 950
},
{
"epoch": 0.23,
"grad_norm": 1.4277052879333496,
"learning_rate": 4.775352112676056e-05,
"loss": 0.1888,
"step": 960
},
{
"epoch": 0.23,
"grad_norm": 1.2068595886230469,
"learning_rate": 4.773004694835681e-05,
"loss": 0.1963,
"step": 970
},
{
"epoch": 0.23,
"grad_norm": 2.084404706954956,
"learning_rate": 4.7706572769953056e-05,
"loss": 0.18,
"step": 980
},
{
"epoch": 0.23,
"grad_norm": 0.7552419900894165,
"learning_rate": 4.7683098591549296e-05,
"loss": 0.1897,
"step": 990
},
{
"epoch": 0.23,
"grad_norm": 1.4683459997177124,
"learning_rate": 4.7659624413145544e-05,
"loss": 0.19,
"step": 1000
},
{
"epoch": 0.23,
"eval_loss": 0.17012138664722443,
"eval_macro/f1": 0.6974816804549718,
"eval_macro/precision": 0.814657194389621,
"eval_macro/recall": 0.6377254831311044,
"eval_micro/f1": 0.7211732077048745,
"eval_micro/precision": 0.8100887674974394,
"eval_micro/recall": 0.6498459431701472,
"eval_runtime": 27.2589,
"eval_samples/accuracy": 0.6475179732968162,
"eval_samples_per_second": 535.787,
"eval_steps_per_second": 16.765,
"step": 1000
},
{
"epoch": 0.24,
"grad_norm": 0.9634861350059509,
"learning_rate": 4.763615023474179e-05,
"loss": 0.1842,
"step": 1010
},
{
"epoch": 0.24,
"grad_norm": 2.3621294498443604,
"learning_rate": 4.761267605633803e-05,
"loss": 0.1755,
"step": 1020
},
{
"epoch": 0.24,
"grad_norm": 1.732932686805725,
"learning_rate": 4.758920187793427e-05,
"loss": 0.1896,
"step": 1030
},
{
"epoch": 0.24,
"grad_norm": 3.365919589996338,
"learning_rate": 4.756572769953052e-05,
"loss": 0.1627,
"step": 1040
},
{
"epoch": 0.25,
"grad_norm": 0.9022215604782104,
"learning_rate": 4.754225352112676e-05,
"loss": 0.177,
"step": 1050
},
{
"epoch": 0.25,
"grad_norm": 1.9240831136703491,
"learning_rate": 4.751877934272301e-05,
"loss": 0.1495,
"step": 1060
},
{
"epoch": 0.25,
"grad_norm": 1.7557075023651123,
"learning_rate": 4.7495305164319255e-05,
"loss": 0.1884,
"step": 1070
},
{
"epoch": 0.25,
"grad_norm": 1.368794322013855,
"learning_rate": 4.7471830985915495e-05,
"loss": 0.1852,
"step": 1080
},
{
"epoch": 0.26,
"grad_norm": 3.1905553340911865,
"learning_rate": 4.744835680751174e-05,
"loss": 0.1558,
"step": 1090
},
{
"epoch": 0.26,
"grad_norm": 1.3366222381591797,
"learning_rate": 4.742488262910798e-05,
"loss": 0.1658,
"step": 1100
},
{
"epoch": 0.26,
"grad_norm": 0.9878789782524109,
"learning_rate": 4.7401408450704224e-05,
"loss": 0.1988,
"step": 1110
},
{
"epoch": 0.26,
"grad_norm": 1.590110182762146,
"learning_rate": 4.737793427230047e-05,
"loss": 0.1652,
"step": 1120
},
{
"epoch": 0.27,
"grad_norm": 1.6078647375106812,
"learning_rate": 4.735446009389671e-05,
"loss": 0.1874,
"step": 1130
},
{
"epoch": 0.27,
"grad_norm": 1.7055974006652832,
"learning_rate": 4.733098591549296e-05,
"loss": 0.1679,
"step": 1140
},
{
"epoch": 0.27,
"grad_norm": 2.226369619369507,
"learning_rate": 4.7307511737089206e-05,
"loss": 0.1648,
"step": 1150
},
{
"epoch": 0.27,
"grad_norm": 0.7244293689727783,
"learning_rate": 4.728403755868545e-05,
"loss": 0.1594,
"step": 1160
},
{
"epoch": 0.27,
"grad_norm": 2.184124231338501,
"learning_rate": 4.7260563380281694e-05,
"loss": 0.1648,
"step": 1170
},
{
"epoch": 0.28,
"grad_norm": 1.080694556236267,
"learning_rate": 4.7237089201877935e-05,
"loss": 0.1849,
"step": 1180
},
{
"epoch": 0.28,
"grad_norm": 2.224308490753174,
"learning_rate": 4.7213615023474176e-05,
"loss": 0.1697,
"step": 1190
},
{
"epoch": 0.28,
"grad_norm": 2.618312120437622,
"learning_rate": 4.719014084507042e-05,
"loss": 0.1416,
"step": 1200
},
{
"epoch": 0.28,
"grad_norm": 1.3569130897521973,
"learning_rate": 4.716666666666667e-05,
"loss": 0.1557,
"step": 1210
},
{
"epoch": 0.29,
"grad_norm": 2.247316598892212,
"learning_rate": 4.714319248826291e-05,
"loss": 0.1846,
"step": 1220
},
{
"epoch": 0.29,
"grad_norm": 1.6453408002853394,
"learning_rate": 4.711971830985916e-05,
"loss": 0.1483,
"step": 1230
},
{
"epoch": 0.29,
"grad_norm": 1.5334419012069702,
"learning_rate": 4.7096244131455405e-05,
"loss": 0.1663,
"step": 1240
},
{
"epoch": 0.29,
"grad_norm": 1.6418590545654297,
"learning_rate": 4.7072769953051646e-05,
"loss": 0.147,
"step": 1250
},
{
"epoch": 0.3,
"grad_norm": 2.0844509601593018,
"learning_rate": 4.704929577464789e-05,
"loss": 0.1543,
"step": 1260
},
{
"epoch": 0.3,
"grad_norm": 1.806553602218628,
"learning_rate": 4.7025821596244134e-05,
"loss": 0.1799,
"step": 1270
},
{
"epoch": 0.3,
"grad_norm": 1.5278924703598022,
"learning_rate": 4.7002347417840375e-05,
"loss": 0.1527,
"step": 1280
},
{
"epoch": 0.3,
"grad_norm": 0.797590434551239,
"learning_rate": 4.697887323943662e-05,
"loss": 0.1468,
"step": 1290
},
{
"epoch": 0.31,
"grad_norm": 0.9068496823310852,
"learning_rate": 4.695539906103287e-05,
"loss": 0.1606,
"step": 1300
},
{
"epoch": 0.31,
"grad_norm": 2.6918179988861084,
"learning_rate": 4.693192488262911e-05,
"loss": 0.1602,
"step": 1310
},
{
"epoch": 0.31,
"grad_norm": 2.412083864212036,
"learning_rate": 4.690845070422536e-05,
"loss": 0.1583,
"step": 1320
},
{
"epoch": 0.31,
"grad_norm": 2.339053153991699,
"learning_rate": 4.68849765258216e-05,
"loss": 0.1834,
"step": 1330
},
{
"epoch": 0.31,
"grad_norm": 4.236501693725586,
"learning_rate": 4.686150234741784e-05,
"loss": 0.1337,
"step": 1340
},
{
"epoch": 0.32,
"grad_norm": 4.246243000030518,
"learning_rate": 4.6838028169014086e-05,
"loss": 0.156,
"step": 1350
},
{
"epoch": 0.32,
"grad_norm": 2.2402584552764893,
"learning_rate": 4.681455399061033e-05,
"loss": 0.1529,
"step": 1360
},
{
"epoch": 0.32,
"grad_norm": 4.787749767303467,
"learning_rate": 4.6791079812206574e-05,
"loss": 0.1547,
"step": 1370
},
{
"epoch": 0.32,
"grad_norm": 0.6329225897789001,
"learning_rate": 4.676760563380282e-05,
"loss": 0.1333,
"step": 1380
},
{
"epoch": 0.33,
"grad_norm": 1.7299346923828125,
"learning_rate": 4.674413145539907e-05,
"loss": 0.1383,
"step": 1390
},
{
"epoch": 0.33,
"grad_norm": 1.7260222434997559,
"learning_rate": 4.672065727699531e-05,
"loss": 0.1457,
"step": 1400
},
{
"epoch": 0.33,
"grad_norm": 2.683088779449463,
"learning_rate": 4.669718309859155e-05,
"loss": 0.1243,
"step": 1410
},
{
"epoch": 0.33,
"grad_norm": 2.2119216918945312,
"learning_rate": 4.66737089201878e-05,
"loss": 0.1469,
"step": 1420
},
{
"epoch": 0.34,
"grad_norm": 1.2248650789260864,
"learning_rate": 4.665023474178404e-05,
"loss": 0.1628,
"step": 1430
},
{
"epoch": 0.34,
"grad_norm": 1.9862676858901978,
"learning_rate": 4.6626760563380285e-05,
"loss": 0.1234,
"step": 1440
},
{
"epoch": 0.34,
"grad_norm": 3.197272539138794,
"learning_rate": 4.660328638497653e-05,
"loss": 0.1371,
"step": 1450
},
{
"epoch": 0.34,
"grad_norm": 2.4832003116607666,
"learning_rate": 4.657981220657277e-05,
"loss": 0.1424,
"step": 1460
},
{
"epoch": 0.35,
"grad_norm": 1.848587989807129,
"learning_rate": 4.655633802816901e-05,
"loss": 0.1151,
"step": 1470
},
{
"epoch": 0.35,
"grad_norm": 2.1081154346466064,
"learning_rate": 4.653286384976526e-05,
"loss": 0.1193,
"step": 1480
},
{
"epoch": 0.35,
"grad_norm": 3.711557388305664,
"learning_rate": 4.65093896713615e-05,
"loss": 0.1575,
"step": 1490
},
{
"epoch": 0.35,
"grad_norm": 2.7196974754333496,
"learning_rate": 4.648591549295775e-05,
"loss": 0.1425,
"step": 1500
},
{
"epoch": 0.35,
"eval_loss": 0.13336510956287384,
"eval_macro/f1": 0.77789474745571,
"eval_macro/precision": 0.8167214646472566,
"eval_macro/recall": 0.7518631015976748,
"eval_micro/f1": 0.7850268139361438,
"eval_micro/precision": 0.8155253837072018,
"eval_micro/recall": 0.7567271482369051,
"eval_runtime": 27.3098,
"eval_samples/accuracy": 0.7492639507018144,
"eval_samples_per_second": 534.79,
"eval_steps_per_second": 16.734,
"step": 1500
},
{
"epoch": 0.35,
"grad_norm": 1.3112411499023438,
"learning_rate": 4.6462441314553996e-05,
"loss": 0.1383,
"step": 1510
},
{
"epoch": 0.36,
"grad_norm": 1.371739387512207,
"learning_rate": 4.6438967136150236e-05,
"loss": 0.169,
"step": 1520
},
{
"epoch": 0.36,
"grad_norm": 2.0370471477508545,
"learning_rate": 4.6415492957746484e-05,
"loss": 0.1304,
"step": 1530
},
{
"epoch": 0.36,
"grad_norm": 2.2070887088775635,
"learning_rate": 4.6392018779342724e-05,
"loss": 0.156,
"step": 1540
},
{
"epoch": 0.36,
"grad_norm": 10.297237396240234,
"learning_rate": 4.6368544600938965e-05,
"loss": 0.1527,
"step": 1550
},
{
"epoch": 0.37,
"grad_norm": 1.114290475845337,
"learning_rate": 4.634507042253521e-05,
"loss": 0.1351,
"step": 1560
},
{
"epoch": 0.37,
"grad_norm": 1.2772897481918335,
"learning_rate": 4.632159624413146e-05,
"loss": 0.1397,
"step": 1570
},
{
"epoch": 0.37,
"grad_norm": 0.7256558537483215,
"learning_rate": 4.62981220657277e-05,
"loss": 0.1394,
"step": 1580
},
{
"epoch": 0.37,
"grad_norm": 2.603022575378418,
"learning_rate": 4.627464788732395e-05,
"loss": 0.1641,
"step": 1590
},
{
"epoch": 0.38,
"grad_norm": 1.835552453994751,
"learning_rate": 4.6251173708920195e-05,
"loss": 0.1645,
"step": 1600
},
{
"epoch": 0.38,
"grad_norm": 3.006970167160034,
"learning_rate": 4.6227699530516435e-05,
"loss": 0.1458,
"step": 1610
},
{
"epoch": 0.38,
"grad_norm": 2.926814079284668,
"learning_rate": 4.6204225352112676e-05,
"loss": 0.164,
"step": 1620
},
{
"epoch": 0.38,
"grad_norm": 3.104510545730591,
"learning_rate": 4.618075117370892e-05,
"loss": 0.1687,
"step": 1630
},
{
"epoch": 0.38,
"grad_norm": 2.2601518630981445,
"learning_rate": 4.6157276995305164e-05,
"loss": 0.1615,
"step": 1640
},
{
"epoch": 0.39,
"grad_norm": 1.0919325351715088,
"learning_rate": 4.613380281690141e-05,
"loss": 0.1519,
"step": 1650
},
{
"epoch": 0.39,
"grad_norm": 3.507404327392578,
"learning_rate": 4.611032863849766e-05,
"loss": 0.133,
"step": 1660
},
{
"epoch": 0.39,
"grad_norm": 1.8300056457519531,
"learning_rate": 4.60868544600939e-05,
"loss": 0.1772,
"step": 1670
},
{
"epoch": 0.39,
"grad_norm": 1.9435104131698608,
"learning_rate": 4.6063380281690146e-05,
"loss": 0.1298,
"step": 1680
},
{
"epoch": 0.4,
"grad_norm": 2.371901035308838,
"learning_rate": 4.603990610328639e-05,
"loss": 0.1652,
"step": 1690
},
{
"epoch": 0.4,
"grad_norm": 2.4579296112060547,
"learning_rate": 4.601643192488263e-05,
"loss": 0.1144,
"step": 1700
},
{
"epoch": 0.4,
"grad_norm": 2.280348300933838,
"learning_rate": 4.5992957746478875e-05,
"loss": 0.1363,
"step": 1710
},
{
"epoch": 0.4,
"grad_norm": 3.6556637287139893,
"learning_rate": 4.5969483568075115e-05,
"loss": 0.1297,
"step": 1720
},
{
"epoch": 0.41,
"grad_norm": 0.8788136839866638,
"learning_rate": 4.594600938967136e-05,
"loss": 0.134,
"step": 1730
},
{
"epoch": 0.41,
"grad_norm": 0.5975239276885986,
"learning_rate": 4.592253521126761e-05,
"loss": 0.1058,
"step": 1740
},
{
"epoch": 0.41,
"grad_norm": 3.037933588027954,
"learning_rate": 4.589906103286385e-05,
"loss": 0.1139,
"step": 1750
},
{
"epoch": 0.41,
"grad_norm": 0.6861093044281006,
"learning_rate": 4.58755868544601e-05,
"loss": 0.134,
"step": 1760
},
{
"epoch": 0.42,
"grad_norm": 2.547356128692627,
"learning_rate": 4.585211267605634e-05,
"loss": 0.1278,
"step": 1770
},
{
"epoch": 0.42,
"grad_norm": 2.218045473098755,
"learning_rate": 4.582863849765258e-05,
"loss": 0.1167,
"step": 1780
},
{
"epoch": 0.42,
"grad_norm": 1.8614739179611206,
"learning_rate": 4.5805164319248827e-05,
"loss": 0.1134,
"step": 1790
},
{
"epoch": 0.42,
"grad_norm": 2.04622483253479,
"learning_rate": 4.5781690140845074e-05,
"loss": 0.1331,
"step": 1800
},
{
"epoch": 0.42,
"grad_norm": 1.9503240585327148,
"learning_rate": 4.5758215962441315e-05,
"loss": 0.1151,
"step": 1810
},
{
"epoch": 0.43,
"grad_norm": 2.473653554916382,
"learning_rate": 4.573474178403756e-05,
"loss": 0.1188,
"step": 1820
},
{
"epoch": 0.43,
"grad_norm": 4.5746307373046875,
"learning_rate": 4.571126760563381e-05,
"loss": 0.109,
"step": 1830
},
{
"epoch": 0.43,
"grad_norm": 2.2897071838378906,
"learning_rate": 4.568779342723005e-05,
"loss": 0.135,
"step": 1840
},
{
"epoch": 0.43,
"grad_norm": 2.783514976501465,
"learning_rate": 4.566431924882629e-05,
"loss": 0.1216,
"step": 1850
},
{
"epoch": 0.44,
"grad_norm": 0.9944408535957336,
"learning_rate": 4.564084507042254e-05,
"loss": 0.1105,
"step": 1860
},
{
"epoch": 0.44,
"grad_norm": 2.4792580604553223,
"learning_rate": 4.561737089201878e-05,
"loss": 0.1215,
"step": 1870
},
{
"epoch": 0.44,
"grad_norm": 3.559095621109009,
"learning_rate": 4.5593896713615026e-05,
"loss": 0.1097,
"step": 1880
},
{
"epoch": 0.44,
"grad_norm": 2.6624674797058105,
"learning_rate": 4.557042253521127e-05,
"loss": 0.1206,
"step": 1890
},
{
"epoch": 0.45,
"grad_norm": 2.1126134395599365,
"learning_rate": 4.5546948356807514e-05,
"loss": 0.1314,
"step": 1900
},
{
"epoch": 0.45,
"grad_norm": 2.4518086910247803,
"learning_rate": 4.552347417840376e-05,
"loss": 0.1317,
"step": 1910
},
{
"epoch": 0.45,
"grad_norm": 2.835390329360962,
"learning_rate": 4.55e-05,
"loss": 0.1224,
"step": 1920
},
{
"epoch": 0.45,
"grad_norm": 1.0483720302581787,
"learning_rate": 4.547652582159624e-05,
"loss": 0.1148,
"step": 1930
},
{
"epoch": 0.46,
"grad_norm": 1.8186525106430054,
"learning_rate": 4.545305164319249e-05,
"loss": 0.1364,
"step": 1940
},
{
"epoch": 0.46,
"grad_norm": 1.4727530479431152,
"learning_rate": 4.542957746478874e-05,
"loss": 0.1349,
"step": 1950
},
{
"epoch": 0.46,
"grad_norm": 2.574378490447998,
"learning_rate": 4.540610328638498e-05,
"loss": 0.128,
"step": 1960
},
{
"epoch": 0.46,
"grad_norm": 2.9332356452941895,
"learning_rate": 4.5382629107981225e-05,
"loss": 0.1286,
"step": 1970
},
{
"epoch": 0.46,
"grad_norm": 2.3091564178466797,
"learning_rate": 4.535915492957747e-05,
"loss": 0.1274,
"step": 1980
},
{
"epoch": 0.47,
"grad_norm": 1.5543984174728394,
"learning_rate": 4.533568075117371e-05,
"loss": 0.1205,
"step": 1990
},
{
"epoch": 0.47,
"grad_norm": 4.30750846862793,
"learning_rate": 4.531220657276995e-05,
"loss": 0.1399,
"step": 2000
},
{
"epoch": 0.47,
"eval_loss": 0.11542148888111115,
"eval_macro/f1": 0.8150834145504557,
"eval_macro/precision": 0.8413019593078327,
"eval_macro/recall": 0.7991287240846648,
"eval_micro/f1": 0.8217569126437585,
"eval_micro/precision": 0.8394515069275817,
"eval_micro/recall": 0.8047928791509757,
"eval_runtime": 28.9918,
"eval_samples/accuracy": 0.796644984594317,
"eval_samples_per_second": 503.764,
"eval_steps_per_second": 15.763,
"step": 2000
},
{
"epoch": 0.47,
"grad_norm": 2.2433559894561768,
"learning_rate": 4.52887323943662e-05,
"loss": 0.1168,
"step": 2010
},
{
"epoch": 0.47,
"grad_norm": 1.0208545923233032,
"learning_rate": 4.526525821596244e-05,
"loss": 0.1003,
"step": 2020
},
{
"epoch": 0.48,
"grad_norm": 0.44397255778312683,
"learning_rate": 4.524178403755869e-05,
"loss": 0.0873,
"step": 2030
},
{
"epoch": 0.48,
"grad_norm": 1.4248991012573242,
"learning_rate": 4.5218309859154936e-05,
"loss": 0.0988,
"step": 2040
},
{
"epoch": 0.48,
"grad_norm": 1.7967787981033325,
"learning_rate": 4.5194835680751176e-05,
"loss": 0.1254,
"step": 2050
},
{
"epoch": 0.48,
"grad_norm": 0.8488617539405823,
"learning_rate": 4.517136150234742e-05,
"loss": 0.1011,
"step": 2060
},
{
"epoch": 0.49,
"grad_norm": 2.6594550609588623,
"learning_rate": 4.5147887323943664e-05,
"loss": 0.121,
"step": 2070
},
{
"epoch": 0.49,
"grad_norm": 2.3737144470214844,
"learning_rate": 4.5124413145539905e-05,
"loss": 0.1021,
"step": 2080
},
{
"epoch": 0.49,
"grad_norm": 1.1709132194519043,
"learning_rate": 4.510093896713615e-05,
"loss": 0.112,
"step": 2090
},
{
"epoch": 0.49,
"grad_norm": 3.5635571479797363,
"learning_rate": 4.50774647887324e-05,
"loss": 0.0889,
"step": 2100
},
{
"epoch": 0.5,
"grad_norm": 1.1667410135269165,
"learning_rate": 4.505399061032864e-05,
"loss": 0.1027,
"step": 2110
},
{
"epoch": 0.5,
"grad_norm": 3.7648849487304688,
"learning_rate": 4.503051643192489e-05,
"loss": 0.1391,
"step": 2120
},
{
"epoch": 0.5,
"grad_norm": 1.023391604423523,
"learning_rate": 4.500704225352113e-05,
"loss": 0.1178,
"step": 2130
},
{
"epoch": 0.5,
"grad_norm": 1.9849759340286255,
"learning_rate": 4.498356807511737e-05,
"loss": 0.1134,
"step": 2140
},
{
"epoch": 0.5,
"grad_norm": 2.2011280059814453,
"learning_rate": 4.4960093896713616e-05,
"loss": 0.1243,
"step": 2150
},
{
"epoch": 0.51,
"grad_norm": 5.394866943359375,
"learning_rate": 4.493661971830986e-05,
"loss": 0.1278,
"step": 2160
},
{
"epoch": 0.51,
"grad_norm": 2.788053512573242,
"learning_rate": 4.4913145539906104e-05,
"loss": 0.1166,
"step": 2170
},
{
"epoch": 0.51,
"grad_norm": 1.9931975603103638,
"learning_rate": 4.488967136150235e-05,
"loss": 0.1044,
"step": 2180
},
{
"epoch": 0.51,
"grad_norm": 2.5912959575653076,
"learning_rate": 4.48661971830986e-05,
"loss": 0.1193,
"step": 2190
},
{
"epoch": 0.52,
"grad_norm": 2.024529218673706,
"learning_rate": 4.484272300469484e-05,
"loss": 0.1286,
"step": 2200
},
{
"epoch": 0.52,
"grad_norm": 1.237317442893982,
"learning_rate": 4.481924882629108e-05,
"loss": 0.1266,
"step": 2210
},
{
"epoch": 0.52,
"grad_norm": 1.3059284687042236,
"learning_rate": 4.479577464788733e-05,
"loss": 0.1151,
"step": 2220
},
{
"epoch": 0.52,
"grad_norm": 2.9779324531555176,
"learning_rate": 4.477230046948357e-05,
"loss": 0.0948,
"step": 2230
},
{
"epoch": 0.53,
"grad_norm": 2.533174991607666,
"learning_rate": 4.4748826291079815e-05,
"loss": 0.1128,
"step": 2240
},
{
"epoch": 0.53,
"grad_norm": 1.6116623878479004,
"learning_rate": 4.472535211267606e-05,
"loss": 0.0999,
"step": 2250
},
{
"epoch": 0.53,
"grad_norm": 2.3070313930511475,
"learning_rate": 4.47018779342723e-05,
"loss": 0.1286,
"step": 2260
},
{
"epoch": 0.53,
"grad_norm": 0.7318587899208069,
"learning_rate": 4.467840375586855e-05,
"loss": 0.0921,
"step": 2270
},
{
"epoch": 0.54,
"grad_norm": 0.8361902832984924,
"learning_rate": 4.465492957746479e-05,
"loss": 0.1057,
"step": 2280
},
{
"epoch": 0.54,
"grad_norm": 1.4417285919189453,
"learning_rate": 4.463145539906103e-05,
"loss": 0.1166,
"step": 2290
},
{
"epoch": 0.54,
"grad_norm": 2.194974184036255,
"learning_rate": 4.460798122065728e-05,
"loss": 0.1176,
"step": 2300
},
{
"epoch": 0.54,
"grad_norm": 3.3667984008789062,
"learning_rate": 4.4584507042253526e-05,
"loss": 0.1341,
"step": 2310
},
{
"epoch": 0.54,
"grad_norm": 1.3247355222702026,
"learning_rate": 4.4561032863849767e-05,
"loss": 0.113,
"step": 2320
},
{
"epoch": 0.55,
"grad_norm": 2.8170759677886963,
"learning_rate": 4.4537558685446014e-05,
"loss": 0.1253,
"step": 2330
},
{
"epoch": 0.55,
"grad_norm": 2.810574769973755,
"learning_rate": 4.4514084507042254e-05,
"loss": 0.1287,
"step": 2340
},
{
"epoch": 0.55,
"grad_norm": 1.5883653163909912,
"learning_rate": 4.44906103286385e-05,
"loss": 0.1312,
"step": 2350
},
{
"epoch": 0.55,
"grad_norm": 1.6999726295471191,
"learning_rate": 4.446713615023474e-05,
"loss": 0.1092,
"step": 2360
},
{
"epoch": 0.56,
"grad_norm": 1.7833150625228882,
"learning_rate": 4.444366197183098e-05,
"loss": 0.1055,
"step": 2370
},
{
"epoch": 0.56,
"grad_norm": 1.1052982807159424,
"learning_rate": 4.442018779342723e-05,
"loss": 0.1007,
"step": 2380
},
{
"epoch": 0.56,
"grad_norm": 1.897437334060669,
"learning_rate": 4.439671361502348e-05,
"loss": 0.1058,
"step": 2390
},
{
"epoch": 0.56,
"grad_norm": 1.2218818664550781,
"learning_rate": 4.437323943661972e-05,
"loss": 0.1223,
"step": 2400
},
{
"epoch": 0.57,
"grad_norm": 2.2371973991394043,
"learning_rate": 4.4349765258215966e-05,
"loss": 0.13,
"step": 2410
},
{
"epoch": 0.57,
"grad_norm": 2.4405248165130615,
"learning_rate": 4.432629107981221e-05,
"loss": 0.1087,
"step": 2420
},
{
"epoch": 0.57,
"grad_norm": 3.625314712524414,
"learning_rate": 4.4302816901408453e-05,
"loss": 0.1129,
"step": 2430
},
{
"epoch": 0.57,
"grad_norm": 0.4910762310028076,
"learning_rate": 4.4279342723004694e-05,
"loss": 0.101,
"step": 2440
},
{
"epoch": 0.58,
"grad_norm": 3.9263997077941895,
"learning_rate": 4.425586854460094e-05,
"loss": 0.1112,
"step": 2450
},
{
"epoch": 0.58,
"grad_norm": 2.6563661098480225,
"learning_rate": 4.423239436619718e-05,
"loss": 0.1268,
"step": 2460
},
{
"epoch": 0.58,
"grad_norm": 2.192418098449707,
"learning_rate": 4.420892018779343e-05,
"loss": 0.1131,
"step": 2470
},
{
"epoch": 0.58,
"grad_norm": 2.195281505584717,
"learning_rate": 4.418544600938968e-05,
"loss": 0.1094,
"step": 2480
},
{
"epoch": 0.58,
"grad_norm": 2.4269959926605225,
"learning_rate": 4.416197183098592e-05,
"loss": 0.101,
"step": 2490
},
{
"epoch": 0.59,
"grad_norm": 1.625380516052246,
"learning_rate": 4.4138497652582165e-05,
"loss": 0.0976,
"step": 2500
},
{
"epoch": 0.59,
"eval_loss": 0.10971714556217194,
"eval_macro/f1": 0.8284099206756298,
"eval_macro/precision": 0.8512250265581908,
"eval_macro/recall": 0.8153409474616256,
"eval_micro/f1": 0.8356522648812638,
"eval_micro/precision": 0.8514781125639568,
"eval_micro/recall": 0.8204039712427251,
"eval_runtime": 28.2017,
"eval_samples/accuracy": 0.8112290311537145,
"eval_samples_per_second": 517.876,
"eval_steps_per_second": 16.205,
"step": 2500
},
{
"epoch": 0.59,
"grad_norm": 3.6350128650665283,
"learning_rate": 4.4115023474178405e-05,
"loss": 0.1133,
"step": 2510
},
{
"epoch": 0.59,
"grad_norm": 2.4444243907928467,
"learning_rate": 4.4091549295774646e-05,
"loss": 0.1404,
"step": 2520
},
{
"epoch": 0.59,
"grad_norm": 1.2758675813674927,
"learning_rate": 4.406807511737089e-05,
"loss": 0.1049,
"step": 2530
},
{
"epoch": 0.6,
"grad_norm": 1.4315747022628784,
"learning_rate": 4.404460093896714e-05,
"loss": 0.1005,
"step": 2540
},
{
"epoch": 0.6,
"grad_norm": 2.812558174133301,
"learning_rate": 4.402112676056338e-05,
"loss": 0.1143,
"step": 2550
},
{
"epoch": 0.6,
"grad_norm": 2.0793673992156982,
"learning_rate": 4.399765258215963e-05,
"loss": 0.1275,
"step": 2560
},
{
"epoch": 0.6,
"grad_norm": 1.506557583808899,
"learning_rate": 4.3974178403755876e-05,
"loss": 0.0949,
"step": 2570
},
{
"epoch": 0.61,
"grad_norm": 2.994401216506958,
"learning_rate": 4.395070422535211e-05,
"loss": 0.11,
"step": 2580
},
{
"epoch": 0.61,
"grad_norm": 3.2654285430908203,
"learning_rate": 4.392723004694836e-05,
"loss": 0.1286,
"step": 2590
},
{
"epoch": 0.61,
"grad_norm": 1.5115245580673218,
"learning_rate": 4.3903755868544604e-05,
"loss": 0.0963,
"step": 2600
},
{
"epoch": 0.61,
"grad_norm": 1.503602147102356,
"learning_rate": 4.3880281690140845e-05,
"loss": 0.1148,
"step": 2610
},
{
"epoch": 0.62,
"grad_norm": 2.3735108375549316,
"learning_rate": 4.385680751173709e-05,
"loss": 0.1222,
"step": 2620
},
{
"epoch": 0.62,
"grad_norm": 2.604314088821411,
"learning_rate": 4.383333333333334e-05,
"loss": 0.104,
"step": 2630
},
{
"epoch": 0.62,
"grad_norm": 2.7355499267578125,
"learning_rate": 4.380985915492958e-05,
"loss": 0.1145,
"step": 2640
},
{
"epoch": 0.62,
"grad_norm": 3.0043768882751465,
"learning_rate": 4.378638497652582e-05,
"loss": 0.1341,
"step": 2650
},
{
"epoch": 0.62,
"grad_norm": 2.7208704948425293,
"learning_rate": 4.376291079812207e-05,
"loss": 0.1132,
"step": 2660
},
{
"epoch": 0.63,
"grad_norm": 1.2890592813491821,
"learning_rate": 4.373943661971831e-05,
"loss": 0.1036,
"step": 2670
},
{
"epoch": 0.63,
"grad_norm": 1.7100173234939575,
"learning_rate": 4.3715962441314556e-05,
"loss": 0.1012,
"step": 2680
},
{
"epoch": 0.63,
"grad_norm": 3.7833025455474854,
"learning_rate": 4.36924882629108e-05,
"loss": 0.0968,
"step": 2690
},
{
"epoch": 0.63,
"grad_norm": 1.8149163722991943,
"learning_rate": 4.3669014084507044e-05,
"loss": 0.111,
"step": 2700
},
{
"epoch": 0.64,
"grad_norm": 1.475219488143921,
"learning_rate": 4.364553990610329e-05,
"loss": 0.0996,
"step": 2710
},
{
"epoch": 0.64,
"grad_norm": 2.110396146774292,
"learning_rate": 4.362206572769953e-05,
"loss": 0.0994,
"step": 2720
},
{
"epoch": 0.64,
"grad_norm": 1.823807716369629,
"learning_rate": 4.359859154929577e-05,
"loss": 0.1095,
"step": 2730
},
{
"epoch": 0.64,
"grad_norm": 1.6784656047821045,
"learning_rate": 4.357511737089202e-05,
"loss": 0.1321,
"step": 2740
},
{
"epoch": 0.65,
"grad_norm": 1.6397862434387207,
"learning_rate": 4.355164319248827e-05,
"loss": 0.1109,
"step": 2750
},
{
"epoch": 0.65,
"grad_norm": 2.6760945320129395,
"learning_rate": 4.352816901408451e-05,
"loss": 0.1076,
"step": 2760
},
{
"epoch": 0.65,
"grad_norm": 2.7687478065490723,
"learning_rate": 4.3504694835680755e-05,
"loss": 0.1094,
"step": 2770
},
{
"epoch": 0.65,
"grad_norm": 1.2985103130340576,
"learning_rate": 4.3481220657277e-05,
"loss": 0.0959,
"step": 2780
},
{
"epoch": 0.65,
"grad_norm": 1.3562726974487305,
"learning_rate": 4.345774647887324e-05,
"loss": 0.091,
"step": 2790
},
{
"epoch": 0.66,
"grad_norm": 1.858865737915039,
"learning_rate": 4.343427230046948e-05,
"loss": 0.1,
"step": 2800
},
{
"epoch": 0.66,
"grad_norm": 4.3310136795043945,
"learning_rate": 4.341079812206573e-05,
"loss": 0.0756,
"step": 2810
},
{
"epoch": 0.66,
"grad_norm": 2.1979329586029053,
"learning_rate": 4.338732394366197e-05,
"loss": 0.0742,
"step": 2820
},
{
"epoch": 0.66,
"grad_norm": 1.7807092666625977,
"learning_rate": 4.336384976525822e-05,
"loss": 0.0804,
"step": 2830
},
{
"epoch": 0.67,
"grad_norm": 3.8421008586883545,
"learning_rate": 4.3340375586854466e-05,
"loss": 0.0784,
"step": 2840
},
{
"epoch": 0.67,
"grad_norm": 2.183363199234009,
"learning_rate": 4.3316901408450707e-05,
"loss": 0.1078,
"step": 2850
},
{
"epoch": 0.67,
"grad_norm": 1.3300306797027588,
"learning_rate": 4.3293427230046954e-05,
"loss": 0.1222,
"step": 2860
},
{
"epoch": 0.67,
"grad_norm": 1.2840020656585693,
"learning_rate": 4.3269953051643194e-05,
"loss": 0.1102,
"step": 2870
},
{
"epoch": 0.68,
"grad_norm": 1.7970964908599854,
"learning_rate": 4.3246478873239435e-05,
"loss": 0.0765,
"step": 2880
},
{
"epoch": 0.68,
"grad_norm": 1.5625609159469604,
"learning_rate": 4.322300469483568e-05,
"loss": 0.116,
"step": 2890
},
{
"epoch": 0.68,
"grad_norm": 1.1956239938735962,
"learning_rate": 4.319953051643193e-05,
"loss": 0.1024,
"step": 2900
},
{
"epoch": 0.68,
"grad_norm": 1.1357309818267822,
"learning_rate": 4.317605633802817e-05,
"loss": 0.1128,
"step": 2910
},
{
"epoch": 0.69,
"grad_norm": 1.9150160551071167,
"learning_rate": 4.315258215962442e-05,
"loss": 0.1298,
"step": 2920
},
{
"epoch": 0.69,
"grad_norm": 2.3721535205841064,
"learning_rate": 4.312910798122066e-05,
"loss": 0.0973,
"step": 2930
},
{
"epoch": 0.69,
"grad_norm": 3.3223726749420166,
"learning_rate": 4.3105633802816906e-05,
"loss": 0.1276,
"step": 2940
},
{
"epoch": 0.69,
"grad_norm": 2.278697967529297,
"learning_rate": 4.3082159624413146e-05,
"loss": 0.1016,
"step": 2950
},
{
"epoch": 0.69,
"grad_norm": 2.2410049438476562,
"learning_rate": 4.305868544600939e-05,
"loss": 0.0774,
"step": 2960
},
{
"epoch": 0.7,
"grad_norm": 0.6037675142288208,
"learning_rate": 4.3035211267605634e-05,
"loss": 0.0978,
"step": 2970
},
{
"epoch": 0.7,
"grad_norm": 2.202610969543457,
"learning_rate": 4.301173708920188e-05,
"loss": 0.0845,
"step": 2980
},
{
"epoch": 0.7,
"grad_norm": 1.6229338645935059,
"learning_rate": 4.298826291079812e-05,
"loss": 0.1009,
"step": 2990
},
{
"epoch": 0.7,
"grad_norm": 2.605353355407715,
"learning_rate": 4.296478873239437e-05,
"loss": 0.154,
"step": 3000
},
{
"epoch": 0.7,
"eval_loss": 0.09926234930753708,
"eval_macro/f1": 0.8497144892634887,
"eval_macro/precision": 0.866627156962533,
"eval_macro/recall": 0.8443244774442097,
"eval_micro/f1": 0.8532182916307162,
"eval_micro/precision": 0.8601948503827418,
"eval_micro/recall": 0.8463539883601506,
"eval_runtime": 27.4955,
"eval_samples/accuracy": 0.8319753509072235,
"eval_samples_per_second": 531.178,
"eval_steps_per_second": 16.621,
"step": 3000
},
{
"epoch": 0.71,
"grad_norm": 1.2412047386169434,
"learning_rate": 4.294131455399062e-05,
"loss": 0.09,
"step": 3010
},
{
"epoch": 0.71,
"grad_norm": 2.657113790512085,
"learning_rate": 4.291784037558686e-05,
"loss": 0.0903,
"step": 3020
},
{
"epoch": 0.71,
"grad_norm": 0.5215303897857666,
"learning_rate": 4.28943661971831e-05,
"loss": 0.0713,
"step": 3030
},
{
"epoch": 0.71,
"grad_norm": 4.21558141708374,
"learning_rate": 4.2870892018779345e-05,
"loss": 0.0989,
"step": 3040
},
{
"epoch": 0.72,
"grad_norm": 4.106936454772949,
"learning_rate": 4.2847417840375586e-05,
"loss": 0.0929,
"step": 3050
},
{
"epoch": 0.72,
"grad_norm": 1.9461392164230347,
"learning_rate": 4.282394366197183e-05,
"loss": 0.102,
"step": 3060
},
{
"epoch": 0.72,
"grad_norm": 1.7253563404083252,
"learning_rate": 4.280046948356808e-05,
"loss": 0.1059,
"step": 3070
},
{
"epoch": 0.72,
"grad_norm": 1.55907142162323,
"learning_rate": 4.277699530516432e-05,
"loss": 0.0858,
"step": 3080
},
{
"epoch": 0.73,
"grad_norm": 1.3519967794418335,
"learning_rate": 4.275352112676057e-05,
"loss": 0.1043,
"step": 3090
},
{
"epoch": 0.73,
"grad_norm": 0.8086038827896118,
"learning_rate": 4.273004694835681e-05,
"loss": 0.0993,
"step": 3100
},
{
"epoch": 0.73,
"grad_norm": 1.8035309314727783,
"learning_rate": 4.270657276995305e-05,
"loss": 0.1009,
"step": 3110
},
{
"epoch": 0.73,
"grad_norm": 1.5046508312225342,
"learning_rate": 4.26830985915493e-05,
"loss": 0.0797,
"step": 3120
},
{
"epoch": 0.73,
"grad_norm": 2.27606463432312,
"learning_rate": 4.2659624413145544e-05,
"loss": 0.0974,
"step": 3130
},
{
"epoch": 0.74,
"grad_norm": 0.8333800435066223,
"learning_rate": 4.2636150234741785e-05,
"loss": 0.0811,
"step": 3140
},
{
"epoch": 0.74,
"grad_norm": 1.6786904335021973,
"learning_rate": 4.261267605633803e-05,
"loss": 0.1141,
"step": 3150
},
{
"epoch": 0.74,
"grad_norm": 2.6342034339904785,
"learning_rate": 4.258920187793428e-05,
"loss": 0.0979,
"step": 3160
},
{
"epoch": 0.74,
"grad_norm": 1.9367939233779907,
"learning_rate": 4.256572769953051e-05,
"loss": 0.0953,
"step": 3170
},
{
"epoch": 0.75,
"grad_norm": 1.6788667440414429,
"learning_rate": 4.254225352112676e-05,
"loss": 0.0959,
"step": 3180
},
{
"epoch": 0.75,
"grad_norm": 1.9545440673828125,
"learning_rate": 4.251877934272301e-05,
"loss": 0.1059,
"step": 3190
},
{
"epoch": 0.75,
"grad_norm": 1.865529179573059,
"learning_rate": 4.249530516431925e-05,
"loss": 0.1049,
"step": 3200
},
{
"epoch": 0.75,
"grad_norm": 2.6784844398498535,
"learning_rate": 4.2471830985915496e-05,
"loss": 0.1029,
"step": 3210
},
{
"epoch": 0.76,
"grad_norm": 1.8157294988632202,
"learning_rate": 4.244835680751174e-05,
"loss": 0.1113,
"step": 3220
},
{
"epoch": 0.76,
"grad_norm": 2.7033796310424805,
"learning_rate": 4.2424882629107984e-05,
"loss": 0.1181,
"step": 3230
},
{
"epoch": 0.76,
"grad_norm": 1.7018625736236572,
"learning_rate": 4.2401408450704224e-05,
"loss": 0.0979,
"step": 3240
},
{
"epoch": 0.76,
"grad_norm": 2.8627302646636963,
"learning_rate": 4.237793427230047e-05,
"loss": 0.1359,
"step": 3250
},
{
"epoch": 0.77,
"grad_norm": 3.45668363571167,
"learning_rate": 4.235446009389671e-05,
"loss": 0.0977,
"step": 3260
},
{
"epoch": 0.77,
"grad_norm": 0.8866633772850037,
"learning_rate": 4.233098591549296e-05,
"loss": 0.1024,
"step": 3270
},
{
"epoch": 0.77,
"grad_norm": 2.3429834842681885,
"learning_rate": 4.230751173708921e-05,
"loss": 0.1123,
"step": 3280
},
{
"epoch": 0.77,
"grad_norm": 34.54017639160156,
"learning_rate": 4.228403755868545e-05,
"loss": 0.1015,
"step": 3290
},
{
"epoch": 0.77,
"grad_norm": 1.1823272705078125,
"learning_rate": 4.2260563380281695e-05,
"loss": 0.119,
"step": 3300
},
{
"epoch": 0.78,
"grad_norm": 1.785077452659607,
"learning_rate": 4.2237089201877935e-05,
"loss": 0.1086,
"step": 3310
},
{
"epoch": 0.78,
"grad_norm": 2.6036972999572754,
"learning_rate": 4.2213615023474176e-05,
"loss": 0.1138,
"step": 3320
},
{
"epoch": 0.78,
"grad_norm": 8.906991004943848,
"learning_rate": 4.219014084507042e-05,
"loss": 0.1071,
"step": 3330
},
{
"epoch": 0.78,
"grad_norm": 1.8581056594848633,
"learning_rate": 4.216666666666667e-05,
"loss": 0.0766,
"step": 3340
},
{
"epoch": 0.79,
"grad_norm": 7.854555130004883,
"learning_rate": 4.214319248826291e-05,
"loss": 0.1175,
"step": 3350
},
{
"epoch": 0.79,
"grad_norm": 2.5612292289733887,
"learning_rate": 4.211971830985916e-05,
"loss": 0.0868,
"step": 3360
},
{
"epoch": 0.79,
"grad_norm": 5.368772983551025,
"learning_rate": 4.2096244131455406e-05,
"loss": 0.1131,
"step": 3370
},
{
"epoch": 0.79,
"grad_norm": 1.4319589138031006,
"learning_rate": 4.2072769953051646e-05,
"loss": 0.1003,
"step": 3380
},
{
"epoch": 0.8,
"grad_norm": 2.323556661605835,
"learning_rate": 4.204929577464789e-05,
"loss": 0.1189,
"step": 3390
},
{
"epoch": 0.8,
"grad_norm": 2.330096483230591,
"learning_rate": 4.2025821596244134e-05,
"loss": 0.0943,
"step": 3400
},
{
"epoch": 0.8,
"grad_norm": 2.2431845664978027,
"learning_rate": 4.2002347417840375e-05,
"loss": 0.1233,
"step": 3410
},
{
"epoch": 0.8,
"grad_norm": 1.7425540685653687,
"learning_rate": 4.197887323943662e-05,
"loss": 0.1105,
"step": 3420
},
{
"epoch": 0.81,
"grad_norm": 2.7905280590057373,
"learning_rate": 4.195539906103287e-05,
"loss": 0.1138,
"step": 3430
},
{
"epoch": 0.81,
"grad_norm": 1.4292432069778442,
"learning_rate": 4.193192488262911e-05,
"loss": 0.0902,
"step": 3440
},
{
"epoch": 0.81,
"grad_norm": 2.4348337650299072,
"learning_rate": 4.190845070422536e-05,
"loss": 0.1234,
"step": 3450
},
{
"epoch": 0.81,
"grad_norm": 3.0525081157684326,
"learning_rate": 4.18849765258216e-05,
"loss": 0.0942,
"step": 3460
},
{
"epoch": 0.81,
"grad_norm": 1.9704358577728271,
"learning_rate": 4.186150234741784e-05,
"loss": 0.1039,
"step": 3470
},
{
"epoch": 0.82,
"grad_norm": 1.6399110555648804,
"learning_rate": 4.1838028169014086e-05,
"loss": 0.0806,
"step": 3480
},
{
"epoch": 0.82,
"grad_norm": 1.9704186916351318,
"learning_rate": 4.1814553990610333e-05,
"loss": 0.0744,
"step": 3490
},
{
"epoch": 0.82,
"grad_norm": 2.6479651927948,
"learning_rate": 4.1791079812206574e-05,
"loss": 0.1027,
"step": 3500
},
{
"epoch": 0.82,
"eval_loss": 0.1158306822180748,
"eval_macro/f1": 0.8195684810983835,
"eval_macro/precision": 0.8380379108916444,
"eval_macro/recall": 0.8105275723641988,
"eval_micro/f1": 0.8262169680111265,
"eval_micro/precision": 0.839350052984811,
"eval_micro/recall": 0.8134885313248887,
"eval_runtime": 27.9369,
"eval_samples/accuracy": 0.8013009243409791,
"eval_samples_per_second": 522.784,
"eval_steps_per_second": 16.358,
"step": 3500
},
{
"epoch": 0.82,
"grad_norm": 2.463714122772217,
"learning_rate": 4.176760563380282e-05,
"loss": 0.126,
"step": 3510
},
{
"epoch": 0.83,
"grad_norm": 2.4390833377838135,
"learning_rate": 4.174413145539906e-05,
"loss": 0.1077,
"step": 3520
},
{
"epoch": 0.83,
"grad_norm": 1.8290138244628906,
"learning_rate": 4.172065727699531e-05,
"loss": 0.1226,
"step": 3530
},
{
"epoch": 0.83,
"grad_norm": 7.175014495849609,
"learning_rate": 4.169718309859155e-05,
"loss": 0.1347,
"step": 3540
},
{
"epoch": 0.83,
"grad_norm": 2.000025987625122,
"learning_rate": 4.167370892018779e-05,
"loss": 0.1227,
"step": 3550
},
{
"epoch": 0.84,
"grad_norm": 2.2527687549591064,
"learning_rate": 4.165023474178404e-05,
"loss": 0.0835,
"step": 3560
},
{
"epoch": 0.84,
"grad_norm": 3.775047540664673,
"learning_rate": 4.1626760563380285e-05,
"loss": 0.1268,
"step": 3570
},
{
"epoch": 0.84,
"grad_norm": 1.8067278861999512,
"learning_rate": 4.1603286384976526e-05,
"loss": 0.1167,
"step": 3580
},
{
"epoch": 0.84,
"grad_norm": 7.179357051849365,
"learning_rate": 4.158215962441315e-05,
"loss": 0.1395,
"step": 3590
},
{
"epoch": 0.85,
"grad_norm": 14.411430358886719,
"learning_rate": 4.155868544600939e-05,
"loss": 0.1371,
"step": 3600
},
{
"epoch": 0.85,
"grad_norm": 4.457677841186523,
"learning_rate": 4.153755868544601e-05,
"loss": 0.0889,
"step": 3610
},
{
"epoch": 0.85,
"grad_norm": 3.6479544639587402,
"learning_rate": 4.1514084507042256e-05,
"loss": 0.0836,
"step": 3620
},
{
"epoch": 0.85,
"grad_norm": 1.3600658178329468,
"learning_rate": 4.1490610328638503e-05,
"loss": 0.0988,
"step": 3630
},
{
"epoch": 0.85,
"grad_norm": 1.5027318000793457,
"learning_rate": 4.1467136150234744e-05,
"loss": 0.0974,
"step": 3640
},
{
"epoch": 0.86,
"grad_norm": 3.9970905780792236,
"learning_rate": 4.144366197183099e-05,
"loss": 0.0921,
"step": 3650
},
{
"epoch": 0.86,
"grad_norm": 3.5798285007476807,
"learning_rate": 4.142018779342723e-05,
"loss": 0.0596,
"step": 3660
},
{
"epoch": 0.86,
"grad_norm": 2.3576560020446777,
"learning_rate": 4.139671361502347e-05,
"loss": 0.0888,
"step": 3670
},
{
"epoch": 0.86,
"grad_norm": 1.278519630432129,
"learning_rate": 4.137323943661972e-05,
"loss": 0.0754,
"step": 3680
},
{
"epoch": 0.87,
"grad_norm": 2.012185573577881,
"learning_rate": 4.134976525821597e-05,
"loss": 0.0949,
"step": 3690
},
{
"epoch": 0.87,
"grad_norm": 2.5081164836883545,
"learning_rate": 4.132629107981221e-05,
"loss": 0.1032,
"step": 3700
},
{
"epoch": 0.87,
"grad_norm": 1.6326484680175781,
"learning_rate": 4.1302816901408455e-05,
"loss": 0.0848,
"step": 3710
},
{
"epoch": 0.87,
"grad_norm": 2.9340505599975586,
"learning_rate": 4.12793427230047e-05,
"loss": 0.0902,
"step": 3720
},
{
"epoch": 0.88,
"grad_norm": 1.5919960737228394,
"learning_rate": 4.125586854460094e-05,
"loss": 0.1134,
"step": 3730
},
{
"epoch": 0.88,
"grad_norm": 0.7612159848213196,
"learning_rate": 4.1232394366197184e-05,
"loss": 0.1145,
"step": 3740
},
{
"epoch": 0.88,
"grad_norm": 0.8657788634300232,
"learning_rate": 4.120892018779343e-05,
"loss": 0.0921,
"step": 3750
},
{
"epoch": 0.88,
"grad_norm": 2.0533640384674072,
"learning_rate": 4.118544600938967e-05,
"loss": 0.1018,
"step": 3760
},
{
"epoch": 0.88,
"grad_norm": 2.4787542819976807,
"learning_rate": 4.116197183098592e-05,
"loss": 0.0953,
"step": 3770
},
{
"epoch": 0.89,
"grad_norm": 2.6822516918182373,
"learning_rate": 4.1138497652582166e-05,
"loss": 0.0891,
"step": 3780
},
{
"epoch": 0.89,
"grad_norm": 3.1771156787872314,
"learning_rate": 4.111502347417841e-05,
"loss": 0.108,
"step": 3790
},
{
"epoch": 0.89,
"grad_norm": 2.4537932872772217,
"learning_rate": 4.109154929577465e-05,
"loss": 0.0889,
"step": 3800
},
{
"epoch": 0.89,
"grad_norm": 1.7221810817718506,
"learning_rate": 4.1068075117370895e-05,
"loss": 0.0918,
"step": 3810
},
{
"epoch": 0.9,
"grad_norm": 1.9804364442825317,
"learning_rate": 4.1044600938967135e-05,
"loss": 0.0967,
"step": 3820
},
{
"epoch": 0.9,
"grad_norm": 1.1556975841522217,
"learning_rate": 4.102112676056338e-05,
"loss": 0.1045,
"step": 3830
},
{
"epoch": 0.9,
"grad_norm": 2.696667194366455,
"learning_rate": 4.099765258215963e-05,
"loss": 0.1076,
"step": 3840
},
{
"epoch": 0.9,
"grad_norm": 4.1449432373046875,
"learning_rate": 4.097417840375587e-05,
"loss": 0.1015,
"step": 3850
},
{
"epoch": 0.91,
"grad_norm": 0.9939990043640137,
"learning_rate": 4.095070422535212e-05,
"loss": 0.1221,
"step": 3860
},
{
"epoch": 0.91,
"grad_norm": 1.6629542112350464,
"learning_rate": 4.092723004694836e-05,
"loss": 0.13,
"step": 3870
},
{
"epoch": 0.91,
"grad_norm": 1.9935959577560425,
"learning_rate": 4.09037558685446e-05,
"loss": 0.091,
"step": 3880
},
{
"epoch": 0.91,
"grad_norm": 1.3307180404663086,
"learning_rate": 4.0880281690140846e-05,
"loss": 0.0947,
"step": 3890
},
{
"epoch": 0.92,
"grad_norm": 2.5752205848693848,
"learning_rate": 4.085680751173709e-05,
"loss": 0.0781,
"step": 3900
},
{
"epoch": 0.92,
"grad_norm": 3.055501699447632,
"learning_rate": 4.0833333333333334e-05,
"loss": 0.117,
"step": 3910
},
{
"epoch": 0.92,
"grad_norm": 1.8486618995666504,
"learning_rate": 4.080985915492958e-05,
"loss": 0.092,
"step": 3920
},
{
"epoch": 0.92,
"grad_norm": 2.8639369010925293,
"learning_rate": 4.078638497652582e-05,
"loss": 0.0883,
"step": 3930
},
{
"epoch": 0.92,
"grad_norm": 2.1113569736480713,
"learning_rate": 4.076291079812207e-05,
"loss": 0.1089,
"step": 3940
},
{
"epoch": 0.93,
"grad_norm": 1.2119710445404053,
"learning_rate": 4.073943661971831e-05,
"loss": 0.1111,
"step": 3950
},
{
"epoch": 0.93,
"grad_norm": 3.3505148887634277,
"learning_rate": 4.071596244131455e-05,
"loss": 0.1011,
"step": 3960
},
{
"epoch": 0.93,
"grad_norm": 1.162949562072754,
"learning_rate": 4.06924882629108e-05,
"loss": 0.1035,
"step": 3970
},
{
"epoch": 0.93,
"grad_norm": 2.1703877449035645,
"learning_rate": 4.0669014084507045e-05,
"loss": 0.0842,
"step": 3980
},
{
"epoch": 0.94,
"grad_norm": 1.3537096977233887,
"learning_rate": 4.0645539906103286e-05,
"loss": 0.0932,
"step": 3990
},
{
"epoch": 0.94,
"grad_norm": 1.504040002822876,
"learning_rate": 4.062206572769953e-05,
"loss": 0.1043,
"step": 4000
},
{
"epoch": 0.94,
"eval_loss": 0.08268122375011444,
"eval_macro/f1": 0.8728380733950062,
"eval_macro/precision": 0.88235722533482,
"eval_macro/recall": 0.8667865568824481,
"eval_micro/f1": 0.876,
"eval_micro/precision": 0.8823897186523099,
"eval_micro/recall": 0.8697021567956179,
"eval_runtime": 28.2041,
"eval_samples/accuracy": 0.8590893529613146,
"eval_samples_per_second": 517.832,
"eval_steps_per_second": 16.203,
"step": 4000
},
{
"epoch": 0.94,
"grad_norm": 2.4709396362304688,
"learning_rate": 4.059859154929578e-05,
"loss": 0.081,
"step": 4010
},
{
"epoch": 0.94,
"grad_norm": 1.5959906578063965,
"learning_rate": 4.057511737089202e-05,
"loss": 0.1023,
"step": 4020
},
{
"epoch": 0.95,
"grad_norm": 2.6471996307373047,
"learning_rate": 4.055164319248826e-05,
"loss": 0.0955,
"step": 4030
},
{
"epoch": 0.95,
"grad_norm": 1.5212956666946411,
"learning_rate": 4.052816901408451e-05,
"loss": 0.094,
"step": 4040
},
{
"epoch": 0.95,
"grad_norm": 1.4765437841415405,
"learning_rate": 4.050469483568075e-05,
"loss": 0.1121,
"step": 4050
},
{
"epoch": 0.95,
"grad_norm": 2.833054542541504,
"learning_rate": 4.0481220657277e-05,
"loss": 0.0835,
"step": 4060
},
{
"epoch": 0.96,
"grad_norm": 5.180490970611572,
"learning_rate": 4.0457746478873244e-05,
"loss": 0.1079,
"step": 4070
},
{
"epoch": 0.96,
"grad_norm": 0.6298426389694214,
"learning_rate": 4.0434272300469485e-05,
"loss": 0.0757,
"step": 4080
},
{
"epoch": 0.96,
"grad_norm": 16.718875885009766,
"learning_rate": 4.041079812206573e-05,
"loss": 0.0913,
"step": 4090
},
{
"epoch": 0.96,
"grad_norm": 4.327826023101807,
"learning_rate": 4.038732394366197e-05,
"loss": 0.0929,
"step": 4100
},
{
"epoch": 0.96,
"grad_norm": 9.035626411437988,
"learning_rate": 4.0363849765258213e-05,
"loss": 0.1165,
"step": 4110
},
{
"epoch": 0.97,
"grad_norm": 2.8778882026672363,
"learning_rate": 4.034272300469484e-05,
"loss": 0.0693,
"step": 4120
},
{
"epoch": 0.97,
"grad_norm": 2.3383145332336426,
"learning_rate": 4.0319248826291085e-05,
"loss": 0.0896,
"step": 4130
},
{
"epoch": 0.97,
"grad_norm": 32.227989196777344,
"learning_rate": 4.0295774647887326e-05,
"loss": 0.0792,
"step": 4140
},
{
"epoch": 0.97,
"grad_norm": 4.050955772399902,
"learning_rate": 4.027230046948357e-05,
"loss": 0.1008,
"step": 4150
},
{
"epoch": 0.98,
"grad_norm": 2.629568576812744,
"learning_rate": 4.0248826291079814e-05,
"loss": 0.0825,
"step": 4160
},
{
"epoch": 0.98,
"grad_norm": 3.6457648277282715,
"learning_rate": 4.0225352112676054e-05,
"loss": 0.08,
"step": 4170
},
{
"epoch": 0.98,
"grad_norm": 9.004801750183105,
"learning_rate": 4.02018779342723e-05,
"loss": 0.0986,
"step": 4180
},
{
"epoch": 0.98,
"grad_norm": 2.4670467376708984,
"learning_rate": 4.017840375586855e-05,
"loss": 0.1026,
"step": 4190
},
{
"epoch": 0.99,
"grad_norm": 2.2789056301116943,
"learning_rate": 4.015492957746479e-05,
"loss": 0.0832,
"step": 4200
},
{
"epoch": 0.99,
"grad_norm": 1.7449488639831543,
"learning_rate": 4.013145539906104e-05,
"loss": 0.0842,
"step": 4210
},
{
"epoch": 0.99,
"grad_norm": 2.188464879989624,
"learning_rate": 4.0107981220657284e-05,
"loss": 0.0756,
"step": 4220
},
{
"epoch": 0.99,
"grad_norm": 1.3610471487045288,
"learning_rate": 4.0084507042253525e-05,
"loss": 0.1,
"step": 4230
},
{
"epoch": 1.0,
"grad_norm": 7.353645324707031,
"learning_rate": 4.0061032863849766e-05,
"loss": 0.0816,
"step": 4240
},
{
"epoch": 1.0,
"grad_norm": 4.01577091217041,
"learning_rate": 4.003755868544601e-05,
"loss": 0.0783,
"step": 4250
},
{
"epoch": 1.0,
"grad_norm": 2.917956590652466,
"learning_rate": 4.0014084507042254e-05,
"loss": 0.0916,
"step": 4260
},
{
"epoch": 1.0,
"grad_norm": 3.5377771854400635,
"learning_rate": 3.99906103286385e-05,
"loss": 0.0902,
"step": 4270
},
{
"epoch": 1.0,
"grad_norm": 2.7792444229125977,
"learning_rate": 3.996713615023475e-05,
"loss": 0.0695,
"step": 4280
},
{
"epoch": 1.01,
"grad_norm": 6.497312545776367,
"learning_rate": 3.994366197183099e-05,
"loss": 0.0961,
"step": 4290
},
{
"epoch": 1.01,
"grad_norm": 2.327802896499634,
"learning_rate": 3.9920187793427236e-05,
"loss": 0.0965,
"step": 4300
},
{
"epoch": 1.01,
"grad_norm": 2.7272486686706543,
"learning_rate": 3.989671361502348e-05,
"loss": 0.0985,
"step": 4310
},
{
"epoch": 1.01,
"grad_norm": 1.634901762008667,
"learning_rate": 3.987323943661972e-05,
"loss": 0.0871,
"step": 4320
},
{
"epoch": 1.02,
"grad_norm": 5.812564849853516,
"learning_rate": 3.9849765258215965e-05,
"loss": 0.0622,
"step": 4330
},
{
"epoch": 1.02,
"grad_norm": 2.1237666606903076,
"learning_rate": 3.9826291079812205e-05,
"loss": 0.0695,
"step": 4340
},
{
"epoch": 1.02,
"grad_norm": 1.3049256801605225,
"learning_rate": 3.980281690140845e-05,
"loss": 0.1182,
"step": 4350
},
{
"epoch": 1.02,
"grad_norm": 2.1295623779296875,
"learning_rate": 3.97793427230047e-05,
"loss": 0.1011,
"step": 4360
},
{
"epoch": 1.03,
"grad_norm": 1.530081033706665,
"learning_rate": 3.975586854460094e-05,
"loss": 0.0848,
"step": 4370
},
{
"epoch": 1.03,
"grad_norm": 2.473222017288208,
"learning_rate": 3.973239436619718e-05,
"loss": 0.0813,
"step": 4380
},
{
"epoch": 1.03,
"grad_norm": 1.927211880683899,
"learning_rate": 3.970892018779343e-05,
"loss": 0.077,
"step": 4390
},
{
"epoch": 1.03,
"grad_norm": 2.43953800201416,
"learning_rate": 3.968544600938967e-05,
"loss": 0.0779,
"step": 4400
},
{
"epoch": 1.04,
"grad_norm": 2.9721133708953857,
"learning_rate": 3.9661971830985916e-05,
"loss": 0.1128,
"step": 4410
},
{
"epoch": 1.04,
"grad_norm": 2.547895669937134,
"learning_rate": 3.9638497652582164e-05,
"loss": 0.0869,
"step": 4420
},
{
"epoch": 1.04,
"grad_norm": 9.629310607910156,
"learning_rate": 3.9615023474178404e-05,
"loss": 0.0872,
"step": 4430
},
{
"epoch": 1.04,
"grad_norm": 0.8918728232383728,
"learning_rate": 3.959154929577465e-05,
"loss": 0.0784,
"step": 4440
},
{
"epoch": 1.04,
"grad_norm": 0.12496773898601532,
"learning_rate": 3.956807511737089e-05,
"loss": 0.0596,
"step": 4450
},
{
"epoch": 1.05,
"grad_norm": 2.4438138008117676,
"learning_rate": 3.954460093896713e-05,
"loss": 0.0811,
"step": 4460
},
{
"epoch": 1.05,
"grad_norm": 2.845428228378296,
"learning_rate": 3.952112676056338e-05,
"loss": 0.1081,
"step": 4470
},
{
"epoch": 1.05,
"grad_norm": 5.136063098907471,
"learning_rate": 3.949765258215963e-05,
"loss": 0.0791,
"step": 4480
},
{
"epoch": 1.05,
"grad_norm": 2.6351871490478516,
"learning_rate": 3.947417840375587e-05,
"loss": 0.0745,
"step": 4490
},
{
"epoch": 1.06,
"grad_norm": 2.7584948539733887,
"learning_rate": 3.9450704225352115e-05,
"loss": 0.1022,
"step": 4500
},
{
"epoch": 1.06,
"eval_loss": 0.0945679247379303,
"eval_macro/f1": 0.8653927311909163,
"eval_macro/precision": 0.8720764510455892,
"eval_macro/recall": 0.8665681005259545,
"eval_micro/f1": 0.8702337234444177,
"eval_micro/precision": 0.8724194880264244,
"eval_micro/recall": 0.8680588839438549,
"eval_runtime": 27.8503,
"eval_samples/accuracy": 0.8558712769599452,
"eval_samples_per_second": 524.411,
"eval_steps_per_second": 16.409,
"step": 4500
},
{
"epoch": 1.06,
"grad_norm": 2.672335624694824,
"learning_rate": 3.942723004694836e-05,
"loss": 0.069,
"step": 4510
},
{
"epoch": 1.06,
"grad_norm": 1.058993935585022,
"learning_rate": 3.94037558685446e-05,
"loss": 0.0767,
"step": 4520
},
{
"epoch": 1.06,
"grad_norm": 1.6829683780670166,
"learning_rate": 3.9380281690140844e-05,
"loss": 0.0691,
"step": 4530
},
{
"epoch": 1.07,
"grad_norm": 2.168950080871582,
"learning_rate": 3.935680751173709e-05,
"loss": 0.0567,
"step": 4540
},
{
"epoch": 1.07,
"grad_norm": 0.44047364592552185,
"learning_rate": 3.933333333333333e-05,
"loss": 0.064,
"step": 4550
},
{
"epoch": 1.07,
"grad_norm": 0.05623297393321991,
"learning_rate": 3.930985915492958e-05,
"loss": 0.0668,
"step": 4560
},
{
"epoch": 1.07,
"grad_norm": 3.3752222061157227,
"learning_rate": 3.9286384976525826e-05,
"loss": 0.0586,
"step": 4570
},
{
"epoch": 1.08,
"grad_norm": 3.060328722000122,
"learning_rate": 3.926291079812207e-05,
"loss": 0.0718,
"step": 4580
},
{
"epoch": 1.08,
"grad_norm": 3.963909149169922,
"learning_rate": 3.9239436619718314e-05,
"loss": 0.0704,
"step": 4590
},
{
"epoch": 1.08,
"grad_norm": 1.7161349058151245,
"learning_rate": 3.9215962441314555e-05,
"loss": 0.1007,
"step": 4600
},
{
"epoch": 1.08,
"grad_norm": 1.8788623809814453,
"learning_rate": 3.9192488262910795e-05,
"loss": 0.1044,
"step": 4610
},
{
"epoch": 1.08,
"grad_norm": 1.5107207298278809,
"learning_rate": 3.916901408450704e-05,
"loss": 0.0918,
"step": 4620
},
{
"epoch": 1.09,
"grad_norm": 1.8452281951904297,
"learning_rate": 3.914553990610329e-05,
"loss": 0.0709,
"step": 4630
},
{
"epoch": 1.09,
"grad_norm": 1.724955677986145,
"learning_rate": 3.912206572769953e-05,
"loss": 0.0869,
"step": 4640
},
{
"epoch": 1.09,
"grad_norm": 5.611421585083008,
"learning_rate": 3.909859154929578e-05,
"loss": 0.09,
"step": 4650
},
{
"epoch": 1.09,
"grad_norm": 0.4529111385345459,
"learning_rate": 3.9075117370892025e-05,
"loss": 0.0806,
"step": 4660
},
{
"epoch": 1.1,
"grad_norm": 2.281949043273926,
"learning_rate": 3.9051643192488266e-05,
"loss": 0.084,
"step": 4670
},
{
"epoch": 1.1,
"grad_norm": 2.5894758701324463,
"learning_rate": 3.9028169014084507e-05,
"loss": 0.0837,
"step": 4680
},
{
"epoch": 1.1,
"grad_norm": 3.316633701324463,
"learning_rate": 3.9004694835680754e-05,
"loss": 0.0899,
"step": 4690
},
{
"epoch": 1.1,
"grad_norm": 1.6215742826461792,
"learning_rate": 3.8981220657276994e-05,
"loss": 0.0773,
"step": 4700
},
{
"epoch": 1.11,
"grad_norm": 3.763181447982788,
"learning_rate": 3.895774647887324e-05,
"loss": 0.0898,
"step": 4710
},
{
"epoch": 1.11,
"grad_norm": 2.0760018825531006,
"learning_rate": 3.893427230046949e-05,
"loss": 0.0872,
"step": 4720
},
{
"epoch": 1.11,
"grad_norm": 4.817967414855957,
"learning_rate": 3.891079812206573e-05,
"loss": 0.0703,
"step": 4730
},
{
"epoch": 1.11,
"grad_norm": 3.2826614379882812,
"learning_rate": 3.888732394366198e-05,
"loss": 0.0863,
"step": 4740
},
{
"epoch": 1.12,
"grad_norm": 1.4272204637527466,
"learning_rate": 3.886384976525822e-05,
"loss": 0.0737,
"step": 4750
},
{
"epoch": 1.12,
"grad_norm": 2.626129150390625,
"learning_rate": 3.884037558685446e-05,
"loss": 0.0682,
"step": 4760
},
{
"epoch": 1.12,
"grad_norm": 0.8139760494232178,
"learning_rate": 3.8816901408450706e-05,
"loss": 0.0705,
"step": 4770
},
{
"epoch": 1.12,
"grad_norm": 7.998354911804199,
"learning_rate": 3.879342723004695e-05,
"loss": 0.0622,
"step": 4780
},
{
"epoch": 1.12,
"grad_norm": 2.174318790435791,
"learning_rate": 3.8769953051643193e-05,
"loss": 0.059,
"step": 4790
},
{
"epoch": 1.13,
"grad_norm": 2.3592910766601562,
"learning_rate": 3.874647887323944e-05,
"loss": 0.0842,
"step": 4800
},
{
"epoch": 1.13,
"grad_norm": 1.0818564891815186,
"learning_rate": 3.872300469483569e-05,
"loss": 0.092,
"step": 4810
},
{
"epoch": 1.13,
"grad_norm": 3.985733985900879,
"learning_rate": 3.869953051643193e-05,
"loss": 0.0579,
"step": 4820
},
{
"epoch": 1.13,
"grad_norm": 6.383425712585449,
"learning_rate": 3.867605633802817e-05,
"loss": 0.0818,
"step": 4830
},
{
"epoch": 1.14,
"grad_norm": 1.3395494222640991,
"learning_rate": 3.865258215962442e-05,
"loss": 0.0903,
"step": 4840
},
{
"epoch": 1.14,
"grad_norm": 2.034086227416992,
"learning_rate": 3.862910798122066e-05,
"loss": 0.0989,
"step": 4850
},
{
"epoch": 1.14,
"grad_norm": 1.0606484413146973,
"learning_rate": 3.8605633802816905e-05,
"loss": 0.0978,
"step": 4860
},
{
"epoch": 1.14,
"grad_norm": 0.6535605788230896,
"learning_rate": 3.858215962441315e-05,
"loss": 0.1039,
"step": 4870
},
{
"epoch": 1.15,
"grad_norm": 1.6977512836456299,
"learning_rate": 3.855868544600939e-05,
"loss": 0.0869,
"step": 4880
},
{
"epoch": 1.15,
"grad_norm": 1.8797959089279175,
"learning_rate": 3.853521126760564e-05,
"loss": 0.0797,
"step": 4890
},
{
"epoch": 1.15,
"grad_norm": 1.8291544914245605,
"learning_rate": 3.851173708920188e-05,
"loss": 0.0658,
"step": 4900
},
{
"epoch": 1.15,
"grad_norm": 1.1057475805282593,
"learning_rate": 3.848826291079812e-05,
"loss": 0.0814,
"step": 4910
},
{
"epoch": 1.15,
"grad_norm": 3.3656809329986572,
"learning_rate": 3.846478873239437e-05,
"loss": 0.1156,
"step": 4920
},
{
"epoch": 1.16,
"grad_norm": 1.4543434381484985,
"learning_rate": 3.844131455399061e-05,
"loss": 0.1014,
"step": 4930
},
{
"epoch": 1.16,
"grad_norm": 1.2256841659545898,
"learning_rate": 3.8417840375586856e-05,
"loss": 0.0735,
"step": 4940
},
{
"epoch": 1.16,
"grad_norm": 2.286642551422119,
"learning_rate": 3.8394366197183104e-05,
"loss": 0.0762,
"step": 4950
},
{
"epoch": 1.16,
"grad_norm": 1.6697473526000977,
"learning_rate": 3.8370892018779344e-05,
"loss": 0.0759,
"step": 4960
},
{
"epoch": 1.17,
"grad_norm": 3.3569507598876953,
"learning_rate": 3.8347417840375585e-05,
"loss": 0.064,
"step": 4970
},
{
"epoch": 1.17,
"grad_norm": 1.3700305223464966,
"learning_rate": 3.832394366197183e-05,
"loss": 0.0722,
"step": 4980
},
{
"epoch": 1.17,
"grad_norm": 6.224872589111328,
"learning_rate": 3.830046948356807e-05,
"loss": 0.09,
"step": 4990
},
{
"epoch": 1.17,
"grad_norm": 3.2396225929260254,
"learning_rate": 3.827699530516432e-05,
"loss": 0.0857,
"step": 5000
},
{
"epoch": 1.17,
"eval_loss": 0.07859091460704803,
"eval_macro/f1": 0.8827431167400268,
"eval_macro/precision": 0.886261334665688,
"eval_macro/recall": 0.8813339379181391,
"eval_micro/f1": 0.886541363698724,
"eval_micro/precision": 0.888186378943028,
"eval_micro/recall": 0.8849024306744265,
"eval_runtime": 27.7668,
"eval_samples/accuracy": 0.8712769599452242,
"eval_samples_per_second": 525.988,
"eval_steps_per_second": 16.459,
"step": 5000
},
{
"epoch": 1.18,
"grad_norm": 0.7627536058425903,
"learning_rate": 3.825352112676057e-05,
"loss": 0.0839,
"step": 5010
},
{
"epoch": 1.18,
"grad_norm": 1.6807057857513428,
"learning_rate": 3.823004694835681e-05,
"loss": 0.0613,
"step": 5020
},
{
"epoch": 1.18,
"grad_norm": 2.8506741523742676,
"learning_rate": 3.8206572769953055e-05,
"loss": 0.0706,
"step": 5030
},
{
"epoch": 1.18,
"grad_norm": 1.1173980236053467,
"learning_rate": 3.8183098591549296e-05,
"loss": 0.0982,
"step": 5040
},
{
"epoch": 1.19,
"grad_norm": 1.0861316919326782,
"learning_rate": 3.8159624413145536e-05,
"loss": 0.0768,
"step": 5050
},
{
"epoch": 1.19,
"grad_norm": 2.1186795234680176,
"learning_rate": 3.8136150234741784e-05,
"loss": 0.0914,
"step": 5060
},
{
"epoch": 1.19,
"grad_norm": 5.411003112792969,
"learning_rate": 3.811267605633803e-05,
"loss": 0.1255,
"step": 5070
},
{
"epoch": 1.19,
"grad_norm": 1.6623032093048096,
"learning_rate": 3.808920187793427e-05,
"loss": 0.0482,
"step": 5080
},
{
"epoch": 1.19,
"grad_norm": 2.952754497528076,
"learning_rate": 3.806572769953052e-05,
"loss": 0.0589,
"step": 5090
},
{
"epoch": 1.2,
"grad_norm": 19.873538970947266,
"learning_rate": 3.8042253521126766e-05,
"loss": 0.0817,
"step": 5100
},
{
"epoch": 1.2,
"grad_norm": 2.0706191062927246,
"learning_rate": 3.801877934272301e-05,
"loss": 0.0642,
"step": 5110
},
{
"epoch": 1.2,
"grad_norm": 0.5746757984161377,
"learning_rate": 3.799530516431925e-05,
"loss": 0.0788,
"step": 5120
},
{
"epoch": 1.2,
"grad_norm": 2.0287206172943115,
"learning_rate": 3.7971830985915495e-05,
"loss": 0.0894,
"step": 5130
},
{
"epoch": 1.21,
"grad_norm": 1.4267714023590088,
"learning_rate": 3.7948356807511735e-05,
"loss": 0.0921,
"step": 5140
},
{
"epoch": 1.21,
"grad_norm": 0.6256331205368042,
"learning_rate": 3.792488262910798e-05,
"loss": 0.0611,
"step": 5150
},
{
"epoch": 1.21,
"grad_norm": 1.0450356006622314,
"learning_rate": 3.790140845070423e-05,
"loss": 0.077,
"step": 5160
},
{
"epoch": 1.21,
"grad_norm": 1.0859485864639282,
"learning_rate": 3.787793427230047e-05,
"loss": 0.0848,
"step": 5170
},
{
"epoch": 1.22,
"grad_norm": 6.269891262054443,
"learning_rate": 3.785446009389672e-05,
"loss": 0.0766,
"step": 5180
},
{
"epoch": 1.22,
"grad_norm": 3.0235748291015625,
"learning_rate": 3.783098591549296e-05,
"loss": 0.0828,
"step": 5190
},
{
"epoch": 1.22,
"grad_norm": 1.3449300527572632,
"learning_rate": 3.78075117370892e-05,
"loss": 0.0751,
"step": 5200
},
{
"epoch": 1.22,
"grad_norm": 1.6258209943771362,
"learning_rate": 3.7784037558685447e-05,
"loss": 0.0878,
"step": 5210
},
{
"epoch": 1.23,
"grad_norm": 1.2452788352966309,
"learning_rate": 3.7760563380281694e-05,
"loss": 0.0524,
"step": 5220
},
{
"epoch": 1.23,
"grad_norm": 1.5970511436462402,
"learning_rate": 3.7737089201877934e-05,
"loss": 0.077,
"step": 5230
},
{
"epoch": 1.23,
"grad_norm": 1.322287917137146,
"learning_rate": 3.771361502347418e-05,
"loss": 0.0859,
"step": 5240
},
{
"epoch": 1.23,
"grad_norm": 1.2838901281356812,
"learning_rate": 3.769014084507043e-05,
"loss": 0.0747,
"step": 5250
},
{
"epoch": 1.23,
"grad_norm": 2.0644261837005615,
"learning_rate": 3.766666666666667e-05,
"loss": 0.0632,
"step": 5260
},
{
"epoch": 1.24,
"grad_norm": 1.1590847969055176,
"learning_rate": 3.764319248826291e-05,
"loss": 0.1018,
"step": 5270
},
{
"epoch": 1.24,
"grad_norm": 0.9163933992385864,
"learning_rate": 3.761971830985916e-05,
"loss": 0.078,
"step": 5280
},
{
"epoch": 1.24,
"grad_norm": 1.6301237344741821,
"learning_rate": 3.75962441314554e-05,
"loss": 0.0484,
"step": 5290
},
{
"epoch": 1.24,
"grad_norm": 2.292043447494507,
"learning_rate": 3.7572769953051646e-05,
"loss": 0.0777,
"step": 5300
},
{
"epoch": 1.25,
"grad_norm": 1.5556319952011108,
"learning_rate": 3.754929577464789e-05,
"loss": 0.0919,
"step": 5310
},
{
"epoch": 1.25,
"grad_norm": 1.770605444908142,
"learning_rate": 3.7525821596244133e-05,
"loss": 0.0928,
"step": 5320
},
{
"epoch": 1.25,
"grad_norm": 0.8810251355171204,
"learning_rate": 3.750234741784038e-05,
"loss": 0.0662,
"step": 5330
},
{
"epoch": 1.25,
"grad_norm": 0.905276358127594,
"learning_rate": 3.747887323943662e-05,
"loss": 0.073,
"step": 5340
},
{
"epoch": 1.26,
"grad_norm": 3.4868996143341064,
"learning_rate": 3.745539906103286e-05,
"loss": 0.0914,
"step": 5350
},
{
"epoch": 1.26,
"grad_norm": 1.6897212266921997,
"learning_rate": 3.743192488262911e-05,
"loss": 0.0569,
"step": 5360
},
{
"epoch": 1.26,
"grad_norm": 2.168696165084839,
"learning_rate": 3.740845070422536e-05,
"loss": 0.092,
"step": 5370
},
{
"epoch": 1.26,
"grad_norm": 2.419448137283325,
"learning_rate": 3.73849765258216e-05,
"loss": 0.0787,
"step": 5380
},
{
"epoch": 1.27,
"grad_norm": 1.9727363586425781,
"learning_rate": 3.7361502347417845e-05,
"loss": 0.0748,
"step": 5390
},
{
"epoch": 1.27,
"grad_norm": 1.079543948173523,
"learning_rate": 3.733802816901409e-05,
"loss": 0.0745,
"step": 5400
},
{
"epoch": 1.27,
"grad_norm": 2.7353591918945312,
"learning_rate": 3.731455399061033e-05,
"loss": 0.0616,
"step": 5410
},
{
"epoch": 1.27,
"grad_norm": 2.7964367866516113,
"learning_rate": 3.729107981220657e-05,
"loss": 0.0858,
"step": 5420
},
{
"epoch": 1.27,
"grad_norm": 1.155333399772644,
"learning_rate": 3.726760563380282e-05,
"loss": 0.0755,
"step": 5430
},
{
"epoch": 1.28,
"grad_norm": 1.547913670539856,
"learning_rate": 3.724413145539906e-05,
"loss": 0.0718,
"step": 5440
},
{
"epoch": 1.28,
"grad_norm": 4.3765764236450195,
"learning_rate": 3.722065727699531e-05,
"loss": 0.0945,
"step": 5450
},
{
"epoch": 1.28,
"grad_norm": 2.7049620151519775,
"learning_rate": 3.7197183098591556e-05,
"loss": 0.0717,
"step": 5460
},
{
"epoch": 1.28,
"grad_norm": 2.8908655643463135,
"learning_rate": 3.7173708920187796e-05,
"loss": 0.0909,
"step": 5470
},
{
"epoch": 1.29,
"grad_norm": 1.6085829734802246,
"learning_rate": 3.7150234741784044e-05,
"loss": 0.0557,
"step": 5480
},
{
"epoch": 1.29,
"grad_norm": 1.4436389207839966,
"learning_rate": 3.7126760563380284e-05,
"loss": 0.072,
"step": 5490
},
{
"epoch": 1.29,
"grad_norm": 2.450096607208252,
"learning_rate": 3.7103286384976525e-05,
"loss": 0.0787,
"step": 5500
},
{
"epoch": 1.29,
"eval_loss": 0.07993102073669434,
"eval_macro/f1": 0.8823025838255318,
"eval_macro/precision": 0.8854401633374165,
"eval_macro/recall": 0.8817389137045657,
"eval_micro/f1": 0.8849351626920313,
"eval_micro/precision": 0.884420735877445,
"eval_micro/recall": 0.885450188291681,
"eval_runtime": 28.6405,
"eval_samples/accuracy": 0.8701129750085587,
"eval_samples_per_second": 509.943,
"eval_steps_per_second": 15.956,
"step": 5500
},
{
"epoch": 1.29,
"grad_norm": 3.5232038497924805,
"learning_rate": 3.707981220657277e-05,
"loss": 0.0636,
"step": 5510
},
{
"epoch": 1.3,
"grad_norm": 1.5219241380691528,
"learning_rate": 3.705633802816901e-05,
"loss": 0.0742,
"step": 5520
},
{
"epoch": 1.3,
"grad_norm": 1.5301398038864136,
"learning_rate": 3.703286384976526e-05,
"loss": 0.0802,
"step": 5530
},
{
"epoch": 1.3,
"grad_norm": 1.245137333869934,
"learning_rate": 3.700938967136151e-05,
"loss": 0.0793,
"step": 5540
},
{
"epoch": 1.3,
"grad_norm": 0.6079393625259399,
"learning_rate": 3.698591549295775e-05,
"loss": 0.0769,
"step": 5550
},
{
"epoch": 1.31,
"grad_norm": 2.262885570526123,
"learning_rate": 3.696244131455399e-05,
"loss": 0.0707,
"step": 5560
},
{
"epoch": 1.31,
"grad_norm": 2.196953296661377,
"learning_rate": 3.6938967136150236e-05,
"loss": 0.0815,
"step": 5570
},
{
"epoch": 1.31,
"grad_norm": 1.519397258758545,
"learning_rate": 3.6915492957746476e-05,
"loss": 0.1008,
"step": 5580
},
{
"epoch": 1.31,
"grad_norm": 1.4691829681396484,
"learning_rate": 3.6892018779342724e-05,
"loss": 0.0678,
"step": 5590
},
{
"epoch": 1.31,
"grad_norm": 3.34647274017334,
"learning_rate": 3.686854460093897e-05,
"loss": 0.0717,
"step": 5600
},
{
"epoch": 1.32,
"grad_norm": 1.3005377054214478,
"learning_rate": 3.684507042253521e-05,
"loss": 0.0772,
"step": 5610
},
{
"epoch": 1.32,
"grad_norm": 3.112577199935913,
"learning_rate": 3.682159624413146e-05,
"loss": 0.0722,
"step": 5620
},
{
"epoch": 1.32,
"grad_norm": 2.1536271572113037,
"learning_rate": 3.67981220657277e-05,
"loss": 0.0619,
"step": 5630
},
{
"epoch": 1.32,
"grad_norm": 0.5694202780723572,
"learning_rate": 3.677464788732394e-05,
"loss": 0.0658,
"step": 5640
},
{
"epoch": 1.33,
"grad_norm": 5.5259504318237305,
"learning_rate": 3.675117370892019e-05,
"loss": 0.0631,
"step": 5650
},
{
"epoch": 1.33,
"grad_norm": 2.356536388397217,
"learning_rate": 3.6727699530516435e-05,
"loss": 0.0575,
"step": 5660
},
{
"epoch": 1.33,
"grad_norm": 0.8688063025474548,
"learning_rate": 3.6704225352112675e-05,
"loss": 0.0678,
"step": 5670
},
{
"epoch": 1.33,
"grad_norm": 5.551433086395264,
"learning_rate": 3.668075117370892e-05,
"loss": 0.0609,
"step": 5680
},
{
"epoch": 1.34,
"grad_norm": 2.565351963043213,
"learning_rate": 3.665727699530517e-05,
"loss": 0.0656,
"step": 5690
},
{
"epoch": 1.34,
"grad_norm": 0.8359733819961548,
"learning_rate": 3.663380281690141e-05,
"loss": 0.0659,
"step": 5700
},
{
"epoch": 1.34,
"grad_norm": 0.4640568196773529,
"learning_rate": 3.661032863849765e-05,
"loss": 0.0466,
"step": 5710
},
{
"epoch": 1.34,
"grad_norm": 0.10229873657226562,
"learning_rate": 3.65868544600939e-05,
"loss": 0.0794,
"step": 5720
},
{
"epoch": 1.35,
"grad_norm": 4.01750373840332,
"learning_rate": 3.656338028169014e-05,
"loss": 0.1165,
"step": 5730
},
{
"epoch": 1.35,
"grad_norm": 2.3960859775543213,
"learning_rate": 3.6539906103286386e-05,
"loss": 0.0757,
"step": 5740
},
{
"epoch": 1.35,
"grad_norm": 0.8701328039169312,
"learning_rate": 3.6516431924882634e-05,
"loss": 0.046,
"step": 5750
},
{
"epoch": 1.35,
"grad_norm": 0.9990036487579346,
"learning_rate": 3.6492957746478874e-05,
"loss": 0.1002,
"step": 5760
},
{
"epoch": 1.35,
"grad_norm": 2.334117889404297,
"learning_rate": 3.646948356807512e-05,
"loss": 0.0724,
"step": 5770
},
{
"epoch": 1.36,
"grad_norm": 2.3866279125213623,
"learning_rate": 3.644600938967136e-05,
"loss": 0.0994,
"step": 5780
},
{
"epoch": 1.36,
"grad_norm": 0.8514377474784851,
"learning_rate": 3.64225352112676e-05,
"loss": 0.0744,
"step": 5790
},
{
"epoch": 1.36,
"grad_norm": 1.0646476745605469,
"learning_rate": 3.639906103286385e-05,
"loss": 0.0785,
"step": 5800
},
{
"epoch": 1.36,
"grad_norm": 1.6509699821472168,
"learning_rate": 3.63755868544601e-05,
"loss": 0.0968,
"step": 5810
},
{
"epoch": 1.37,
"grad_norm": 2.1369731426239014,
"learning_rate": 3.635211267605634e-05,
"loss": 0.0722,
"step": 5820
},
{
"epoch": 1.37,
"grad_norm": 2.6235861778259277,
"learning_rate": 3.6328638497652585e-05,
"loss": 0.0725,
"step": 5830
},
{
"epoch": 1.37,
"grad_norm": 1.6085470914840698,
"learning_rate": 3.630516431924883e-05,
"loss": 0.0916,
"step": 5840
},
{
"epoch": 1.37,
"grad_norm": 0.7220445275306702,
"learning_rate": 3.6281690140845073e-05,
"loss": 0.0779,
"step": 5850
},
{
"epoch": 1.38,
"grad_norm": 1.6964889764785767,
"learning_rate": 3.6258215962441314e-05,
"loss": 0.0682,
"step": 5860
},
{
"epoch": 1.38,
"grad_norm": 1.0012626647949219,
"learning_rate": 3.623474178403756e-05,
"loss": 0.0573,
"step": 5870
},
{
"epoch": 1.38,
"grad_norm": 1.5465306043624878,
"learning_rate": 3.62112676056338e-05,
"loss": 0.0912,
"step": 5880
},
{
"epoch": 1.38,
"grad_norm": 3.804412841796875,
"learning_rate": 3.618779342723005e-05,
"loss": 0.0683,
"step": 5890
},
{
"epoch": 1.38,
"grad_norm": 2.037912130355835,
"learning_rate": 3.6164319248826297e-05,
"loss": 0.0655,
"step": 5900
},
{
"epoch": 1.39,
"grad_norm": 1.2398627996444702,
"learning_rate": 3.614084507042254e-05,
"loss": 0.0772,
"step": 5910
},
{
"epoch": 1.39,
"grad_norm": 1.238403081893921,
"learning_rate": 3.6117370892018785e-05,
"loss": 0.0758,
"step": 5920
},
{
"epoch": 1.39,
"grad_norm": 2.2002389430999756,
"learning_rate": 3.6093896713615025e-05,
"loss": 0.0794,
"step": 5930
},
{
"epoch": 1.39,
"grad_norm": 0.43891093134880066,
"learning_rate": 3.6070422535211266e-05,
"loss": 0.0737,
"step": 5940
},
{
"epoch": 1.4,
"grad_norm": 2.9602551460266113,
"learning_rate": 3.604694835680751e-05,
"loss": 0.0638,
"step": 5950
},
{
"epoch": 1.4,
"grad_norm": 2.5585052967071533,
"learning_rate": 3.602347417840376e-05,
"loss": 0.0899,
"step": 5960
},
{
"epoch": 1.4,
"grad_norm": 1.7965030670166016,
"learning_rate": 3.6e-05,
"loss": 0.0717,
"step": 5970
},
{
"epoch": 1.4,
"grad_norm": 1.0735397338867188,
"learning_rate": 3.597652582159625e-05,
"loss": 0.0664,
"step": 5980
},
{
"epoch": 1.41,
"grad_norm": 0.42542290687561035,
"learning_rate": 3.5953051643192496e-05,
"loss": 0.0667,
"step": 5990
},
{
"epoch": 1.41,
"grad_norm": 1.6038751602172852,
"learning_rate": 3.5929577464788736e-05,
"loss": 0.0603,
"step": 6000
},
{
"epoch": 1.41,
"eval_loss": 0.07968219369649887,
"eval_macro/f1": 0.8841156142067595,
"eval_macro/precision": 0.8863706352948096,
"eval_macro/recall": 0.8858079234329682,
"eval_micro/f1": 0.8880147763031878,
"eval_micro/precision": 0.8872257535370105,
"eval_micro/recall": 0.8888052036973639,
"eval_runtime": 28.6019,
"eval_samples/accuracy": 0.8748373844573776,
"eval_samples_per_second": 510.63,
"eval_steps_per_second": 15.978,
"step": 6000
},
{
"epoch": 1.41,
"grad_norm": 1.2723647356033325,
"learning_rate": 3.590610328638498e-05,
"loss": 0.0638,
"step": 6010
},
{
"epoch": 1.41,
"grad_norm": 1.2588775157928467,
"learning_rate": 3.5882629107981224e-05,
"loss": 0.0768,
"step": 6020
},
{
"epoch": 1.42,
"grad_norm": 1.3960028886795044,
"learning_rate": 3.5859154929577465e-05,
"loss": 0.0589,
"step": 6030
},
{
"epoch": 1.42,
"grad_norm": 0.8584194183349609,
"learning_rate": 3.583568075117371e-05,
"loss": 0.0816,
"step": 6040
},
{
"epoch": 1.42,
"grad_norm": 3.710993528366089,
"learning_rate": 3.581220657276996e-05,
"loss": 0.0818,
"step": 6050
},
{
"epoch": 1.42,
"grad_norm": 0.5568336844444275,
"learning_rate": 3.57887323943662e-05,
"loss": 0.0659,
"step": 6060
},
{
"epoch": 1.42,
"grad_norm": 6.254613876342773,
"learning_rate": 3.576525821596244e-05,
"loss": 0.0765,
"step": 6070
},
{
"epoch": 1.43,
"grad_norm": 1.4226367473602295,
"learning_rate": 3.574178403755869e-05,
"loss": 0.0702,
"step": 6080
},
{
"epoch": 1.43,
"grad_norm": 2.3769171237945557,
"learning_rate": 3.571830985915493e-05,
"loss": 0.0631,
"step": 6090
},
{
"epoch": 1.43,
"grad_norm": 2.0111894607543945,
"learning_rate": 3.5694835680751176e-05,
"loss": 0.0525,
"step": 6100
},
{
"epoch": 1.43,
"grad_norm": 2.731039524078369,
"learning_rate": 3.5671361502347416e-05,
"loss": 0.0434,
"step": 6110
},
{
"epoch": 1.44,
"grad_norm": 1.3040751218795776,
"learning_rate": 3.5647887323943664e-05,
"loss": 0.0738,
"step": 6120
},
{
"epoch": 1.44,
"grad_norm": 3.0165939331054688,
"learning_rate": 3.562441314553991e-05,
"loss": 0.084,
"step": 6130
},
{
"epoch": 1.44,
"grad_norm": 4.147961139678955,
"learning_rate": 3.560093896713615e-05,
"loss": 0.1147,
"step": 6140
},
{
"epoch": 1.44,
"grad_norm": 1.984157681465149,
"learning_rate": 3.557746478873239e-05,
"loss": 0.1099,
"step": 6150
},
{
"epoch": 1.45,
"grad_norm": 3.322106122970581,
"learning_rate": 3.555399061032864e-05,
"loss": 0.0787,
"step": 6160
},
{
"epoch": 1.45,
"grad_norm": 0.5959970951080322,
"learning_rate": 3.553051643192488e-05,
"loss": 0.0503,
"step": 6170
},
{
"epoch": 1.45,
"grad_norm": 3.4334895610809326,
"learning_rate": 3.550704225352113e-05,
"loss": 0.0617,
"step": 6180
},
{
"epoch": 1.45,
"grad_norm": 0.8281345963478088,
"learning_rate": 3.5483568075117375e-05,
"loss": 0.0814,
"step": 6190
},
{
"epoch": 1.46,
"grad_norm": 1.6435192823410034,
"learning_rate": 3.5460093896713615e-05,
"loss": 0.0671,
"step": 6200
},
{
"epoch": 1.46,
"grad_norm": 5.341729164123535,
"learning_rate": 3.543661971830986e-05,
"loss": 0.0629,
"step": 6210
},
{
"epoch": 1.46,
"grad_norm": 4.257093906402588,
"learning_rate": 3.54131455399061e-05,
"loss": 0.0403,
"step": 6220
},
{
"epoch": 1.46,
"grad_norm": 1.0729011297225952,
"learning_rate": 3.5389671361502344e-05,
"loss": 0.0995,
"step": 6230
},
{
"epoch": 1.46,
"grad_norm": 4.933761119842529,
"learning_rate": 3.536619718309859e-05,
"loss": 0.0797,
"step": 6240
},
{
"epoch": 1.47,
"grad_norm": 1.1815464496612549,
"learning_rate": 3.534272300469484e-05,
"loss": 0.0641,
"step": 6250
},
{
"epoch": 1.47,
"grad_norm": 2.0844922065734863,
"learning_rate": 3.531924882629108e-05,
"loss": 0.0736,
"step": 6260
},
{
"epoch": 1.47,
"grad_norm": 1.1614429950714111,
"learning_rate": 3.5295774647887326e-05,
"loss": 0.0666,
"step": 6270
},
{
"epoch": 1.47,
"grad_norm": 1.0777498483657837,
"learning_rate": 3.5272300469483574e-05,
"loss": 0.0705,
"step": 6280
},
{
"epoch": 1.48,
"grad_norm": 4.16050910949707,
"learning_rate": 3.5248826291079814e-05,
"loss": 0.0661,
"step": 6290
},
{
"epoch": 1.48,
"grad_norm": 1.9127310514450073,
"learning_rate": 3.5225352112676055e-05,
"loss": 0.0742,
"step": 6300
},
{
"epoch": 1.48,
"grad_norm": 2.6476809978485107,
"learning_rate": 3.52018779342723e-05,
"loss": 0.0535,
"step": 6310
},
{
"epoch": 1.48,
"grad_norm": 3.1948328018188477,
"learning_rate": 3.517840375586854e-05,
"loss": 0.0828,
"step": 6320
},
{
"epoch": 1.49,
"grad_norm": 1.5699864625930786,
"learning_rate": 3.515492957746479e-05,
"loss": 0.0855,
"step": 6330
},
{
"epoch": 1.49,
"grad_norm": 2.835286855697632,
"learning_rate": 3.513145539906104e-05,
"loss": 0.0442,
"step": 6340
},
{
"epoch": 1.49,
"grad_norm": 1.8206992149353027,
"learning_rate": 3.510798122065728e-05,
"loss": 0.0483,
"step": 6350
},
{
"epoch": 1.49,
"grad_norm": 2.245073080062866,
"learning_rate": 3.5084507042253525e-05,
"loss": 0.0694,
"step": 6360
},
{
"epoch": 1.5,
"grad_norm": 2.3250207901000977,
"learning_rate": 3.5061032863849766e-05,
"loss": 0.0776,
"step": 6370
},
{
"epoch": 1.5,
"grad_norm": 0.4506942331790924,
"learning_rate": 3.5037558685446007e-05,
"loss": 0.0757,
"step": 6380
},
{
"epoch": 1.5,
"grad_norm": 1.560647964477539,
"learning_rate": 3.5014084507042254e-05,
"loss": 0.0581,
"step": 6390
},
{
"epoch": 1.5,
"grad_norm": 2.318333148956299,
"learning_rate": 3.499295774647888e-05,
"loss": 0.048,
"step": 6400
},
{
"epoch": 1.5,
"grad_norm": 1.2419353723526,
"learning_rate": 3.496948356807512e-05,
"loss": 0.0668,
"step": 6410
},
{
"epoch": 1.51,
"grad_norm": 2.8134100437164307,
"learning_rate": 3.4946009389671367e-05,
"loss": 0.0848,
"step": 6420
},
{
"epoch": 1.51,
"grad_norm": 0.7494829297065735,
"learning_rate": 3.492253521126761e-05,
"loss": 0.0731,
"step": 6430
},
{
"epoch": 1.51,
"grad_norm": 2.2532362937927246,
"learning_rate": 3.489906103286385e-05,
"loss": 0.0912,
"step": 6440
},
{
"epoch": 1.51,
"grad_norm": 1.5315017700195312,
"learning_rate": 3.4875586854460095e-05,
"loss": 0.0655,
"step": 6450
},
{
"epoch": 1.52,
"grad_norm": 1.3311911821365356,
"learning_rate": 3.485211267605634e-05,
"loss": 0.0685,
"step": 6460
},
{
"epoch": 1.52,
"grad_norm": 1.0309032201766968,
"learning_rate": 3.482863849765258e-05,
"loss": 0.0778,
"step": 6470
},
{
"epoch": 1.52,
"grad_norm": 0.9944983720779419,
"learning_rate": 3.480516431924883e-05,
"loss": 0.0757,
"step": 6480
},
{
"epoch": 1.52,
"grad_norm": 2.2548060417175293,
"learning_rate": 3.478169014084508e-05,
"loss": 0.0747,
"step": 6490
},
{
"epoch": 1.53,
"grad_norm": 3.4860355854034424,
"learning_rate": 3.475821596244132e-05,
"loss": 0.0654,
"step": 6500
},
{
"epoch": 1.53,
"eval_loss": 0.07351405918598175,
"eval_macro/f1": 0.8905284200569832,
"eval_macro/precision": 0.8936967336400916,
"eval_macro/recall": 0.8890971037530004,
"eval_micro/f1": 0.8933296771097553,
"eval_micro/precision": 0.8944333859564829,
"eval_micro/recall": 0.8922286888052037,
"eval_runtime": 28.7347,
"eval_samples/accuracy": 0.8788086271824718,
"eval_samples_per_second": 508.27,
"eval_steps_per_second": 15.904,
"step": 6500
},
{
"epoch": 1.53,
"grad_norm": 1.7109043598175049,
"learning_rate": 3.473474178403756e-05,
"loss": 0.0717,
"step": 6510
},
{
"epoch": 1.53,
"grad_norm": 0.9149212837219238,
"learning_rate": 3.4711267605633806e-05,
"loss": 0.0474,
"step": 6520
},
{
"epoch": 1.53,
"grad_norm": 1.6851046085357666,
"learning_rate": 3.468779342723005e-05,
"loss": 0.0694,
"step": 6530
},
{
"epoch": 1.54,
"grad_norm": 44.414886474609375,
"learning_rate": 3.4664319248826294e-05,
"loss": 0.0653,
"step": 6540
},
{
"epoch": 1.54,
"grad_norm": 3.4222354888916016,
"learning_rate": 3.464084507042254e-05,
"loss": 0.0882,
"step": 6550
},
{
"epoch": 1.54,
"grad_norm": 2.1840708255767822,
"learning_rate": 3.461737089201878e-05,
"loss": 0.0579,
"step": 6560
},
{
"epoch": 1.54,
"grad_norm": null,
"learning_rate": 3.45962441314554e-05,
"loss": 0.0563,
"step": 6570
},
{
"epoch": 1.54,
"grad_norm": 1.8609527349472046,
"learning_rate": 3.457276995305164e-05,
"loss": 0.0694,
"step": 6580
},
{
"epoch": 1.55,
"grad_norm": 1.5708072185516357,
"learning_rate": 3.454929577464789e-05,
"loss": 0.0888,
"step": 6590
},
{
"epoch": 1.55,
"grad_norm": 1.0683521032333374,
"learning_rate": 3.4525821596244135e-05,
"loss": 0.0611,
"step": 6600
},
{
"epoch": 1.55,
"grad_norm": 5.111327171325684,
"learning_rate": 3.4502347417840376e-05,
"loss": 0.0508,
"step": 6610
},
{
"epoch": 1.55,
"grad_norm": 1.7825844287872314,
"learning_rate": 3.447887323943662e-05,
"loss": 0.0691,
"step": 6620
},
{
"epoch": 1.56,
"grad_norm": 3.4711480140686035,
"learning_rate": 3.445539906103287e-05,
"loss": 0.1069,
"step": 6630
},
{
"epoch": 1.56,
"grad_norm": 0.459466814994812,
"learning_rate": 3.4431924882629104e-05,
"loss": 0.083,
"step": 6640
},
{
"epoch": 1.56,
"grad_norm": 3.5742270946502686,
"learning_rate": 3.440845070422535e-05,
"loss": 0.0809,
"step": 6650
},
{
"epoch": 1.56,
"grad_norm": 0.6339036226272583,
"learning_rate": 3.43849765258216e-05,
"loss": 0.0442,
"step": 6660
},
{
"epoch": 1.57,
"grad_norm": 2.5935568809509277,
"learning_rate": 3.436150234741784e-05,
"loss": 0.0746,
"step": 6670
},
{
"epoch": 1.57,
"grad_norm": 3.0025858879089355,
"learning_rate": 3.433802816901409e-05,
"loss": 0.0781,
"step": 6680
},
{
"epoch": 1.57,
"grad_norm": 1.0843034982681274,
"learning_rate": 3.4314553990610334e-05,
"loss": 0.061,
"step": 6690
},
{
"epoch": 1.57,
"grad_norm": 2.5993733406066895,
"learning_rate": 3.4291079812206575e-05,
"loss": 0.0881,
"step": 6700
},
{
"epoch": 1.58,
"grad_norm": 3.3276283740997314,
"learning_rate": 3.4267605633802815e-05,
"loss": 0.0815,
"step": 6710
},
{
"epoch": 1.58,
"grad_norm": 3.007833957672119,
"learning_rate": 3.424413145539906e-05,
"loss": 0.0699,
"step": 6720
},
{
"epoch": 1.58,
"grad_norm": 3.273491621017456,
"learning_rate": 3.42206572769953e-05,
"loss": 0.0873,
"step": 6730
},
{
"epoch": 1.58,
"grad_norm": 0.6225374341011047,
"learning_rate": 3.419718309859155e-05,
"loss": 0.0877,
"step": 6740
},
{
"epoch": 1.58,
"grad_norm": 2.3699939250946045,
"learning_rate": 3.41737089201878e-05,
"loss": 0.0628,
"step": 6750
},
{
"epoch": 1.59,
"grad_norm": 3.982626438140869,
"learning_rate": 3.415023474178404e-05,
"loss": 0.0818,
"step": 6760
},
{
"epoch": 1.59,
"grad_norm": 2.5500738620758057,
"learning_rate": 3.4126760563380286e-05,
"loss": 0.077,
"step": 6770
},
{
"epoch": 1.59,
"grad_norm": 1.9729875326156616,
"learning_rate": 3.4103286384976526e-05,
"loss": 0.0584,
"step": 6780
},
{
"epoch": 1.59,
"grad_norm": 3.429597854614258,
"learning_rate": 3.407981220657277e-05,
"loss": 0.0692,
"step": 6790
},
{
"epoch": 1.6,
"grad_norm": 3.9442617893218994,
"learning_rate": 3.4056338028169014e-05,
"loss": 0.0758,
"step": 6800
},
{
"epoch": 1.6,
"grad_norm": 3.5854039192199707,
"learning_rate": 3.403286384976526e-05,
"loss": 0.0759,
"step": 6810
},
{
"epoch": 1.6,
"grad_norm": 1.9469636678695679,
"learning_rate": 3.40093896713615e-05,
"loss": 0.0546,
"step": 6820
},
{
"epoch": 1.6,
"grad_norm": 1.9262510538101196,
"learning_rate": 3.398591549295775e-05,
"loss": 0.0852,
"step": 6830
},
{
"epoch": 1.61,
"grad_norm": 3.2512049674987793,
"learning_rate": 3.3962441314554e-05,
"loss": 0.0679,
"step": 6840
},
{
"epoch": 1.61,
"grad_norm": 2.976078987121582,
"learning_rate": 3.393896713615024e-05,
"loss": 0.0551,
"step": 6850
},
{
"epoch": 1.61,
"grad_norm": 1.684304118156433,
"learning_rate": 3.391549295774648e-05,
"loss": 0.0569,
"step": 6860
},
{
"epoch": 1.61,
"grad_norm": 2.2591466903686523,
"learning_rate": 3.3892018779342725e-05,
"loss": 0.0594,
"step": 6870
},
{
"epoch": 1.62,
"grad_norm": 2.1120476722717285,
"learning_rate": 3.3868544600938966e-05,
"loss": 0.0895,
"step": 6880
},
{
"epoch": 1.62,
"grad_norm": 0.734380304813385,
"learning_rate": 3.384507042253521e-05,
"loss": 0.0515,
"step": 6890
},
{
"epoch": 1.62,
"grad_norm": 2.5207552909851074,
"learning_rate": 3.382159624413146e-05,
"loss": 0.0824,
"step": 6900
},
{
"epoch": 1.62,
"grad_norm": 1.5820809602737427,
"learning_rate": 3.37981220657277e-05,
"loss": 0.0984,
"step": 6910
},
{
"epoch": 1.62,
"grad_norm": 2.8833916187286377,
"learning_rate": 3.377464788732395e-05,
"loss": 0.0718,
"step": 6920
},
{
"epoch": 1.63,
"grad_norm": 1.4902230501174927,
"learning_rate": 3.375117370892019e-05,
"loss": 0.059,
"step": 6930
},
{
"epoch": 1.63,
"grad_norm": 2.578580141067505,
"learning_rate": 3.372769953051643e-05,
"loss": 0.0768,
"step": 6940
},
{
"epoch": 1.63,
"grad_norm": 1.8747726678848267,
"learning_rate": 3.370422535211268e-05,
"loss": 0.0996,
"step": 6950
},
{
"epoch": 1.63,
"grad_norm": 1.386628270149231,
"learning_rate": 3.3680751173708924e-05,
"loss": 0.0448,
"step": 6960
},
{
"epoch": 1.64,
"grad_norm": 1.6875271797180176,
"learning_rate": 3.3657276995305165e-05,
"loss": 0.1286,
"step": 6970
},
{
"epoch": 1.64,
"grad_norm": 1.8919246196746826,
"learning_rate": 3.363380281690141e-05,
"loss": 0.0644,
"step": 6980
},
{
"epoch": 1.64,
"grad_norm": 1.1084660291671753,
"learning_rate": 3.361032863849766e-05,
"loss": 0.0652,
"step": 6990
},
{
"epoch": 1.64,
"grad_norm": 2.3537535667419434,
"learning_rate": 3.35868544600939e-05,
"loss": 0.0844,
"step": 7000
},
{
"epoch": 1.64,
"eval_loss": 0.06981974095106125,
"eval_macro/f1": 0.895093218534528,
"eval_macro/precision": 0.8993744432665862,
"eval_macro/recall": 0.893346057078023,
"eval_micro/f1": 0.8977924262711573,
"eval_micro/precision": 0.9003580773998072,
"eval_micro/recall": 0.8952413557001027,
"eval_runtime": 28.2589,
"eval_samples/accuracy": 0.885792536802465,
"eval_samples_per_second": 516.828,
"eval_steps_per_second": 16.172,
"step": 7000
},
{
"epoch": 1.65,
"grad_norm": 2.0603086948394775,
"learning_rate": 3.356338028169014e-05,
"loss": 0.08,
"step": 7010
},
{
"epoch": 1.65,
"grad_norm": 3.941000461578369,
"learning_rate": 3.353990610328639e-05,
"loss": 0.0561,
"step": 7020
},
{
"epoch": 1.65,
"grad_norm": 1.4806817770004272,
"learning_rate": 3.351643192488263e-05,
"loss": 0.0699,
"step": 7030
},
{
"epoch": 1.65,
"grad_norm": 0.9418378472328186,
"learning_rate": 3.3492957746478876e-05,
"loss": 0.062,
"step": 7040
},
{
"epoch": 1.65,
"grad_norm": 2.2413206100463867,
"learning_rate": 3.3469483568075117e-05,
"loss": 0.0643,
"step": 7050
},
{
"epoch": 1.66,
"grad_norm": 0.5685548782348633,
"learning_rate": 3.3446009389671364e-05,
"loss": 0.0621,
"step": 7060
},
{
"epoch": 1.66,
"grad_norm": 2.528632640838623,
"learning_rate": 3.342253521126761e-05,
"loss": 0.1233,
"step": 7070
},
{
"epoch": 1.66,
"grad_norm": 0.1424780786037445,
"learning_rate": 3.339906103286385e-05,
"loss": 0.0732,
"step": 7080
},
{
"epoch": 1.66,
"grad_norm": 1.396640658378601,
"learning_rate": 3.337558685446009e-05,
"loss": 0.0749,
"step": 7090
},
{
"epoch": 1.67,
"grad_norm": 0.4843108057975769,
"learning_rate": 3.335211267605634e-05,
"loss": 0.0604,
"step": 7100
},
{
"epoch": 1.67,
"grad_norm": 1.4272027015686035,
"learning_rate": 3.332863849765258e-05,
"loss": 0.0476,
"step": 7110
},
{
"epoch": 1.67,
"grad_norm": 1.5079143047332764,
"learning_rate": 3.330516431924883e-05,
"loss": 0.0747,
"step": 7120
},
{
"epoch": 1.67,
"grad_norm": 2.5758399963378906,
"learning_rate": 3.3281690140845075e-05,
"loss": 0.0578,
"step": 7130
},
{
"epoch": 1.68,
"grad_norm": 3.6596169471740723,
"learning_rate": 3.3258215962441316e-05,
"loss": 0.0586,
"step": 7140
},
{
"epoch": 1.68,
"grad_norm": 2.591481924057007,
"learning_rate": 3.323474178403756e-05,
"loss": 0.0571,
"step": 7150
},
{
"epoch": 1.68,
"grad_norm": 0.9179452657699585,
"learning_rate": 3.3211267605633804e-05,
"loss": 0.0722,
"step": 7160
},
{
"epoch": 1.68,
"grad_norm": 2.251352071762085,
"learning_rate": 3.3187793427230044e-05,
"loss": 0.0784,
"step": 7170
},
{
"epoch": 1.69,
"grad_norm": 2.1384119987487793,
"learning_rate": 3.316431924882629e-05,
"loss": 0.0814,
"step": 7180
},
{
"epoch": 1.69,
"grad_norm": 0.7491868138313293,
"learning_rate": 3.314084507042254e-05,
"loss": 0.0701,
"step": 7190
},
{
"epoch": 1.69,
"grad_norm": 2.3708689212799072,
"learning_rate": 3.311737089201878e-05,
"loss": 0.0816,
"step": 7200
},
{
"epoch": 1.69,
"grad_norm": 1.7296544313430786,
"learning_rate": 3.309389671361503e-05,
"loss": 0.0704,
"step": 7210
},
{
"epoch": 1.69,
"grad_norm": 1.8387882709503174,
"learning_rate": 3.307042253521127e-05,
"loss": 0.0669,
"step": 7220
},
{
"epoch": 1.7,
"grad_norm": 0.6314681768417358,
"learning_rate": 3.304694835680751e-05,
"loss": 0.0657,
"step": 7230
},
{
"epoch": 1.7,
"grad_norm": 0.852002739906311,
"learning_rate": 3.3023474178403755e-05,
"loss": 0.0562,
"step": 7240
},
{
"epoch": 1.7,
"grad_norm": 2.0356638431549072,
"learning_rate": 3.3e-05,
"loss": 0.0793,
"step": 7250
},
{
"epoch": 1.7,
"grad_norm": 1.4071274995803833,
"learning_rate": 3.297652582159624e-05,
"loss": 0.062,
"step": 7260
},
{
"epoch": 1.71,
"grad_norm": 3.241372585296631,
"learning_rate": 3.295305164319249e-05,
"loss": 0.0774,
"step": 7270
},
{
"epoch": 1.71,
"grad_norm": 0.8536049127578735,
"learning_rate": 3.292957746478874e-05,
"loss": 0.0507,
"step": 7280
},
{
"epoch": 1.71,
"grad_norm": 2.601881980895996,
"learning_rate": 3.290610328638498e-05,
"loss": 0.0837,
"step": 7290
},
{
"epoch": 1.71,
"grad_norm": 0.7648743391036987,
"learning_rate": 3.288262910798122e-05,
"loss": 0.0598,
"step": 7300
},
{
"epoch": 1.72,
"grad_norm": 1.050346851348877,
"learning_rate": 3.2859154929577466e-05,
"loss": 0.0676,
"step": 7310
},
{
"epoch": 1.72,
"grad_norm": 0.15019965171813965,
"learning_rate": 3.283568075117371e-05,
"loss": 0.0539,
"step": 7320
},
{
"epoch": 1.72,
"grad_norm": 1.2535006999969482,
"learning_rate": 3.2812206572769954e-05,
"loss": 0.0478,
"step": 7330
},
{
"epoch": 1.72,
"grad_norm": 1.989013433456421,
"learning_rate": 3.27887323943662e-05,
"loss": 0.0599,
"step": 7340
},
{
"epoch": 1.73,
"grad_norm": 4.4332756996154785,
"learning_rate": 3.276525821596244e-05,
"loss": 0.0609,
"step": 7350
},
{
"epoch": 1.73,
"grad_norm": 2.3076884746551514,
"learning_rate": 3.274178403755869e-05,
"loss": 0.058,
"step": 7360
},
{
"epoch": 1.73,
"grad_norm": 1.2508978843688965,
"learning_rate": 3.271830985915493e-05,
"loss": 0.0802,
"step": 7370
},
{
"epoch": 1.73,
"grad_norm": 1.3526707887649536,
"learning_rate": 3.269483568075117e-05,
"loss": 0.048,
"step": 7380
},
{
"epoch": 1.73,
"grad_norm": 1.801395058631897,
"learning_rate": 3.267136150234742e-05,
"loss": 0.1073,
"step": 7390
},
{
"epoch": 1.74,
"grad_norm": 2.8772401809692383,
"learning_rate": 3.2647887323943665e-05,
"loss": 0.089,
"step": 7400
},
{
"epoch": 1.74,
"grad_norm": 1.380043387413025,
"learning_rate": 3.2624413145539906e-05,
"loss": 0.0525,
"step": 7410
},
{
"epoch": 1.74,
"grad_norm": 1.199769377708435,
"learning_rate": 3.260093896713615e-05,
"loss": 0.0799,
"step": 7420
},
{
"epoch": 1.74,
"grad_norm": 2.3984384536743164,
"learning_rate": 3.25774647887324e-05,
"loss": 0.0465,
"step": 7430
},
{
"epoch": 1.75,
"grad_norm": 0.5681291222572327,
"learning_rate": 3.255399061032864e-05,
"loss": 0.0328,
"step": 7440
},
{
"epoch": 1.75,
"grad_norm": 6.884750843048096,
"learning_rate": 3.253051643192488e-05,
"loss": 0.0789,
"step": 7450
},
{
"epoch": 1.75,
"grad_norm": 5.364428520202637,
"learning_rate": 3.250704225352113e-05,
"loss": 0.1007,
"step": 7460
},
{
"epoch": 1.75,
"grad_norm": 1.7261794805526733,
"learning_rate": 3.248356807511737e-05,
"loss": 0.0461,
"step": 7470
},
{
"epoch": 1.76,
"grad_norm": 2.6595466136932373,
"learning_rate": 3.246009389671362e-05,
"loss": 0.0682,
"step": 7480
},
{
"epoch": 1.76,
"grad_norm": 3.9795382022857666,
"learning_rate": 3.2436619718309864e-05,
"loss": 0.0717,
"step": 7490
},
{
"epoch": 1.76,
"grad_norm": 2.648000955581665,
"learning_rate": 3.2413145539906105e-05,
"loss": 0.0751,
"step": 7500
},
{
"epoch": 1.76,
"eval_loss": 0.07527824491262436,
"eval_macro/f1": 0.8894495203537635,
"eval_macro/precision": 0.8925620624824648,
"eval_macro/recall": 0.8900694450587358,
"eval_micro/f1": 0.8929048156894959,
"eval_micro/precision": 0.8926909389542842,
"eval_micro/recall": 0.8931187949332421,
"eval_runtime": 29.2466,
"eval_samples/accuracy": 0.8813420061622732,
"eval_samples_per_second": 499.375,
"eval_steps_per_second": 15.626,
"step": 7500
},
{
"epoch": 1.76,
"grad_norm": 1.2241559028625488,
"learning_rate": 3.238967136150235e-05,
"loss": 0.0636,
"step": 7510
},
{
"epoch": 1.77,
"grad_norm": 1.2783581018447876,
"learning_rate": 3.236619718309859e-05,
"loss": 0.058,
"step": 7520
},
{
"epoch": 1.77,
"grad_norm": 1.1444569826126099,
"learning_rate": 3.234272300469483e-05,
"loss": 0.0529,
"step": 7530
},
{
"epoch": 1.77,
"grad_norm": 3.06350040435791,
"learning_rate": 3.231924882629108e-05,
"loss": 0.083,
"step": 7540
},
{
"epoch": 1.77,
"grad_norm": 0.7670800685882568,
"learning_rate": 3.229577464788733e-05,
"loss": 0.056,
"step": 7550
},
{
"epoch": 1.77,
"grad_norm": 2.1597325801849365,
"learning_rate": 3.227230046948357e-05,
"loss": 0.0796,
"step": 7560
},
{
"epoch": 1.78,
"grad_norm": 1.469767451286316,
"learning_rate": 3.2248826291079816e-05,
"loss": 0.0595,
"step": 7570
},
{
"epoch": 1.78,
"grad_norm": 1.9761245250701904,
"learning_rate": 3.222535211267606e-05,
"loss": 0.0757,
"step": 7580
},
{
"epoch": 1.78,
"grad_norm": 1.3863333463668823,
"learning_rate": 3.2201877934272304e-05,
"loss": 0.102,
"step": 7590
},
{
"epoch": 1.78,
"grad_norm": 2.2688632011413574,
"learning_rate": 3.2178403755868544e-05,
"loss": 0.0489,
"step": 7600
},
{
"epoch": 1.79,
"grad_norm": 1.1232688426971436,
"learning_rate": 3.215492957746479e-05,
"loss": 0.0621,
"step": 7610
},
{
"epoch": 1.79,
"grad_norm": 1.6261709928512573,
"learning_rate": 3.213145539906103e-05,
"loss": 0.0464,
"step": 7620
},
{
"epoch": 1.79,
"grad_norm": 1.0990047454833984,
"learning_rate": 3.210798122065728e-05,
"loss": 0.0756,
"step": 7630
},
{
"epoch": 1.79,
"grad_norm": 1.7575910091400146,
"learning_rate": 3.208450704225353e-05,
"loss": 0.0865,
"step": 7640
},
{
"epoch": 1.8,
"grad_norm": 0.6897794008255005,
"learning_rate": 3.206103286384977e-05,
"loss": 0.0874,
"step": 7650
},
{
"epoch": 1.8,
"grad_norm": 3.900545120239258,
"learning_rate": 3.2037558685446015e-05,
"loss": 0.0491,
"step": 7660
},
{
"epoch": 1.8,
"grad_norm": 1.0206152200698853,
"learning_rate": 3.2014084507042256e-05,
"loss": 0.0678,
"step": 7670
},
{
"epoch": 1.8,
"grad_norm": 1.4470140933990479,
"learning_rate": 3.1990610328638496e-05,
"loss": 0.0817,
"step": 7680
},
{
"epoch": 1.81,
"grad_norm": 1.211830496788025,
"learning_rate": 3.1967136150234743e-05,
"loss": 0.0651,
"step": 7690
},
{
"epoch": 1.81,
"grad_norm": 1.4572837352752686,
"learning_rate": 3.1943661971830984e-05,
"loss": 0.0527,
"step": 7700
},
{
"epoch": 1.81,
"grad_norm": 1.8414143323898315,
"learning_rate": 3.192018779342723e-05,
"loss": 0.0535,
"step": 7710
},
{
"epoch": 1.81,
"grad_norm": 0.916439950466156,
"learning_rate": 3.189671361502348e-05,
"loss": 0.0653,
"step": 7720
},
{
"epoch": 1.81,
"grad_norm": 2.0691421031951904,
"learning_rate": 3.187323943661972e-05,
"loss": 0.0717,
"step": 7730
},
{
"epoch": 1.82,
"grad_norm": 0.38336944580078125,
"learning_rate": 3.184976525821597e-05,
"loss": 0.0677,
"step": 7740
},
{
"epoch": 1.82,
"grad_norm": 1.506400465965271,
"learning_rate": 3.182629107981221e-05,
"loss": 0.0585,
"step": 7750
},
{
"epoch": 1.82,
"grad_norm": 0.8172094821929932,
"learning_rate": 3.180281690140845e-05,
"loss": 0.0807,
"step": 7760
},
{
"epoch": 1.82,
"grad_norm": 1.8917680978775024,
"learning_rate": 3.1779342723004695e-05,
"loss": 0.0714,
"step": 7770
},
{
"epoch": 1.83,
"grad_norm": 1.2013388872146606,
"learning_rate": 3.175586854460094e-05,
"loss": 0.0703,
"step": 7780
},
{
"epoch": 1.83,
"grad_norm": 0.5137236714363098,
"learning_rate": 3.173239436619718e-05,
"loss": 0.063,
"step": 7790
},
{
"epoch": 1.83,
"grad_norm": 4.698399543762207,
"learning_rate": 3.170892018779343e-05,
"loss": 0.0573,
"step": 7800
},
{
"epoch": 1.83,
"grad_norm": 2.3770229816436768,
"learning_rate": 3.168544600938967e-05,
"loss": 0.0771,
"step": 7810
},
{
"epoch": 1.84,
"grad_norm": 1.3372191190719604,
"learning_rate": 3.166197183098591e-05,
"loss": 0.074,
"step": 7820
},
{
"epoch": 1.84,
"grad_norm": 1.439131259918213,
"learning_rate": 3.163849765258216e-05,
"loss": 0.0552,
"step": 7830
},
{
"epoch": 1.84,
"grad_norm": 9.400777816772461,
"learning_rate": 3.1615023474178406e-05,
"loss": 0.0633,
"step": 7840
},
{
"epoch": 1.84,
"grad_norm": 1.817305326461792,
"learning_rate": 3.159154929577465e-05,
"loss": 0.0608,
"step": 7850
},
{
"epoch": 1.85,
"grad_norm": 3.0985467433929443,
"learning_rate": 3.1568075117370894e-05,
"loss": 0.0856,
"step": 7860
},
{
"epoch": 1.85,
"grad_norm": 2.4275131225585938,
"learning_rate": 3.154460093896714e-05,
"loss": 0.0829,
"step": 7870
},
{
"epoch": 1.85,
"grad_norm": 2.892664909362793,
"learning_rate": 3.152112676056338e-05,
"loss": 0.0714,
"step": 7880
},
{
"epoch": 1.85,
"grad_norm": 1.899727702140808,
"learning_rate": 3.149765258215962e-05,
"loss": 0.0689,
"step": 7890
},
{
"epoch": 1.85,
"grad_norm": 2.081380844116211,
"learning_rate": 3.147417840375587e-05,
"loss": 0.0618,
"step": 7900
},
{
"epoch": 1.86,
"grad_norm": 1.7399240732192993,
"learning_rate": 3.145070422535211e-05,
"loss": 0.062,
"step": 7910
},
{
"epoch": 1.86,
"grad_norm": 0.36951398849487305,
"learning_rate": 3.142723004694836e-05,
"loss": 0.0501,
"step": 7920
},
{
"epoch": 1.86,
"grad_norm": 0.5229721069335938,
"learning_rate": 3.1403755868544605e-05,
"loss": 0.088,
"step": 7930
},
{
"epoch": 1.86,
"grad_norm": 2.4102940559387207,
"learning_rate": 3.1380281690140846e-05,
"loss": 0.0403,
"step": 7940
},
{
"epoch": 1.87,
"grad_norm": 1.0259218215942383,
"learning_rate": 3.135680751173709e-05,
"loss": 0.07,
"step": 7950
},
{
"epoch": 1.87,
"grad_norm": 2.9200522899627686,
"learning_rate": 3.1333333333333334e-05,
"loss": 0.0934,
"step": 7960
},
{
"epoch": 1.87,
"grad_norm": 1.3367905616760254,
"learning_rate": 3.1309859154929574e-05,
"loss": 0.0799,
"step": 7970
},
{
"epoch": 1.87,
"grad_norm": 0.7008821964263916,
"learning_rate": 3.128638497652582e-05,
"loss": 0.0581,
"step": 7980
},
{
"epoch": 1.88,
"grad_norm": 6.872345447540283,
"learning_rate": 3.126291079812207e-05,
"loss": 0.0762,
"step": 7990
},
{
"epoch": 1.88,
"grad_norm": 2.593233346939087,
"learning_rate": 3.123943661971831e-05,
"loss": 0.065,
"step": 8000
},
{
"epoch": 1.88,
"eval_loss": 0.07052791863679886,
"eval_macro/f1": 0.8992477199985428,
"eval_macro/precision": 0.8994462156866554,
"eval_macro/recall": 0.9003938152127393,
"eval_micro/f1": 0.9019728519164358,
"eval_micro/precision": 0.9008332195055321,
"eval_micro/recall": 0.9031153714481343,
"eval_runtime": 28.0078,
"eval_samples/accuracy": 0.8907223553577542,
"eval_samples_per_second": 521.462,
"eval_steps_per_second": 16.317,
"step": 8000
},
{
"epoch": 1.88,
"grad_norm": 5.146605491638184,
"learning_rate": 3.121596244131456e-05,
"loss": 0.0658,
"step": 8010
},
{
"epoch": 1.88,
"grad_norm": 0.41272249817848206,
"learning_rate": 3.1192488262910804e-05,
"loss": 0.066,
"step": 8020
},
{
"epoch": 1.88,
"grad_norm": 0.21056464314460754,
"learning_rate": 3.1169014084507045e-05,
"loss": 0.0414,
"step": 8030
},
{
"epoch": 1.89,
"grad_norm": 2.248830556869507,
"learning_rate": 3.1145539906103285e-05,
"loss": 0.0526,
"step": 8040
},
{
"epoch": 1.89,
"grad_norm": 1.4600979089736938,
"learning_rate": 3.112206572769953e-05,
"loss": 0.0498,
"step": 8050
},
{
"epoch": 1.89,
"grad_norm": 0.836912214756012,
"learning_rate": 3.109859154929577e-05,
"loss": 0.0711,
"step": 8060
},
{
"epoch": 1.89,
"grad_norm": 1.9406927824020386,
"learning_rate": 3.107511737089202e-05,
"loss": 0.0398,
"step": 8070
},
{
"epoch": 1.9,
"grad_norm": 1.8109996318817139,
"learning_rate": 3.105164319248827e-05,
"loss": 0.0607,
"step": 8080
},
{
"epoch": 1.9,
"grad_norm": 3.5531466007232666,
"learning_rate": 3.102816901408451e-05,
"loss": 0.057,
"step": 8090
},
{
"epoch": 1.9,
"grad_norm": 0.3769063651561737,
"learning_rate": 3.1004694835680756e-05,
"loss": 0.0649,
"step": 8100
},
{
"epoch": 1.9,
"grad_norm": 0.7092906832695007,
"learning_rate": 3.0981220657276997e-05,
"loss": 0.0722,
"step": 8110
},
{
"epoch": 1.91,
"grad_norm": 1.2704371213912964,
"learning_rate": 3.095774647887324e-05,
"loss": 0.0629,
"step": 8120
},
{
"epoch": 1.91,
"grad_norm": 0.6935750842094421,
"learning_rate": 3.0934272300469484e-05,
"loss": 0.0519,
"step": 8130
},
{
"epoch": 1.91,
"grad_norm": 1.7571748495101929,
"learning_rate": 3.091079812206573e-05,
"loss": 0.0436,
"step": 8140
},
{
"epoch": 1.91,
"grad_norm": 0.3458290100097656,
"learning_rate": 3.088732394366197e-05,
"loss": 0.0628,
"step": 8150
},
{
"epoch": 1.92,
"grad_norm": 2.861957311630249,
"learning_rate": 3.086384976525822e-05,
"loss": 0.0603,
"step": 8160
},
{
"epoch": 1.92,
"grad_norm": 2.1730966567993164,
"learning_rate": 3.084037558685447e-05,
"loss": 0.1088,
"step": 8170
},
{
"epoch": 1.92,
"grad_norm": 0.8724583983421326,
"learning_rate": 3.081690140845071e-05,
"loss": 0.0419,
"step": 8180
},
{
"epoch": 1.92,
"grad_norm": 4.432601451873779,
"learning_rate": 3.079342723004695e-05,
"loss": 0.0556,
"step": 8190
},
{
"epoch": 1.92,
"grad_norm": 1.5362699031829834,
"learning_rate": 3.0769953051643196e-05,
"loss": 0.0514,
"step": 8200
},
{
"epoch": 1.93,
"grad_norm": 3.248563766479492,
"learning_rate": 3.0746478873239436e-05,
"loss": 0.0713,
"step": 8210
},
{
"epoch": 1.93,
"grad_norm": 2.2945611476898193,
"learning_rate": 3.0723004694835683e-05,
"loss": 0.0553,
"step": 8220
},
{
"epoch": 1.93,
"grad_norm": 1.3250812292099,
"learning_rate": 3.069953051643193e-05,
"loss": 0.0577,
"step": 8230
},
{
"epoch": 1.93,
"grad_norm": 2.426212787628174,
"learning_rate": 3.067605633802817e-05,
"loss": 0.086,
"step": 8240
},
{
"epoch": 1.94,
"grad_norm": 1.70464289188385,
"learning_rate": 3.065258215962442e-05,
"loss": 0.0542,
"step": 8250
},
{
"epoch": 1.94,
"grad_norm": 1.791068434715271,
"learning_rate": 3.062910798122066e-05,
"loss": 0.0697,
"step": 8260
},
{
"epoch": 1.94,
"grad_norm": 1.9915051460266113,
"learning_rate": 3.06056338028169e-05,
"loss": 0.0711,
"step": 8270
},
{
"epoch": 1.94,
"grad_norm": 0.4869524836540222,
"learning_rate": 3.058215962441315e-05,
"loss": 0.0541,
"step": 8280
},
{
"epoch": 1.95,
"grad_norm": 0.5803549885749817,
"learning_rate": 3.055868544600939e-05,
"loss": 0.0392,
"step": 8290
},
{
"epoch": 1.95,
"grad_norm": 1.3323622941970825,
"learning_rate": 3.0535211267605635e-05,
"loss": 0.0631,
"step": 8300
},
{
"epoch": 1.95,
"grad_norm": 2.372645139694214,
"learning_rate": 3.051173708920188e-05,
"loss": 0.0815,
"step": 8310
},
{
"epoch": 1.95,
"grad_norm": 1.5064605474472046,
"learning_rate": 3.048826291079812e-05,
"loss": 0.0925,
"step": 8320
},
{
"epoch": 1.96,
"grad_norm": 2.408501625061035,
"learning_rate": 3.0464788732394367e-05,
"loss": 0.0769,
"step": 8330
},
{
"epoch": 1.96,
"grad_norm": 1.6267452239990234,
"learning_rate": 3.0441314553990614e-05,
"loss": 0.0574,
"step": 8340
},
{
"epoch": 1.96,
"grad_norm": 2.341489791870117,
"learning_rate": 3.0417840375586855e-05,
"loss": 0.0665,
"step": 8350
},
{
"epoch": 1.96,
"grad_norm": 2.8234708309173584,
"learning_rate": 3.03943661971831e-05,
"loss": 0.0629,
"step": 8360
},
{
"epoch": 1.96,
"grad_norm": 0.7846190333366394,
"learning_rate": 3.0370892018779346e-05,
"loss": 0.0646,
"step": 8370
},
{
"epoch": 1.97,
"grad_norm": 2.0671329498291016,
"learning_rate": 3.0347417840375587e-05,
"loss": 0.0667,
"step": 8380
},
{
"epoch": 1.97,
"grad_norm": 1.061323881149292,
"learning_rate": 3.032394366197183e-05,
"loss": 0.0676,
"step": 8390
},
{
"epoch": 1.97,
"grad_norm": 2.8302061557769775,
"learning_rate": 3.0300469483568078e-05,
"loss": 0.0454,
"step": 8400
},
{
"epoch": 1.97,
"grad_norm": 2.0348949432373047,
"learning_rate": 3.027699530516432e-05,
"loss": 0.0839,
"step": 8410
},
{
"epoch": 1.98,
"grad_norm": 4.4613494873046875,
"learning_rate": 3.0253521126760566e-05,
"loss": 0.0617,
"step": 8420
},
{
"epoch": 1.98,
"grad_norm": 1.2178605794906616,
"learning_rate": 3.023004694835681e-05,
"loss": 0.0877,
"step": 8430
},
{
"epoch": 1.98,
"grad_norm": 0.9891900420188904,
"learning_rate": 3.020657276995305e-05,
"loss": 0.0658,
"step": 8440
},
{
"epoch": 1.98,
"grad_norm": 1.4533942937850952,
"learning_rate": 3.0183098591549298e-05,
"loss": 0.0901,
"step": 8450
},
{
"epoch": 1.99,
"grad_norm": 1.6907552480697632,
"learning_rate": 3.0159624413145542e-05,
"loss": 0.053,
"step": 8460
},
{
"epoch": 1.99,
"grad_norm": 0.9100379943847656,
"learning_rate": 3.0136150234741782e-05,
"loss": 0.0392,
"step": 8470
},
{
"epoch": 1.99,
"grad_norm": 1.1074556112289429,
"learning_rate": 3.011267605633803e-05,
"loss": 0.0781,
"step": 8480
},
{
"epoch": 1.99,
"grad_norm": 2.8616244792938232,
"learning_rate": 3.0089201877934277e-05,
"loss": 0.0448,
"step": 8490
},
{
"epoch": 2.0,
"grad_norm": 3.9619174003601074,
"learning_rate": 3.0065727699530514e-05,
"loss": 0.0803,
"step": 8500
},
{
"epoch": 2.0,
"eval_loss": 0.06560152769088745,
"eval_macro/f1": 0.9052200757632112,
"eval_macro/precision": 0.9052297506739012,
"eval_macro/recall": 0.905709010268969,
"eval_micro/f1": 0.9075227599425012,
"eval_micro/precision": 0.9072743447615137,
"eval_micro/recall": 0.9077713111947963,
"eval_runtime": 27.8697,
"eval_samples/accuracy": 0.8985963711057857,
"eval_samples_per_second": 524.046,
"eval_steps_per_second": 16.398,
"step": 8500
},
{
"epoch": 2.0,
"grad_norm": 2.557591199874878,
"learning_rate": 3.004225352112676e-05,
"loss": 0.072,
"step": 8510
},
{
"epoch": 2.0,
"grad_norm": 0.9702019691467285,
"learning_rate": 3.001877934272301e-05,
"loss": 0.0418,
"step": 8520
},
{
"epoch": 2.0,
"grad_norm": 2.3909103870391846,
"learning_rate": 2.999530516431925e-05,
"loss": 0.0548,
"step": 8530
},
{
"epoch": 2.0,
"grad_norm": 2.390047073364258,
"learning_rate": 2.9971830985915494e-05,
"loss": 0.0602,
"step": 8540
},
{
"epoch": 2.01,
"grad_norm": 0.9028347134590149,
"learning_rate": 2.994835680751174e-05,
"loss": 0.0489,
"step": 8550
},
{
"epoch": 2.01,
"grad_norm": 1.160230278968811,
"learning_rate": 2.992488262910798e-05,
"loss": 0.0542,
"step": 8560
},
{
"epoch": 2.01,
"grad_norm": 0.7624329924583435,
"learning_rate": 2.9901408450704225e-05,
"loss": 0.0568,
"step": 8570
},
{
"epoch": 2.01,
"grad_norm": 1.706424593925476,
"learning_rate": 2.9877934272300473e-05,
"loss": 0.0593,
"step": 8580
},
{
"epoch": 2.02,
"grad_norm": 2.120983839035034,
"learning_rate": 2.9854460093896713e-05,
"loss": 0.045,
"step": 8590
},
{
"epoch": 2.02,
"grad_norm": 1.4256010055541992,
"learning_rate": 2.983098591549296e-05,
"loss": 0.0559,
"step": 8600
},
{
"epoch": 2.02,
"grad_norm": 7.589344501495361,
"learning_rate": 2.9807511737089205e-05,
"loss": 0.0348,
"step": 8610
},
{
"epoch": 2.02,
"grad_norm": 2.1067638397216797,
"learning_rate": 2.9784037558685445e-05,
"loss": 0.057,
"step": 8620
},
{
"epoch": 2.03,
"grad_norm": 0.43838655948638916,
"learning_rate": 2.9760563380281693e-05,
"loss": 0.0371,
"step": 8630
},
{
"epoch": 2.03,
"grad_norm": 2.4295332431793213,
"learning_rate": 2.9737089201877936e-05,
"loss": 0.077,
"step": 8640
},
{
"epoch": 2.03,
"grad_norm": 3.2406866550445557,
"learning_rate": 2.9713615023474177e-05,
"loss": 0.0472,
"step": 8650
},
{
"epoch": 2.03,
"grad_norm": 2.5891144275665283,
"learning_rate": 2.9690140845070424e-05,
"loss": 0.0882,
"step": 8660
},
{
"epoch": 2.04,
"grad_norm": 0.5570282340049744,
"learning_rate": 2.9666666666666672e-05,
"loss": 0.0601,
"step": 8670
},
{
"epoch": 2.04,
"grad_norm": 1.3984103202819824,
"learning_rate": 2.9643192488262912e-05,
"loss": 0.0484,
"step": 8680
},
{
"epoch": 2.04,
"grad_norm": 8.100823402404785,
"learning_rate": 2.9619718309859156e-05,
"loss": 0.0383,
"step": 8690
},
{
"epoch": 2.04,
"grad_norm": 3.0917282104492188,
"learning_rate": 2.9596244131455404e-05,
"loss": 0.0452,
"step": 8700
},
{
"epoch": 2.04,
"grad_norm": 1.646676778793335,
"learning_rate": 2.9572769953051644e-05,
"loss": 0.0407,
"step": 8710
},
{
"epoch": 2.05,
"grad_norm": 0.6801753640174866,
"learning_rate": 2.9549295774647888e-05,
"loss": 0.0493,
"step": 8720
},
{
"epoch": 2.05,
"grad_norm": 0.7821123600006104,
"learning_rate": 2.9525821596244135e-05,
"loss": 0.0599,
"step": 8730
},
{
"epoch": 2.05,
"grad_norm": 1.8716830015182495,
"learning_rate": 2.9502347417840376e-05,
"loss": 0.0515,
"step": 8740
},
{
"epoch": 2.05,
"grad_norm": 0.6820869445800781,
"learning_rate": 2.9478873239436623e-05,
"loss": 0.0392,
"step": 8750
},
{
"epoch": 2.06,
"grad_norm": 2.376614809036255,
"learning_rate": 2.9455399061032867e-05,
"loss": 0.0568,
"step": 8760
},
{
"epoch": 2.06,
"grad_norm": 1.6240296363830566,
"learning_rate": 2.9431924882629108e-05,
"loss": 0.063,
"step": 8770
},
{
"epoch": 2.06,
"grad_norm": 2.458850145339966,
"learning_rate": 2.9408450704225355e-05,
"loss": 0.0508,
"step": 8780
},
{
"epoch": 2.06,
"grad_norm": 2.6562559604644775,
"learning_rate": 2.93849765258216e-05,
"loss": 0.0689,
"step": 8790
},
{
"epoch": 2.07,
"grad_norm": 3.8325881958007812,
"learning_rate": 2.936150234741784e-05,
"loss": 0.055,
"step": 8800
},
{
"epoch": 2.07,
"grad_norm": 3.6671085357666016,
"learning_rate": 2.9338028169014087e-05,
"loss": 0.0546,
"step": 8810
},
{
"epoch": 2.07,
"grad_norm": 3.2882981300354004,
"learning_rate": 2.931455399061033e-05,
"loss": 0.0687,
"step": 8820
},
{
"epoch": 2.07,
"grad_norm": 1.7834620475769043,
"learning_rate": 2.9291079812206572e-05,
"loss": 0.0535,
"step": 8830
},
{
"epoch": 2.08,
"grad_norm": 2.3183083534240723,
"learning_rate": 2.926760563380282e-05,
"loss": 0.0494,
"step": 8840
},
{
"epoch": 2.08,
"grad_norm": 1.2512354850769043,
"learning_rate": 2.924413145539906e-05,
"loss": 0.0537,
"step": 8850
},
{
"epoch": 2.08,
"grad_norm": 10.471619606018066,
"learning_rate": 2.9220657276995307e-05,
"loss": 0.0644,
"step": 8860
},
{
"epoch": 2.08,
"grad_norm": 2.114579439163208,
"learning_rate": 2.919718309859155e-05,
"loss": 0.0466,
"step": 8870
},
{
"epoch": 2.08,
"grad_norm": 0.7307925224304199,
"learning_rate": 2.917370892018779e-05,
"loss": 0.0536,
"step": 8880
},
{
"epoch": 2.09,
"grad_norm": 1.281708836555481,
"learning_rate": 2.915023474178404e-05,
"loss": 0.0627,
"step": 8890
},
{
"epoch": 2.09,
"grad_norm": 4.101240634918213,
"learning_rate": 2.9126760563380283e-05,
"loss": 0.0685,
"step": 8900
},
{
"epoch": 2.09,
"grad_norm": 0.2623077929019928,
"learning_rate": 2.9103286384976523e-05,
"loss": 0.0415,
"step": 8910
},
{
"epoch": 2.09,
"grad_norm": 2.209989547729492,
"learning_rate": 2.907981220657277e-05,
"loss": 0.063,
"step": 8920
},
{
"epoch": 2.1,
"grad_norm": 1.111786961555481,
"learning_rate": 2.9056338028169018e-05,
"loss": 0.0753,
"step": 8930
},
{
"epoch": 2.1,
"grad_norm": 2.8421497344970703,
"learning_rate": 2.903286384976526e-05,
"loss": 0.0259,
"step": 8940
},
{
"epoch": 2.1,
"grad_norm": 3.9581027030944824,
"learning_rate": 2.9009389671361503e-05,
"loss": 0.0534,
"step": 8950
},
{
"epoch": 2.1,
"grad_norm": 1.7602144479751587,
"learning_rate": 2.898591549295775e-05,
"loss": 0.0633,
"step": 8960
},
{
"epoch": 2.11,
"grad_norm": 3.0346012115478516,
"learning_rate": 2.896244131455399e-05,
"loss": 0.0488,
"step": 8970
},
{
"epoch": 2.11,
"grad_norm": 4.6578850746154785,
"learning_rate": 2.8938967136150234e-05,
"loss": 0.0563,
"step": 8980
},
{
"epoch": 2.11,
"grad_norm": 1.955411434173584,
"learning_rate": 2.8915492957746482e-05,
"loss": 0.0544,
"step": 8990
},
{
"epoch": 2.11,
"grad_norm": 0.6909317970275879,
"learning_rate": 2.8892018779342722e-05,
"loss": 0.0744,
"step": 9000
},
{
"epoch": 2.11,
"eval_loss": 0.0659680888056755,
"eval_macro/f1": 0.9030170471313492,
"eval_macro/precision": 0.9076760486157719,
"eval_macro/recall": 0.8993950760206066,
"eval_micro/f1": 0.9036940221503749,
"eval_micro/precision": 0.9079411154882853,
"eval_micro/recall": 0.899486477233824,
"eval_runtime": 28.0326,
"eval_samples/accuracy": 0.8906538856555974,
"eval_samples_per_second": 521.0,
"eval_steps_per_second": 16.302,
"step": 9000
},
{
"epoch": 2.12,
"grad_norm": 0.8493853807449341,
"learning_rate": 2.886854460093897e-05,
"loss": 0.0391,
"step": 9010
},
{
"epoch": 2.12,
"grad_norm": 1.0768928527832031,
"learning_rate": 2.8845070422535214e-05,
"loss": 0.051,
"step": 9020
},
{
"epoch": 2.12,
"grad_norm": 1.8907384872436523,
"learning_rate": 2.8821596244131454e-05,
"loss": 0.0382,
"step": 9030
},
{
"epoch": 2.12,
"grad_norm": 3.2443666458129883,
"learning_rate": 2.87981220657277e-05,
"loss": 0.0441,
"step": 9040
},
{
"epoch": 2.12,
"grad_norm": 0.6289181709289551,
"learning_rate": 2.8774647887323946e-05,
"loss": 0.0429,
"step": 9050
},
{
"epoch": 2.13,
"grad_norm": 2.1197361946105957,
"learning_rate": 2.8751173708920186e-05,
"loss": 0.0364,
"step": 9060
},
{
"epoch": 2.13,
"grad_norm": 4.240948677062988,
"learning_rate": 2.8727699530516433e-05,
"loss": 0.0597,
"step": 9070
},
{
"epoch": 2.13,
"grad_norm": 2.6654043197631836,
"learning_rate": 2.870422535211268e-05,
"loss": 0.0571,
"step": 9080
},
{
"epoch": 2.13,
"grad_norm": 1.7354888916015625,
"learning_rate": 2.8680751173708918e-05,
"loss": 0.0363,
"step": 9090
},
{
"epoch": 2.14,
"grad_norm": 0.7101438045501709,
"learning_rate": 2.8657276995305165e-05,
"loss": 0.0507,
"step": 9100
},
{
"epoch": 2.14,
"grad_norm": 0.45135313272476196,
"learning_rate": 2.8633802816901413e-05,
"loss": 0.0473,
"step": 9110
},
{
"epoch": 2.14,
"grad_norm": 1.211514949798584,
"learning_rate": 2.8610328638497653e-05,
"loss": 0.0565,
"step": 9120
},
{
"epoch": 2.14,
"grad_norm": 1.7397087812423706,
"learning_rate": 2.8586854460093897e-05,
"loss": 0.0608,
"step": 9130
},
{
"epoch": 2.15,
"grad_norm": 0.08779401332139969,
"learning_rate": 2.8563380281690145e-05,
"loss": 0.0606,
"step": 9140
},
{
"epoch": 2.15,
"grad_norm": 1.5981838703155518,
"learning_rate": 2.8539906103286385e-05,
"loss": 0.0359,
"step": 9150
},
{
"epoch": 2.15,
"grad_norm": 1.314452052116394,
"learning_rate": 2.851643192488263e-05,
"loss": 0.0521,
"step": 9160
},
{
"epoch": 2.15,
"grad_norm": 0.439706414937973,
"learning_rate": 2.8492957746478876e-05,
"loss": 0.0746,
"step": 9170
},
{
"epoch": 2.15,
"grad_norm": 2.364208221435547,
"learning_rate": 2.8469483568075117e-05,
"loss": 0.0672,
"step": 9180
},
{
"epoch": 2.16,
"grad_norm": 1.9845068454742432,
"learning_rate": 2.8446009389671364e-05,
"loss": 0.1084,
"step": 9190
},
{
"epoch": 2.16,
"grad_norm": 1.915307641029358,
"learning_rate": 2.842253521126761e-05,
"loss": 0.0886,
"step": 9200
},
{
"epoch": 2.16,
"grad_norm": 2.353233814239502,
"learning_rate": 2.839906103286385e-05,
"loss": 0.0528,
"step": 9210
},
{
"epoch": 2.16,
"grad_norm": 4.984065055847168,
"learning_rate": 2.8375586854460096e-05,
"loss": 0.0632,
"step": 9220
},
{
"epoch": 2.17,
"grad_norm": 2.3609771728515625,
"learning_rate": 2.835211267605634e-05,
"loss": 0.039,
"step": 9230
},
{
"epoch": 2.17,
"grad_norm": 1.2718889713287354,
"learning_rate": 2.832863849765258e-05,
"loss": 0.0316,
"step": 9240
},
{
"epoch": 2.17,
"grad_norm": 0.3682420551776886,
"learning_rate": 2.8305164319248828e-05,
"loss": 0.0469,
"step": 9250
},
{
"epoch": 2.17,
"grad_norm": 2.0798330307006836,
"learning_rate": 2.8281690140845075e-05,
"loss": 0.0343,
"step": 9260
},
{
"epoch": 2.18,
"grad_norm": 2.4822909832000732,
"learning_rate": 2.8258215962441316e-05,
"loss": 0.0608,
"step": 9270
},
{
"epoch": 2.18,
"grad_norm": 1.664831280708313,
"learning_rate": 2.823474178403756e-05,
"loss": 0.0674,
"step": 9280
},
{
"epoch": 2.18,
"grad_norm": 0.6971834897994995,
"learning_rate": 2.8211267605633807e-05,
"loss": 0.0561,
"step": 9290
},
{
"epoch": 2.18,
"grad_norm": 1.227864146232605,
"learning_rate": 2.8187793427230048e-05,
"loss": 0.0258,
"step": 9300
},
{
"epoch": 2.19,
"grad_norm": 1.0127270221710205,
"learning_rate": 2.8164319248826292e-05,
"loss": 0.0724,
"step": 9310
},
{
"epoch": 2.19,
"grad_norm": 1.144787073135376,
"learning_rate": 2.814084507042254e-05,
"loss": 0.0502,
"step": 9320
},
{
"epoch": 2.19,
"grad_norm": 3.193720817565918,
"learning_rate": 2.811737089201878e-05,
"loss": 0.0624,
"step": 9330
},
{
"epoch": 2.19,
"grad_norm": 1.395615816116333,
"learning_rate": 2.8093896713615027e-05,
"loss": 0.0616,
"step": 9340
},
{
"epoch": 2.19,
"grad_norm": 1.2624863386154175,
"learning_rate": 2.807042253521127e-05,
"loss": 0.0405,
"step": 9350
},
{
"epoch": 2.2,
"grad_norm": 1.1176111698150635,
"learning_rate": 2.804694835680751e-05,
"loss": 0.0425,
"step": 9360
},
{
"epoch": 2.2,
"grad_norm": 1.4671870470046997,
"learning_rate": 2.802347417840376e-05,
"loss": 0.0456,
"step": 9370
},
{
"epoch": 2.2,
"grad_norm": 0.19849862158298492,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.0388,
"step": 9380
},
{
"epoch": 2.2,
"grad_norm": 1.17306649684906,
"learning_rate": 2.7976525821596244e-05,
"loss": 0.0593,
"step": 9390
},
{
"epoch": 2.21,
"grad_norm": 1.8869905471801758,
"learning_rate": 2.795305164319249e-05,
"loss": 0.0549,
"step": 9400
},
{
"epoch": 2.21,
"grad_norm": 3.7512552738189697,
"learning_rate": 2.7929577464788735e-05,
"loss": 0.0546,
"step": 9410
},
{
"epoch": 2.21,
"grad_norm": 2.4991276264190674,
"learning_rate": 2.7906103286384975e-05,
"loss": 0.057,
"step": 9420
},
{
"epoch": 2.21,
"grad_norm": 1.2474638223648071,
"learning_rate": 2.7882629107981223e-05,
"loss": 0.0546,
"step": 9430
},
{
"epoch": 2.22,
"grad_norm": 1.6727144718170166,
"learning_rate": 2.7859154929577463e-05,
"loss": 0.0352,
"step": 9440
},
{
"epoch": 2.22,
"grad_norm": 0.7062831521034241,
"learning_rate": 2.783568075117371e-05,
"loss": 0.0234,
"step": 9450
},
{
"epoch": 2.22,
"grad_norm": 1.569942593574524,
"learning_rate": 2.7812206572769955e-05,
"loss": 0.045,
"step": 9460
},
{
"epoch": 2.22,
"grad_norm": 3.6019442081451416,
"learning_rate": 2.7788732394366195e-05,
"loss": 0.058,
"step": 9470
},
{
"epoch": 2.23,
"grad_norm": 1.731298565864563,
"learning_rate": 2.7765258215962443e-05,
"loss": 0.0549,
"step": 9480
},
{
"epoch": 2.23,
"grad_norm": 2.845656156539917,
"learning_rate": 2.7741784037558687e-05,
"loss": 0.0388,
"step": 9490
},
{
"epoch": 2.23,
"grad_norm": 3.3697431087493896,
"learning_rate": 2.7718309859154927e-05,
"loss": 0.0476,
"step": 9500
},
{
"epoch": 2.23,
"eval_loss": 0.07065404951572418,
"eval_macro/f1": 0.9069429246419818,
"eval_macro/precision": 0.9071087424181236,
"eval_macro/recall": 0.910052009047393,
"eval_micro/f1": 0.9090350128095644,
"eval_micro/precision": 0.9070211315610088,
"eval_micro/recall": 0.9110578568983225,
"eval_runtime": 30.3133,
"eval_samples/accuracy": 0.900513522766176,
"eval_samples_per_second": 481.802,
"eval_steps_per_second": 15.076,
"step": 9500
},
{
"epoch": 2.23,
"grad_norm": 5.096194744110107,
"learning_rate": 2.7694835680751174e-05,
"loss": 0.0354,
"step": 9510
},
{
"epoch": 2.23,
"grad_norm": 4.457573890686035,
"learning_rate": 2.7671361502347422e-05,
"loss": 0.0826,
"step": 9520
},
{
"epoch": 2.24,
"grad_norm": 1.3853645324707031,
"learning_rate": 2.7647887323943662e-05,
"loss": 0.0493,
"step": 9530
},
{
"epoch": 2.24,
"grad_norm": 6.033750534057617,
"learning_rate": 2.7624413145539906e-05,
"loss": 0.0437,
"step": 9540
},
{
"epoch": 2.24,
"grad_norm": 1.1589934825897217,
"learning_rate": 2.7600938967136154e-05,
"loss": 0.026,
"step": 9550
},
{
"epoch": 2.24,
"grad_norm": 0.9086150527000427,
"learning_rate": 2.7577464788732394e-05,
"loss": 0.058,
"step": 9560
},
{
"epoch": 2.25,
"grad_norm": 7.624779224395752,
"learning_rate": 2.7553990610328638e-05,
"loss": 0.0731,
"step": 9570
},
{
"epoch": 2.25,
"grad_norm": 2.7912135124206543,
"learning_rate": 2.7530516431924886e-05,
"loss": 0.0668,
"step": 9580
},
{
"epoch": 2.25,
"grad_norm": 0.7742284536361694,
"learning_rate": 2.7507042253521126e-05,
"loss": 0.0359,
"step": 9590
},
{
"epoch": 2.25,
"grad_norm": 0.47039780020713806,
"learning_rate": 2.7483568075117373e-05,
"loss": 0.0595,
"step": 9600
},
{
"epoch": 2.26,
"grad_norm": 2.1637165546417236,
"learning_rate": 2.7460093896713617e-05,
"loss": 0.0523,
"step": 9610
},
{
"epoch": 2.26,
"grad_norm": 0.5711060762405396,
"learning_rate": 2.7436619718309858e-05,
"loss": 0.0298,
"step": 9620
},
{
"epoch": 2.26,
"grad_norm": 0.5349418520927429,
"learning_rate": 2.7413145539906105e-05,
"loss": 0.0472,
"step": 9630
},
{
"epoch": 2.26,
"grad_norm": 1.7174102067947388,
"learning_rate": 2.738967136150235e-05,
"loss": 0.0619,
"step": 9640
},
{
"epoch": 2.27,
"grad_norm": 0.6406490802764893,
"learning_rate": 2.736619718309859e-05,
"loss": 0.0228,
"step": 9650
},
{
"epoch": 2.27,
"grad_norm": 1.3063549995422363,
"learning_rate": 2.7342723004694837e-05,
"loss": 0.0576,
"step": 9660
},
{
"epoch": 2.27,
"grad_norm": 2.3705832958221436,
"learning_rate": 2.731924882629108e-05,
"loss": 0.0528,
"step": 9670
},
{
"epoch": 2.27,
"grad_norm": 1.0235642194747925,
"learning_rate": 2.7295774647887322e-05,
"loss": 0.0736,
"step": 9680
},
{
"epoch": 2.27,
"grad_norm": 0.5643028616905212,
"learning_rate": 2.727230046948357e-05,
"loss": 0.0494,
"step": 9690
},
{
"epoch": 2.28,
"grad_norm": 5.157418251037598,
"learning_rate": 2.7248826291079816e-05,
"loss": 0.0589,
"step": 9700
},
{
"epoch": 2.28,
"grad_norm": 0.2736777663230896,
"learning_rate": 2.7225352112676057e-05,
"loss": 0.063,
"step": 9710
},
{
"epoch": 2.28,
"grad_norm": 2.397003650665283,
"learning_rate": 2.72018779342723e-05,
"loss": 0.0536,
"step": 9720
},
{
"epoch": 2.28,
"grad_norm": 2.01479172706604,
"learning_rate": 2.7178403755868548e-05,
"loss": 0.0529,
"step": 9730
},
{
"epoch": 2.29,
"grad_norm": 4.520587921142578,
"learning_rate": 2.715492957746479e-05,
"loss": 0.0524,
"step": 9740
},
{
"epoch": 2.29,
"grad_norm": 1.8048969507217407,
"learning_rate": 2.7131455399061033e-05,
"loss": 0.0475,
"step": 9750
},
{
"epoch": 2.29,
"grad_norm": 1.0479780435562134,
"learning_rate": 2.710798122065728e-05,
"loss": 0.0621,
"step": 9760
},
{
"epoch": 2.29,
"grad_norm": 2.384406089782715,
"learning_rate": 2.708450704225352e-05,
"loss": 0.0566,
"step": 9770
},
{
"epoch": 2.3,
"grad_norm": 0.4049185514450073,
"learning_rate": 2.7061032863849768e-05,
"loss": 0.0356,
"step": 9780
},
{
"epoch": 2.3,
"grad_norm": 1.6596524715423584,
"learning_rate": 2.7037558685446012e-05,
"loss": 0.0702,
"step": 9790
},
{
"epoch": 2.3,
"grad_norm": 3.465924024581909,
"learning_rate": 2.7014084507042253e-05,
"loss": 0.0482,
"step": 9800
},
{
"epoch": 2.3,
"grad_norm": 3.690901756286621,
"learning_rate": 2.69906103286385e-05,
"loss": 0.0486,
"step": 9810
},
{
"epoch": 2.31,
"grad_norm": 2.503009557723999,
"learning_rate": 2.6967136150234744e-05,
"loss": 0.0382,
"step": 9820
},
{
"epoch": 2.31,
"grad_norm": 0.9366639256477356,
"learning_rate": 2.6943661971830984e-05,
"loss": 0.0501,
"step": 9830
},
{
"epoch": 2.31,
"grad_norm": 0.9588242769241333,
"learning_rate": 2.6920187793427232e-05,
"loss": 0.0731,
"step": 9840
},
{
"epoch": 2.31,
"grad_norm": 3.305715799331665,
"learning_rate": 2.689671361502348e-05,
"loss": 0.0458,
"step": 9850
},
{
"epoch": 2.31,
"grad_norm": 4.218424320220947,
"learning_rate": 2.687323943661972e-05,
"loss": 0.0685,
"step": 9860
},
{
"epoch": 2.32,
"grad_norm": 1.0759609937667847,
"learning_rate": 2.6849765258215964e-05,
"loss": 0.0473,
"step": 9870
},
{
"epoch": 2.32,
"grad_norm": 1.3329447507858276,
"learning_rate": 2.682629107981221e-05,
"loss": 0.0581,
"step": 9880
},
{
"epoch": 2.32,
"grad_norm": 3.64178466796875,
"learning_rate": 2.680281690140845e-05,
"loss": 0.0244,
"step": 9890
},
{
"epoch": 2.32,
"grad_norm": 1.0093239545822144,
"learning_rate": 2.6779342723004696e-05,
"loss": 0.0315,
"step": 9900
},
{
"epoch": 2.33,
"grad_norm": 2.4063069820404053,
"learning_rate": 2.6755868544600943e-05,
"loss": 0.0416,
"step": 9910
},
{
"epoch": 2.33,
"grad_norm": 0.29868409037590027,
"learning_rate": 2.6732394366197184e-05,
"loss": 0.0354,
"step": 9920
},
{
"epoch": 2.33,
"grad_norm": 1.0405687093734741,
"learning_rate": 2.670892018779343e-05,
"loss": 0.0498,
"step": 9930
},
{
"epoch": 2.33,
"grad_norm": 0.8380749821662903,
"learning_rate": 2.6685446009389675e-05,
"loss": 0.0426,
"step": 9940
},
{
"epoch": 2.34,
"grad_norm": 1.7580487728118896,
"learning_rate": 2.6661971830985915e-05,
"loss": 0.0596,
"step": 9950
},
{
"epoch": 2.34,
"grad_norm": 0.4274970293045044,
"learning_rate": 2.6638497652582163e-05,
"loss": 0.0754,
"step": 9960
},
{
"epoch": 2.34,
"grad_norm": 1.6633018255233765,
"learning_rate": 2.6615023474178407e-05,
"loss": 0.0713,
"step": 9970
},
{
"epoch": 2.34,
"grad_norm": 1.3542613983154297,
"learning_rate": 2.6591549295774647e-05,
"loss": 0.0604,
"step": 9980
},
{
"epoch": 2.35,
"grad_norm": 2.1828737258911133,
"learning_rate": 2.6568075117370895e-05,
"loss": 0.0426,
"step": 9990
},
{
"epoch": 2.35,
"grad_norm": 0.2581530213356018,
"learning_rate": 2.654460093896714e-05,
"loss": 0.041,
"step": 10000
},
{
"epoch": 2.35,
"eval_loss": 0.06118783354759216,
"eval_macro/f1": 0.9121583171919195,
"eval_macro/precision": 0.9130776209618738,
"eval_macro/recall": 0.9122677551293914,
"eval_micro/f1": 0.9137872049264454,
"eval_micro/precision": 0.9131623931623931,
"eval_micro/recall": 0.9144128723040055,
"eval_runtime": 30.3668,
"eval_samples/accuracy": 0.905169462512838,
"eval_samples_per_second": 480.952,
"eval_steps_per_second": 15.049,
"step": 10000
},
{
"epoch": 2.35,
"grad_norm": 3.0459187030792236,
"learning_rate": 2.652112676056338e-05,
"loss": 0.0516,
"step": 10010
},
{
"epoch": 2.35,
"grad_norm": 2.2514278888702393,
"learning_rate": 2.6497652582159626e-05,
"loss": 0.0527,
"step": 10020
},
{
"epoch": 2.35,
"grad_norm": 0.5657632946968079,
"learning_rate": 2.6474178403755867e-05,
"loss": 0.0612,
"step": 10030
},
{
"epoch": 2.36,
"grad_norm": 1.468795895576477,
"learning_rate": 2.6450704225352114e-05,
"loss": 0.0426,
"step": 10040
},
{
"epoch": 2.36,
"grad_norm": 0.9087652564048767,
"learning_rate": 2.642723004694836e-05,
"loss": 0.0446,
"step": 10050
},
{
"epoch": 2.36,
"grad_norm": 2.5247867107391357,
"learning_rate": 2.64037558685446e-05,
"loss": 0.0724,
"step": 10060
},
{
"epoch": 2.36,
"grad_norm": 1.4081361293792725,
"learning_rate": 2.6380281690140846e-05,
"loss": 0.0492,
"step": 10070
},
{
"epoch": 2.37,
"grad_norm": 202.60423278808594,
"learning_rate": 2.635680751173709e-05,
"loss": 0.033,
"step": 10080
},
{
"epoch": 2.37,
"grad_norm": 2.9532530307769775,
"learning_rate": 2.633333333333333e-05,
"loss": 0.0627,
"step": 10090
},
{
"epoch": 2.37,
"grad_norm": 1.42042076587677,
"learning_rate": 2.6309859154929578e-05,
"loss": 0.046,
"step": 10100
},
{
"epoch": 2.37,
"grad_norm": 0.46867698431015015,
"learning_rate": 2.6286384976525825e-05,
"loss": 0.0549,
"step": 10110
},
{
"epoch": 2.38,
"grad_norm": 2.7932143211364746,
"learning_rate": 2.6262910798122066e-05,
"loss": 0.0465,
"step": 10120
},
{
"epoch": 2.38,
"grad_norm": 3.8150582313537598,
"learning_rate": 2.623943661971831e-05,
"loss": 0.0421,
"step": 10130
},
{
"epoch": 2.38,
"grad_norm": 0.6926825642585754,
"learning_rate": 2.6215962441314557e-05,
"loss": 0.0536,
"step": 10140
},
{
"epoch": 2.38,
"grad_norm": 1.488939642906189,
"learning_rate": 2.6192488262910798e-05,
"loss": 0.0572,
"step": 10150
},
{
"epoch": 2.38,
"grad_norm": 0.9060409069061279,
"learning_rate": 2.6169014084507042e-05,
"loss": 0.05,
"step": 10160
},
{
"epoch": 2.39,
"grad_norm": 2.041475296020508,
"learning_rate": 2.614553990610329e-05,
"loss": 0.0604,
"step": 10170
},
{
"epoch": 2.39,
"grad_norm": 0.3654033839702606,
"learning_rate": 2.612206572769953e-05,
"loss": 0.045,
"step": 10180
},
{
"epoch": 2.39,
"grad_norm": 5.171931266784668,
"learning_rate": 2.6098591549295777e-05,
"loss": 0.0303,
"step": 10190
},
{
"epoch": 2.39,
"grad_norm": 2.96879243850708,
"learning_rate": 2.607511737089202e-05,
"loss": 0.0634,
"step": 10200
},
{
"epoch": 2.4,
"grad_norm": 1.6135741472244263,
"learning_rate": 2.605164319248826e-05,
"loss": 0.0519,
"step": 10210
},
{
"epoch": 2.4,
"grad_norm": 0.8153369426727295,
"learning_rate": 2.602816901408451e-05,
"loss": 0.0564,
"step": 10220
},
{
"epoch": 2.4,
"grad_norm": 2.526282787322998,
"learning_rate": 2.6004694835680753e-05,
"loss": 0.0459,
"step": 10230
},
{
"epoch": 2.4,
"grad_norm": 2.152677059173584,
"learning_rate": 2.5981220657276994e-05,
"loss": 0.0656,
"step": 10240
},
{
"epoch": 2.41,
"grad_norm": 1.8138638734817505,
"learning_rate": 2.595774647887324e-05,
"loss": 0.0433,
"step": 10250
},
{
"epoch": 2.41,
"grad_norm": 2.1275670528411865,
"learning_rate": 2.5934272300469485e-05,
"loss": 0.0776,
"step": 10260
},
{
"epoch": 2.41,
"grad_norm": 1.4768202304840088,
"learning_rate": 2.5910798122065725e-05,
"loss": 0.0495,
"step": 10270
},
{
"epoch": 2.41,
"grad_norm": 5.845564365386963,
"learning_rate": 2.5887323943661973e-05,
"loss": 0.0506,
"step": 10280
},
{
"epoch": 2.42,
"grad_norm": 1.0060453414916992,
"learning_rate": 2.586384976525822e-05,
"loss": 0.0663,
"step": 10290
},
{
"epoch": 2.42,
"grad_norm": 1.312296986579895,
"learning_rate": 2.584037558685446e-05,
"loss": 0.0515,
"step": 10300
},
{
"epoch": 2.42,
"grad_norm": 0.9539171457290649,
"learning_rate": 2.5816901408450705e-05,
"loss": 0.0706,
"step": 10310
},
{
"epoch": 2.42,
"grad_norm": 2.121685266494751,
"learning_rate": 2.5793427230046952e-05,
"loss": 0.04,
"step": 10320
},
{
"epoch": 2.42,
"grad_norm": 2.037627696990967,
"learning_rate": 2.5769953051643193e-05,
"loss": 0.0423,
"step": 10330
},
{
"epoch": 2.43,
"grad_norm": 0.45661213994026184,
"learning_rate": 2.5746478873239437e-05,
"loss": 0.0451,
"step": 10340
},
{
"epoch": 2.43,
"grad_norm": 2.1163055896759033,
"learning_rate": 2.5723004694835684e-05,
"loss": 0.0662,
"step": 10350
},
{
"epoch": 2.43,
"grad_norm": 0.9851250052452087,
"learning_rate": 2.5699530516431924e-05,
"loss": 0.0421,
"step": 10360
},
{
"epoch": 2.43,
"grad_norm": 1.8168209791183472,
"learning_rate": 2.5676056338028172e-05,
"loss": 0.0577,
"step": 10370
},
{
"epoch": 2.44,
"grad_norm": 1.1686525344848633,
"learning_rate": 2.5652582159624416e-05,
"loss": 0.0542,
"step": 10380
},
{
"epoch": 2.44,
"grad_norm": 1.2663167715072632,
"learning_rate": 2.5629107981220656e-05,
"loss": 0.0577,
"step": 10390
},
{
"epoch": 2.44,
"grad_norm": 2.535994529724121,
"learning_rate": 2.5605633802816904e-05,
"loss": 0.0528,
"step": 10400
},
{
"epoch": 2.44,
"grad_norm": 97.92497253417969,
"learning_rate": 2.5582159624413148e-05,
"loss": 0.0661,
"step": 10410
},
{
"epoch": 2.45,
"grad_norm": 1.2818212509155273,
"learning_rate": 2.5558685446009388e-05,
"loss": 0.0483,
"step": 10420
},
{
"epoch": 2.45,
"grad_norm": 1.5373280048370361,
"learning_rate": 2.5535211267605636e-05,
"loss": 0.0508,
"step": 10430
},
{
"epoch": 2.45,
"grad_norm": 1.694848656654358,
"learning_rate": 2.5511737089201883e-05,
"loss": 0.0591,
"step": 10440
},
{
"epoch": 2.45,
"grad_norm": 3.430144786834717,
"learning_rate": 2.5488262910798123e-05,
"loss": 0.0621,
"step": 10450
},
{
"epoch": 2.46,
"grad_norm": 3.0223701000213623,
"learning_rate": 2.5464788732394367e-05,
"loss": 0.0578,
"step": 10460
},
{
"epoch": 2.46,
"grad_norm": 1.3250190019607544,
"learning_rate": 2.5441314553990615e-05,
"loss": 0.0413,
"step": 10470
},
{
"epoch": 2.46,
"grad_norm": 1.6544115543365479,
"learning_rate": 2.5417840375586855e-05,
"loss": 0.0567,
"step": 10480
},
{
"epoch": 2.46,
"grad_norm": 0.7126723527908325,
"learning_rate": 2.53943661971831e-05,
"loss": 0.0626,
"step": 10490
},
{
"epoch": 2.46,
"grad_norm": 0.39465492963790894,
"learning_rate": 2.5370892018779347e-05,
"loss": 0.0384,
"step": 10500
},
{
"epoch": 2.46,
"eval_loss": 0.06695384532213211,
"eval_macro/f1": 0.9068315076135826,
"eval_macro/precision": 0.9077032497066345,
"eval_macro/recall": 0.9072453550876894,
"eval_micro/f1": 0.908891928864569,
"eval_micro/precision": 0.9079603689784762,
"eval_micro/recall": 0.9098254022595001,
"eval_runtime": 30.7367,
"eval_samples/accuracy": 0.8998972954467648,
"eval_samples_per_second": 475.165,
"eval_steps_per_second": 14.868,
"step": 10500
},
{
"epoch": 2.47,
"grad_norm": 1.648552656173706,
"learning_rate": 2.5347417840375587e-05,
"loss": 0.0684,
"step": 10510
},
{
"epoch": 2.47,
"grad_norm": 1.2032274007797241,
"learning_rate": 2.5323943661971835e-05,
"loss": 0.0668,
"step": 10520
},
{
"epoch": 2.47,
"grad_norm": 2.3236911296844482,
"learning_rate": 2.530046948356808e-05,
"loss": 0.053,
"step": 10530
},
{
"epoch": 2.47,
"grad_norm": 0.6463286876678467,
"learning_rate": 2.527699530516432e-05,
"loss": 0.0502,
"step": 10540
},
{
"epoch": 2.48,
"grad_norm": 1.206963300704956,
"learning_rate": 2.5253521126760566e-05,
"loss": 0.0613,
"step": 10550
},
{
"epoch": 2.48,
"grad_norm": 1.3414201736450195,
"learning_rate": 2.523004694835681e-05,
"loss": 0.0704,
"step": 10560
},
{
"epoch": 2.48,
"grad_norm": 0.45636922121047974,
"learning_rate": 2.520657276995305e-05,
"loss": 0.0272,
"step": 10570
},
{
"epoch": 2.48,
"grad_norm": 1.4351228475570679,
"learning_rate": 2.51830985915493e-05,
"loss": 0.0348,
"step": 10580
},
{
"epoch": 2.49,
"grad_norm": 0.7133152484893799,
"learning_rate": 2.5159624413145542e-05,
"loss": 0.0471,
"step": 10590
},
{
"epoch": 2.49,
"grad_norm": 2.3375461101531982,
"learning_rate": 2.5136150234741783e-05,
"loss": 0.0593,
"step": 10600
},
{
"epoch": 2.49,
"grad_norm": 1.4897431135177612,
"learning_rate": 2.511267605633803e-05,
"loss": 0.062,
"step": 10610
},
{
"epoch": 2.49,
"grad_norm": 0.9795855283737183,
"learning_rate": 2.508920187793427e-05,
"loss": 0.0647,
"step": 10620
},
{
"epoch": 2.5,
"grad_norm": 2.317823886871338,
"learning_rate": 2.5065727699530518e-05,
"loss": 0.0556,
"step": 10630
},
{
"epoch": 2.5,
"grad_norm": 1.6250512599945068,
"learning_rate": 2.5042253521126762e-05,
"loss": 0.0564,
"step": 10640
},
{
"epoch": 2.5,
"grad_norm": 2.342941999435425,
"learning_rate": 2.5018779342723003e-05,
"loss": 0.0425,
"step": 10650
},
{
"epoch": 2.5,
"grad_norm": 3.147282838821411,
"learning_rate": 2.499530516431925e-05,
"loss": 0.0601,
"step": 10660
},
{
"epoch": 2.5,
"grad_norm": 0.04555138200521469,
"learning_rate": 2.4971830985915494e-05,
"loss": 0.0489,
"step": 10670
},
{
"epoch": 2.51,
"grad_norm": 0.3302970230579376,
"learning_rate": 2.4948356807511738e-05,
"loss": 0.0598,
"step": 10680
},
{
"epoch": 2.51,
"grad_norm": 2.386502504348755,
"learning_rate": 2.4924882629107982e-05,
"loss": 0.0612,
"step": 10690
},
{
"epoch": 2.51,
"grad_norm": 1.918862223625183,
"learning_rate": 2.490140845070423e-05,
"loss": 0.0522,
"step": 10700
},
{
"epoch": 2.51,
"grad_norm": 1.9548710584640503,
"learning_rate": 2.487793427230047e-05,
"loss": 0.0559,
"step": 10710
},
{
"epoch": 2.52,
"grad_norm": 2.2457354068756104,
"learning_rate": 2.4854460093896714e-05,
"loss": 0.0473,
"step": 10720
},
{
"epoch": 2.52,
"grad_norm": 2.8347527980804443,
"learning_rate": 2.483098591549296e-05,
"loss": 0.0529,
"step": 10730
},
{
"epoch": 2.52,
"grad_norm": 1.7313975095748901,
"learning_rate": 2.4807511737089205e-05,
"loss": 0.0702,
"step": 10740
},
{
"epoch": 2.52,
"grad_norm": 2.26188325881958,
"learning_rate": 2.4784037558685446e-05,
"loss": 0.0473,
"step": 10750
},
{
"epoch": 2.53,
"grad_norm": 0.3577061891555786,
"learning_rate": 2.476056338028169e-05,
"loss": 0.0426,
"step": 10760
},
{
"epoch": 2.53,
"grad_norm": 1.3557612895965576,
"learning_rate": 2.4737089201877937e-05,
"loss": 0.0294,
"step": 10770
},
{
"epoch": 2.53,
"grad_norm": 2.5885844230651855,
"learning_rate": 2.471361502347418e-05,
"loss": 0.0546,
"step": 10780
},
{
"epoch": 2.53,
"grad_norm": 2.567915201187134,
"learning_rate": 2.469014084507042e-05,
"loss": 0.0527,
"step": 10790
},
{
"epoch": 2.54,
"grad_norm": 3.9320240020751953,
"learning_rate": 2.466666666666667e-05,
"loss": 0.0528,
"step": 10800
},
{
"epoch": 2.54,
"grad_norm": 1.2177060842514038,
"learning_rate": 2.4643192488262913e-05,
"loss": 0.0594,
"step": 10810
},
{
"epoch": 2.54,
"grad_norm": 2.4854683876037598,
"learning_rate": 2.4619718309859153e-05,
"loss": 0.0631,
"step": 10820
},
{
"epoch": 2.54,
"grad_norm": 2.2723195552825928,
"learning_rate": 2.45962441314554e-05,
"loss": 0.04,
"step": 10830
},
{
"epoch": 2.54,
"grad_norm": 3.3895249366760254,
"learning_rate": 2.4572769953051645e-05,
"loss": 0.0461,
"step": 10840
},
{
"epoch": 2.55,
"grad_norm": 0.03994426131248474,
"learning_rate": 2.454929577464789e-05,
"loss": 0.0333,
"step": 10850
},
{
"epoch": 2.55,
"grad_norm": 1.1587481498718262,
"learning_rate": 2.4525821596244133e-05,
"loss": 0.0378,
"step": 10860
},
{
"epoch": 2.55,
"grad_norm": 2.166707754135132,
"learning_rate": 2.4502347417840377e-05,
"loss": 0.0627,
"step": 10870
},
{
"epoch": 2.55,
"grad_norm": 0.7003379464149475,
"learning_rate": 2.447887323943662e-05,
"loss": 0.038,
"step": 10880
},
{
"epoch": 2.56,
"grad_norm": 2.285989284515381,
"learning_rate": 2.4455399061032864e-05,
"loss": 0.0562,
"step": 10890
},
{
"epoch": 2.56,
"grad_norm": 4.629007816314697,
"learning_rate": 2.443192488262911e-05,
"loss": 0.0419,
"step": 10900
},
{
"epoch": 2.56,
"grad_norm": 3.119145154953003,
"learning_rate": 2.4408450704225352e-05,
"loss": 0.039,
"step": 10910
},
{
"epoch": 2.56,
"grad_norm": 1.251133680343628,
"learning_rate": 2.43849765258216e-05,
"loss": 0.083,
"step": 10920
},
{
"epoch": 2.57,
"grad_norm": 0.3550426661968231,
"learning_rate": 2.436150234741784e-05,
"loss": 0.0387,
"step": 10930
},
{
"epoch": 2.57,
"grad_norm": 1.6787813901901245,
"learning_rate": 2.4338028169014084e-05,
"loss": 0.0566,
"step": 10940
},
{
"epoch": 2.57,
"grad_norm": 11.414258003234863,
"learning_rate": 2.431455399061033e-05,
"loss": 0.0418,
"step": 10950
},
{
"epoch": 2.57,
"grad_norm": 4.182372093200684,
"learning_rate": 2.4291079812206576e-05,
"loss": 0.0413,
"step": 10960
},
{
"epoch": 2.58,
"grad_norm": 0.34739282727241516,
"learning_rate": 2.4267605633802816e-05,
"loss": 0.0407,
"step": 10970
},
{
"epoch": 2.58,
"grad_norm": 0.3519314229488373,
"learning_rate": 2.4244131455399063e-05,
"loss": 0.0236,
"step": 10980
},
{
"epoch": 2.58,
"grad_norm": 0.8023984432220459,
"learning_rate": 2.4220657276995307e-05,
"loss": 0.0592,
"step": 10990
},
{
"epoch": 2.58,
"grad_norm": 1.21758234500885,
"learning_rate": 2.419718309859155e-05,
"loss": 0.0469,
"step": 11000
},
{
"epoch": 2.58,
"eval_loss": 0.06562425941228867,
"eval_macro/f1": 0.9144028854078985,
"eval_macro/precision": 0.9130815641679679,
"eval_macro/recall": 0.9169312081190664,
"eval_micro/f1": 0.9161118508655126,
"eval_micro/precision": 0.9136475074911469,
"eval_micro/recall": 0.91858952413557,
"eval_runtime": 29.3898,
"eval_samples/accuracy": 0.9083875385142075,
"eval_samples_per_second": 496.94,
"eval_steps_per_second": 15.55,
"step": 11000
},
{
"epoch": 2.58,
"grad_norm": 2.0988662242889404,
"learning_rate": 2.4173708920187795e-05,
"loss": 0.0357,
"step": 11010
},
{
"epoch": 2.59,
"grad_norm": 5.835433483123779,
"learning_rate": 2.415023474178404e-05,
"loss": 0.0669,
"step": 11020
},
{
"epoch": 2.59,
"grad_norm": 3.476304531097412,
"learning_rate": 2.4126760563380283e-05,
"loss": 0.0633,
"step": 11030
},
{
"epoch": 2.59,
"grad_norm": 1.9310297966003418,
"learning_rate": 2.4103286384976527e-05,
"loss": 0.0461,
"step": 11040
},
{
"epoch": 2.59,
"grad_norm": 1.138595700263977,
"learning_rate": 2.407981220657277e-05,
"loss": 0.0579,
"step": 11050
},
{
"epoch": 2.6,
"grad_norm": 1.2649039030075073,
"learning_rate": 2.4056338028169015e-05,
"loss": 0.03,
"step": 11060
},
{
"epoch": 2.6,
"grad_norm": 1.3273512125015259,
"learning_rate": 2.403286384976526e-05,
"loss": 0.0563,
"step": 11070
},
{
"epoch": 2.6,
"grad_norm": 4.17151403427124,
"learning_rate": 2.4009389671361503e-05,
"loss": 0.069,
"step": 11080
},
{
"epoch": 2.6,
"grad_norm": 1.5479555130004883,
"learning_rate": 2.3985915492957747e-05,
"loss": 0.0344,
"step": 11090
},
{
"epoch": 2.61,
"grad_norm": 2.591539144515991,
"learning_rate": 2.396244131455399e-05,
"loss": 0.0374,
"step": 11100
},
{
"epoch": 2.61,
"grad_norm": 1.1022748947143555,
"learning_rate": 2.3938967136150235e-05,
"loss": 0.0411,
"step": 11110
},
{
"epoch": 2.61,
"grad_norm": 0.20835240185260773,
"learning_rate": 2.391549295774648e-05,
"loss": 0.0528,
"step": 11120
},
{
"epoch": 2.61,
"grad_norm": 0.4419134855270386,
"learning_rate": 2.3892018779342723e-05,
"loss": 0.0402,
"step": 11130
},
{
"epoch": 2.62,
"grad_norm": 1.681576132774353,
"learning_rate": 2.386854460093897e-05,
"loss": 0.0273,
"step": 11140
},
{
"epoch": 2.62,
"grad_norm": 2.0418591499328613,
"learning_rate": 2.384507042253521e-05,
"loss": 0.059,
"step": 11150
},
{
"epoch": 2.62,
"grad_norm": 1.7147711515426636,
"learning_rate": 2.3821596244131455e-05,
"loss": 0.0911,
"step": 11160
},
{
"epoch": 2.62,
"grad_norm": 2.6744132041931152,
"learning_rate": 2.3798122065727702e-05,
"loss": 0.0551,
"step": 11170
},
{
"epoch": 2.62,
"grad_norm": 0.905580461025238,
"learning_rate": 2.3774647887323946e-05,
"loss": 0.0537,
"step": 11180
},
{
"epoch": 2.63,
"grad_norm": 2.434372663497925,
"learning_rate": 2.3751173708920187e-05,
"loss": 0.05,
"step": 11190
},
{
"epoch": 2.63,
"grad_norm": 0.8721539378166199,
"learning_rate": 2.3727699530516434e-05,
"loss": 0.0351,
"step": 11200
},
{
"epoch": 2.63,
"grad_norm": 5.539384365081787,
"learning_rate": 2.3704225352112678e-05,
"loss": 0.035,
"step": 11210
},
{
"epoch": 2.63,
"grad_norm": 0.04614145681262016,
"learning_rate": 2.3680751173708922e-05,
"loss": 0.0502,
"step": 11220
},
{
"epoch": 2.64,
"grad_norm": 7.171780109405518,
"learning_rate": 2.3657276995305166e-05,
"loss": 0.043,
"step": 11230
},
{
"epoch": 2.64,
"grad_norm": 1.1372716426849365,
"learning_rate": 2.363380281690141e-05,
"loss": 0.037,
"step": 11240
},
{
"epoch": 2.64,
"grad_norm": 0.49709656834602356,
"learning_rate": 2.3610328638497654e-05,
"loss": 0.0735,
"step": 11250
},
{
"epoch": 2.64,
"grad_norm": 3.8573594093322754,
"learning_rate": 2.3586854460093898e-05,
"loss": 0.0439,
"step": 11260
},
{
"epoch": 2.65,
"grad_norm": 0.6819432973861694,
"learning_rate": 2.356338028169014e-05,
"loss": 0.038,
"step": 11270
},
{
"epoch": 2.65,
"grad_norm": 0.4849399924278259,
"learning_rate": 2.3539906103286386e-05,
"loss": 0.0423,
"step": 11280
},
{
"epoch": 2.65,
"grad_norm": 2.0761923789978027,
"learning_rate": 2.3516431924882633e-05,
"loss": 0.0359,
"step": 11290
},
{
"epoch": 2.65,
"grad_norm": 0.2302016168832779,
"learning_rate": 2.3492957746478874e-05,
"loss": 0.0437,
"step": 11300
},
{
"epoch": 2.65,
"grad_norm": 0.9668428301811218,
"learning_rate": 2.3469483568075117e-05,
"loss": 0.0711,
"step": 11310
},
{
"epoch": 2.66,
"grad_norm": 2.1143290996551514,
"learning_rate": 2.3446009389671365e-05,
"loss": 0.0559,
"step": 11320
},
{
"epoch": 2.66,
"grad_norm": 1.4595099687576294,
"learning_rate": 2.342253521126761e-05,
"loss": 0.04,
"step": 11330
},
{
"epoch": 2.66,
"grad_norm": 4.2474775314331055,
"learning_rate": 2.339906103286385e-05,
"loss": 0.071,
"step": 11340
},
{
"epoch": 2.66,
"grad_norm": 1.3273273706436157,
"learning_rate": 2.3375586854460093e-05,
"loss": 0.0637,
"step": 11350
},
{
"epoch": 2.67,
"grad_norm": 2.783879280090332,
"learning_rate": 2.335211267605634e-05,
"loss": 0.0379,
"step": 11360
},
{
"epoch": 2.67,
"grad_norm": 1.8824636936187744,
"learning_rate": 2.3328638497652585e-05,
"loss": 0.0643,
"step": 11370
},
{
"epoch": 2.67,
"grad_norm": 1.5119823217391968,
"learning_rate": 2.3305164319248825e-05,
"loss": 0.0412,
"step": 11380
},
{
"epoch": 2.67,
"grad_norm": 1.3782035112380981,
"learning_rate": 2.3281690140845073e-05,
"loss": 0.0433,
"step": 11390
},
{
"epoch": 2.68,
"grad_norm": 0.557918906211853,
"learning_rate": 2.3258215962441316e-05,
"loss": 0.056,
"step": 11400
},
{
"epoch": 2.68,
"grad_norm": 0.31254222989082336,
"learning_rate": 2.3234741784037557e-05,
"loss": 0.046,
"step": 11410
},
{
"epoch": 2.68,
"grad_norm": 0.7209023833274841,
"learning_rate": 2.3211267605633804e-05,
"loss": 0.048,
"step": 11420
},
{
"epoch": 2.68,
"grad_norm": 0.2861871123313904,
"learning_rate": 2.318779342723005e-05,
"loss": 0.0624,
"step": 11430
},
{
"epoch": 2.69,
"grad_norm": 1.4058027267456055,
"learning_rate": 2.3164319248826292e-05,
"loss": 0.0424,
"step": 11440
},
{
"epoch": 2.69,
"grad_norm": 2.840806007385254,
"learning_rate": 2.3140845070422536e-05,
"loss": 0.0492,
"step": 11450
},
{
"epoch": 2.69,
"grad_norm": 0.5399413108825684,
"learning_rate": 2.311737089201878e-05,
"loss": 0.041,
"step": 11460
},
{
"epoch": 2.69,
"grad_norm": 3.8651294708251953,
"learning_rate": 2.3093896713615024e-05,
"loss": 0.0494,
"step": 11470
},
{
"epoch": 2.69,
"grad_norm": 4.223033428192139,
"learning_rate": 2.3070422535211268e-05,
"loss": 0.0745,
"step": 11480
},
{
"epoch": 2.7,
"grad_norm": 1.967286229133606,
"learning_rate": 2.3046948356807512e-05,
"loss": 0.043,
"step": 11490
},
{
"epoch": 2.7,
"grad_norm": 2.8095009326934814,
"learning_rate": 2.3023474178403756e-05,
"loss": 0.0556,
"step": 11500
},
{
"epoch": 2.7,
"eval_loss": 0.0602407269179821,
"eval_macro/f1": 0.9160111322025722,
"eval_macro/precision": 0.9174229990455038,
"eval_macro/recall": 0.9169255864513151,
"eval_micro/f1": 0.9172736910502377,
"eval_micro/precision": 0.9163022683793386,
"eval_micro/recall": 0.9182471756247861,
"eval_runtime": 30.1851,
"eval_samples/accuracy": 0.907839780896953,
"eval_samples_per_second": 483.847,
"eval_steps_per_second": 15.14,
"step": 11500
},
{
"epoch": 2.7,
"grad_norm": 0.8641358017921448,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.0303,
"step": 11510
},
{
"epoch": 2.7,
"grad_norm": 1.8955144882202148,
"learning_rate": 2.2976525821596244e-05,
"loss": 0.0304,
"step": 11520
},
{
"epoch": 2.71,
"grad_norm": 2.131303310394287,
"learning_rate": 2.2953051643192488e-05,
"loss": 0.0781,
"step": 11530
},
{
"epoch": 2.71,
"grad_norm": 2.817549467086792,
"learning_rate": 2.2929577464788735e-05,
"loss": 0.0379,
"step": 11540
},
{
"epoch": 2.71,
"grad_norm": 1.6803205013275146,
"learning_rate": 2.290610328638498e-05,
"loss": 0.0526,
"step": 11550
},
{
"epoch": 2.71,
"grad_norm": 1.1056827306747437,
"learning_rate": 2.288262910798122e-05,
"loss": 0.0408,
"step": 11560
},
{
"epoch": 2.72,
"grad_norm": 3.293531894683838,
"learning_rate": 2.2859154929577467e-05,
"loss": 0.0737,
"step": 11570
},
{
"epoch": 2.72,
"grad_norm": 0.13934262096881866,
"learning_rate": 2.283568075117371e-05,
"loss": 0.0687,
"step": 11580
},
{
"epoch": 2.72,
"grad_norm": 1.664345622062683,
"learning_rate": 2.2812206572769955e-05,
"loss": 0.0529,
"step": 11590
},
{
"epoch": 2.72,
"grad_norm": 2.634932518005371,
"learning_rate": 2.27887323943662e-05,
"loss": 0.0725,
"step": 11600
},
{
"epoch": 2.73,
"grad_norm": 1.9947081804275513,
"learning_rate": 2.2765258215962443e-05,
"loss": 0.0424,
"step": 11610
},
{
"epoch": 2.73,
"grad_norm": 2.1717348098754883,
"learning_rate": 2.2741784037558687e-05,
"loss": 0.0511,
"step": 11620
},
{
"epoch": 2.73,
"grad_norm": 3.0661449432373047,
"learning_rate": 2.271830985915493e-05,
"loss": 0.0267,
"step": 11630
},
{
"epoch": 2.73,
"grad_norm": 2.451406478881836,
"learning_rate": 2.2694835680751175e-05,
"loss": 0.0553,
"step": 11640
},
{
"epoch": 2.73,
"grad_norm": 1.1184929609298706,
"learning_rate": 2.267136150234742e-05,
"loss": 0.0526,
"step": 11650
},
{
"epoch": 2.74,
"grad_norm": 2.386408567428589,
"learning_rate": 2.2647887323943663e-05,
"loss": 0.0566,
"step": 11660
},
{
"epoch": 2.74,
"grad_norm": 1.6783018112182617,
"learning_rate": 2.2624413145539907e-05,
"loss": 0.0299,
"step": 11670
},
{
"epoch": 2.74,
"grad_norm": 0.21903283894062042,
"learning_rate": 2.260093896713615e-05,
"loss": 0.0508,
"step": 11680
},
{
"epoch": 2.74,
"grad_norm": 1.843197226524353,
"learning_rate": 2.2577464788732395e-05,
"loss": 0.0572,
"step": 11690
},
{
"epoch": 2.75,
"grad_norm": 0.7903924584388733,
"learning_rate": 2.255399061032864e-05,
"loss": 0.0434,
"step": 11700
},
{
"epoch": 2.75,
"grad_norm": 1.0302156209945679,
"learning_rate": 2.2530516431924883e-05,
"loss": 0.0529,
"step": 11710
},
{
"epoch": 2.75,
"grad_norm": 3.6759471893310547,
"learning_rate": 2.2507042253521127e-05,
"loss": 0.0303,
"step": 11720
},
{
"epoch": 2.75,
"grad_norm": 2.0208933353424072,
"learning_rate": 2.2483568075117374e-05,
"loss": 0.0316,
"step": 11730
},
{
"epoch": 2.76,
"grad_norm": 1.2156116962432861,
"learning_rate": 2.2460093896713614e-05,
"loss": 0.033,
"step": 11740
},
{
"epoch": 2.76,
"grad_norm": 0.3793451488018036,
"learning_rate": 2.243661971830986e-05,
"loss": 0.0397,
"step": 11750
},
{
"epoch": 2.76,
"grad_norm": 2.972238779067993,
"learning_rate": 2.2413145539906106e-05,
"loss": 0.0561,
"step": 11760
},
{
"epoch": 2.76,
"grad_norm": 3.2234435081481934,
"learning_rate": 2.238967136150235e-05,
"loss": 0.0487,
"step": 11770
},
{
"epoch": 2.77,
"grad_norm": 1.875913143157959,
"learning_rate": 2.236619718309859e-05,
"loss": 0.0578,
"step": 11780
},
{
"epoch": 2.77,
"grad_norm": 0.3704244792461395,
"learning_rate": 2.2342723004694838e-05,
"loss": 0.05,
"step": 11790
},
{
"epoch": 2.77,
"grad_norm": 2.043782949447632,
"learning_rate": 2.231924882629108e-05,
"loss": 0.0488,
"step": 11800
},
{
"epoch": 2.77,
"grad_norm": 0.9040173888206482,
"learning_rate": 2.2295774647887326e-05,
"loss": 0.0644,
"step": 11810
},
{
"epoch": 2.77,
"grad_norm": 2.0497360229492188,
"learning_rate": 2.227230046948357e-05,
"loss": 0.043,
"step": 11820
},
{
"epoch": 2.78,
"grad_norm": 1.3397915363311768,
"learning_rate": 2.2248826291079813e-05,
"loss": 0.0451,
"step": 11830
},
{
"epoch": 2.78,
"grad_norm": 2.3610448837280273,
"learning_rate": 2.2225352112676057e-05,
"loss": 0.0585,
"step": 11840
},
{
"epoch": 2.78,
"grad_norm": 1.3045170307159424,
"learning_rate": 2.22018779342723e-05,
"loss": 0.0704,
"step": 11850
},
{
"epoch": 2.78,
"grad_norm": 1.5577067136764526,
"learning_rate": 2.2178403755868545e-05,
"loss": 0.0583,
"step": 11860
},
{
"epoch": 2.79,
"grad_norm": 1.5495411157608032,
"learning_rate": 2.215492957746479e-05,
"loss": 0.0431,
"step": 11870
},
{
"epoch": 2.79,
"grad_norm": 1.3377631902694702,
"learning_rate": 2.2131455399061037e-05,
"loss": 0.0348,
"step": 11880
},
{
"epoch": 2.79,
"grad_norm": 0.4540291726589203,
"learning_rate": 2.2107981220657277e-05,
"loss": 0.051,
"step": 11890
},
{
"epoch": 2.79,
"grad_norm": 1.256713628768921,
"learning_rate": 2.208450704225352e-05,
"loss": 0.058,
"step": 11900
},
{
"epoch": 2.8,
"grad_norm": 1.3505330085754395,
"learning_rate": 2.206103286384977e-05,
"loss": 0.0474,
"step": 11910
},
{
"epoch": 2.8,
"grad_norm": 2.184422254562378,
"learning_rate": 2.2037558685446012e-05,
"loss": 0.0347,
"step": 11920
},
{
"epoch": 2.8,
"grad_norm": 0.9837118983268738,
"learning_rate": 2.2014084507042253e-05,
"loss": 0.0486,
"step": 11930
},
{
"epoch": 2.8,
"grad_norm": 0.9448314905166626,
"learning_rate": 2.1990610328638497e-05,
"loss": 0.0508,
"step": 11940
},
{
"epoch": 2.81,
"grad_norm": 0.18796034157276154,
"learning_rate": 2.1967136150234744e-05,
"loss": 0.0365,
"step": 11950
},
{
"epoch": 2.81,
"grad_norm": 2.7096333503723145,
"learning_rate": 2.1943661971830985e-05,
"loss": 0.0447,
"step": 11960
},
{
"epoch": 2.81,
"grad_norm": 1.3839486837387085,
"learning_rate": 2.192018779342723e-05,
"loss": 0.0606,
"step": 11970
},
{
"epoch": 2.81,
"grad_norm": 0.7544217705726624,
"learning_rate": 2.1896713615023476e-05,
"loss": 0.0517,
"step": 11980
},
{
"epoch": 2.81,
"grad_norm": 3.9837958812713623,
"learning_rate": 2.187323943661972e-05,
"loss": 0.0491,
"step": 11990
},
{
"epoch": 2.82,
"grad_norm": 4.055257797241211,
"learning_rate": 2.184976525821596e-05,
"loss": 0.0509,
"step": 12000
},
{
"epoch": 2.82,
"eval_loss": 0.057348866015672684,
"eval_macro/f1": 0.9190607485607669,
"eval_macro/precision": 0.91942223429283,
"eval_macro/recall": 0.9200309369578695,
"eval_micro/f1": 0.9205556315861503,
"eval_micro/precision": 0.9199890583327635,
"eval_micro/recall": 0.9211229031153715,
"eval_runtime": 29.9261,
"eval_samples/accuracy": 0.912221841834988,
"eval_samples_per_second": 488.036,
"eval_steps_per_second": 15.271,
"step": 12000
},
{
"epoch": 2.82,
"grad_norm": 0.9651139974594116,
"learning_rate": 2.1826291079812208e-05,
"loss": 0.046,
"step": 12010
},
{
"epoch": 2.82,
"grad_norm": 1.4122140407562256,
"learning_rate": 2.1802816901408452e-05,
"loss": 0.0427,
"step": 12020
},
{
"epoch": 2.82,
"grad_norm": 3.4565532207489014,
"learning_rate": 2.1779342723004696e-05,
"loss": 0.0496,
"step": 12030
},
{
"epoch": 2.83,
"grad_norm": 2.368950366973877,
"learning_rate": 2.175586854460094e-05,
"loss": 0.05,
"step": 12040
},
{
"epoch": 2.83,
"grad_norm": 0.5657429695129395,
"learning_rate": 2.1732394366197184e-05,
"loss": 0.0387,
"step": 12050
},
{
"epoch": 2.83,
"grad_norm": 2.9971156120300293,
"learning_rate": 2.1708920187793428e-05,
"loss": 0.0729,
"step": 12060
},
{
"epoch": 2.83,
"grad_norm": 2.455146074295044,
"learning_rate": 2.1685446009389672e-05,
"loss": 0.0594,
"step": 12070
},
{
"epoch": 2.84,
"grad_norm": 2.0647032260894775,
"learning_rate": 2.1661971830985916e-05,
"loss": 0.0489,
"step": 12080
},
{
"epoch": 2.84,
"grad_norm": 0.9192816019058228,
"learning_rate": 2.163849765258216e-05,
"loss": 0.069,
"step": 12090
},
{
"epoch": 2.84,
"grad_norm": 1.2612496614456177,
"learning_rate": 2.1615023474178407e-05,
"loss": 0.0447,
"step": 12100
},
{
"epoch": 2.84,
"grad_norm": 1.4851101636886597,
"learning_rate": 2.1591549295774648e-05,
"loss": 0.0357,
"step": 12110
},
{
"epoch": 2.85,
"grad_norm": 3.6981115341186523,
"learning_rate": 2.156807511737089e-05,
"loss": 0.0763,
"step": 12120
},
{
"epoch": 2.85,
"grad_norm": 1.6905491352081299,
"learning_rate": 2.154460093896714e-05,
"loss": 0.0509,
"step": 12130
},
{
"epoch": 2.85,
"grad_norm": 1.0096628665924072,
"learning_rate": 2.1521126760563383e-05,
"loss": 0.051,
"step": 12140
},
{
"epoch": 2.85,
"grad_norm": 2.6806323528289795,
"learning_rate": 2.1497652582159624e-05,
"loss": 0.0418,
"step": 12150
},
{
"epoch": 2.85,
"grad_norm": 1.4341468811035156,
"learning_rate": 2.147417840375587e-05,
"loss": 0.0551,
"step": 12160
},
{
"epoch": 2.86,
"grad_norm": 0.7659865617752075,
"learning_rate": 2.1450704225352115e-05,
"loss": 0.0529,
"step": 12170
},
{
"epoch": 2.86,
"grad_norm": 1.6196166276931763,
"learning_rate": 2.142723004694836e-05,
"loss": 0.0426,
"step": 12180
},
{
"epoch": 2.86,
"grad_norm": 4.964820861816406,
"learning_rate": 2.1403755868544603e-05,
"loss": 0.0764,
"step": 12190
},
{
"epoch": 2.86,
"grad_norm": 1.1169065237045288,
"learning_rate": 2.1380281690140847e-05,
"loss": 0.0661,
"step": 12200
},
{
"epoch": 2.87,
"grad_norm": 2.428039073944092,
"learning_rate": 2.135680751173709e-05,
"loss": 0.0362,
"step": 12210
},
{
"epoch": 2.87,
"grad_norm": 2.051858901977539,
"learning_rate": 2.1333333333333335e-05,
"loss": 0.0698,
"step": 12220
},
{
"epoch": 2.87,
"grad_norm": 1.6289446353912354,
"learning_rate": 2.130985915492958e-05,
"loss": 0.0688,
"step": 12230
},
{
"epoch": 2.87,
"grad_norm": 1.0273969173431396,
"learning_rate": 2.1286384976525823e-05,
"loss": 0.0412,
"step": 12240
},
{
"epoch": 2.88,
"grad_norm": 2.8337440490722656,
"learning_rate": 2.1262910798122067e-05,
"loss": 0.0464,
"step": 12250
},
{
"epoch": 2.88,
"grad_norm": 3.0352559089660645,
"learning_rate": 2.123943661971831e-05,
"loss": 0.0361,
"step": 12260
},
{
"epoch": 2.88,
"grad_norm": 3.7453436851501465,
"learning_rate": 2.1215962441314554e-05,
"loss": 0.05,
"step": 12270
},
{
"epoch": 2.88,
"grad_norm": 3.489884853363037,
"learning_rate": 2.11924882629108e-05,
"loss": 0.075,
"step": 12280
},
{
"epoch": 2.88,
"grad_norm": 0.7656161785125732,
"learning_rate": 2.1169014084507042e-05,
"loss": 0.0621,
"step": 12290
},
{
"epoch": 2.89,
"grad_norm": 2.6409249305725098,
"learning_rate": 2.1145539906103286e-05,
"loss": 0.06,
"step": 12300
},
{
"epoch": 2.89,
"grad_norm": 1.3973041772842407,
"learning_rate": 2.112206572769953e-05,
"loss": 0.059,
"step": 12310
},
{
"epoch": 2.89,
"grad_norm": 1.4552676677703857,
"learning_rate": 2.1098591549295778e-05,
"loss": 0.0559,
"step": 12320
},
{
"epoch": 2.89,
"grad_norm": 0.8865125179290771,
"learning_rate": 2.1075117370892018e-05,
"loss": 0.0409,
"step": 12330
},
{
"epoch": 2.9,
"grad_norm": 1.3112214803695679,
"learning_rate": 2.1051643192488262e-05,
"loss": 0.0515,
"step": 12340
},
{
"epoch": 2.9,
"grad_norm": 1.4582147598266602,
"learning_rate": 2.102816901408451e-05,
"loss": 0.0499,
"step": 12350
},
{
"epoch": 2.9,
"grad_norm": 0.7377904057502747,
"learning_rate": 2.1004694835680753e-05,
"loss": 0.0419,
"step": 12360
},
{
"epoch": 2.9,
"grad_norm": 0.7822595238685608,
"learning_rate": 2.0981220657276994e-05,
"loss": 0.0492,
"step": 12370
},
{
"epoch": 2.91,
"grad_norm": 2.651425361633301,
"learning_rate": 2.095774647887324e-05,
"loss": 0.0292,
"step": 12380
},
{
"epoch": 2.91,
"grad_norm": 2.326988697052002,
"learning_rate": 2.0934272300469485e-05,
"loss": 0.0613,
"step": 12390
},
{
"epoch": 2.91,
"grad_norm": 2.0126523971557617,
"learning_rate": 2.091079812206573e-05,
"loss": 0.0554,
"step": 12400
},
{
"epoch": 2.91,
"grad_norm": 4.0273566246032715,
"learning_rate": 2.0887323943661973e-05,
"loss": 0.0483,
"step": 12410
},
{
"epoch": 2.92,
"grad_norm": 3.330744981765747,
"learning_rate": 2.0863849765258217e-05,
"loss": 0.052,
"step": 12420
},
{
"epoch": 2.92,
"grad_norm": 1.2141571044921875,
"learning_rate": 2.084037558685446e-05,
"loss": 0.0583,
"step": 12430
},
{
"epoch": 2.92,
"grad_norm": 0.20693588256835938,
"learning_rate": 2.0816901408450705e-05,
"loss": 0.0526,
"step": 12440
},
{
"epoch": 2.92,
"grad_norm": 3.2535054683685303,
"learning_rate": 2.079342723004695e-05,
"loss": 0.0344,
"step": 12450
},
{
"epoch": 2.92,
"grad_norm": 1.5202856063842773,
"learning_rate": 2.0769953051643193e-05,
"loss": 0.0478,
"step": 12460
},
{
"epoch": 2.93,
"grad_norm": 3.9802026748657227,
"learning_rate": 2.074647887323944e-05,
"loss": 0.0459,
"step": 12470
},
{
"epoch": 2.93,
"grad_norm": 1.4574544429779053,
"learning_rate": 2.072300469483568e-05,
"loss": 0.0425,
"step": 12480
},
{
"epoch": 2.93,
"grad_norm": 0.9468692541122437,
"learning_rate": 2.0699530516431925e-05,
"loss": 0.0327,
"step": 12490
},
{
"epoch": 2.93,
"grad_norm": 1.450246810913086,
"learning_rate": 2.0676056338028172e-05,
"loss": 0.0266,
"step": 12500
},
{
"epoch": 2.93,
"eval_loss": 0.06408223509788513,
"eval_macro/f1": 0.9174916425127692,
"eval_macro/precision": 0.9170797303677747,
"eval_macro/recall": 0.9189415678502407,
"eval_micro/f1": 0.9193680755026672,
"eval_micro/precision": 0.9183004303572648,
"eval_micro/recall": 0.9204382060938034,
"eval_runtime": 27.9407,
"eval_samples/accuracy": 0.9120849024306744,
"eval_samples_per_second": 522.714,
"eval_steps_per_second": 16.356,
"step": 12500
},
{
"epoch": 2.94,
"grad_norm": 0.10892952233552933,
"learning_rate": 2.0652582159624416e-05,
"loss": 0.0349,
"step": 12510
},
{
"epoch": 2.94,
"grad_norm": 2.7067954540252686,
"learning_rate": 2.0629107981220657e-05,
"loss": 0.0429,
"step": 12520
},
{
"epoch": 2.94,
"grad_norm": 0.4395139813423157,
"learning_rate": 2.06056338028169e-05,
"loss": 0.0499,
"step": 12530
},
{
"epoch": 2.94,
"grad_norm": 1.1774306297302246,
"learning_rate": 2.0582159624413148e-05,
"loss": 0.0576,
"step": 12540
},
{
"epoch": 2.95,
"grad_norm": 3.046229124069214,
"learning_rate": 2.055868544600939e-05,
"loss": 0.0463,
"step": 12550
},
{
"epoch": 2.95,
"grad_norm": 1.7256075143814087,
"learning_rate": 2.0535211267605633e-05,
"loss": 0.0594,
"step": 12560
},
{
"epoch": 2.95,
"grad_norm": 2.626065254211426,
"learning_rate": 2.051173708920188e-05,
"loss": 0.0451,
"step": 12570
},
{
"epoch": 2.95,
"grad_norm": 1.4180030822753906,
"learning_rate": 2.0488262910798124e-05,
"loss": 0.0485,
"step": 12580
},
{
"epoch": 2.96,
"grad_norm": 4.02133846282959,
"learning_rate": 2.0464788732394364e-05,
"loss": 0.049,
"step": 12590
},
{
"epoch": 2.96,
"grad_norm": 1.9987022876739502,
"learning_rate": 2.0441314553990612e-05,
"loss": 0.0397,
"step": 12600
},
{
"epoch": 2.96,
"grad_norm": 0.2887463867664337,
"learning_rate": 2.0417840375586856e-05,
"loss": 0.0369,
"step": 12610
},
{
"epoch": 2.96,
"grad_norm": 2.342617988586426,
"learning_rate": 2.03943661971831e-05,
"loss": 0.0337,
"step": 12620
},
{
"epoch": 2.96,
"grad_norm": 2.309424638748169,
"learning_rate": 2.0370892018779344e-05,
"loss": 0.0535,
"step": 12630
},
{
"epoch": 2.97,
"grad_norm": 0.7142930030822754,
"learning_rate": 2.0347417840375588e-05,
"loss": 0.0418,
"step": 12640
},
{
"epoch": 2.97,
"grad_norm": 2.1817173957824707,
"learning_rate": 2.032394366197183e-05,
"loss": 0.0357,
"step": 12650
},
{
"epoch": 2.97,
"grad_norm": 4.096397399902344,
"learning_rate": 2.0300469483568076e-05,
"loss": 0.072,
"step": 12660
},
{
"epoch": 2.97,
"grad_norm": 4.028958320617676,
"learning_rate": 2.027699530516432e-05,
"loss": 0.0518,
"step": 12670
},
{
"epoch": 2.98,
"grad_norm": 1.7069728374481201,
"learning_rate": 2.0253521126760563e-05,
"loss": 0.0317,
"step": 12680
},
{
"epoch": 2.98,
"grad_norm": 2.568117380142212,
"learning_rate": 2.023004694835681e-05,
"loss": 0.0317,
"step": 12690
},
{
"epoch": 2.98,
"grad_norm": 1.4156475067138672,
"learning_rate": 2.020657276995305e-05,
"loss": 0.0356,
"step": 12700
},
{
"epoch": 2.98,
"grad_norm": 3.9762303829193115,
"learning_rate": 2.0183098591549295e-05,
"loss": 0.0545,
"step": 12710
},
{
"epoch": 2.99,
"grad_norm": 0.18147669732570648,
"learning_rate": 2.0159624413145543e-05,
"loss": 0.0356,
"step": 12720
},
{
"epoch": 2.99,
"grad_norm": 2.423311948776245,
"learning_rate": 2.0136150234741787e-05,
"loss": 0.0732,
"step": 12730
},
{
"epoch": 2.99,
"grad_norm": 2.2960498332977295,
"learning_rate": 2.0112676056338027e-05,
"loss": 0.0579,
"step": 12740
},
{
"epoch": 2.99,
"grad_norm": 2.083404064178467,
"learning_rate": 2.0089201877934275e-05,
"loss": 0.0575,
"step": 12750
},
{
"epoch": 3.0,
"grad_norm": 0.8938828706741333,
"learning_rate": 2.006572769953052e-05,
"loss": 0.0336,
"step": 12760
},
{
"epoch": 3.0,
"grad_norm": 2.287044048309326,
"learning_rate": 2.0042253521126763e-05,
"loss": 0.0484,
"step": 12770
},
{
"epoch": 3.0,
"grad_norm": 0.1882569044828415,
"learning_rate": 2.0018779342723006e-05,
"loss": 0.0404,
"step": 12780
},
{
"epoch": 3.0,
"grad_norm": 0.9885941743850708,
"learning_rate": 1.999530516431925e-05,
"loss": 0.0436,
"step": 12790
},
{
"epoch": 3.0,
"grad_norm": 3.3820126056671143,
"learning_rate": 1.9971830985915494e-05,
"loss": 0.0363,
"step": 12800
},
{
"epoch": 3.01,
"grad_norm": 4.282595634460449,
"learning_rate": 1.994835680751174e-05,
"loss": 0.0485,
"step": 12810
},
{
"epoch": 3.01,
"grad_norm": 2.2270476818084717,
"learning_rate": 1.9924882629107982e-05,
"loss": 0.0389,
"step": 12820
},
{
"epoch": 3.01,
"grad_norm": 1.3999806642532349,
"learning_rate": 1.9901408450704226e-05,
"loss": 0.0393,
"step": 12830
},
{
"epoch": 3.01,
"grad_norm": 0.3673723042011261,
"learning_rate": 1.987793427230047e-05,
"loss": 0.0418,
"step": 12840
},
{
"epoch": 3.02,
"grad_norm": 3.573798179626465,
"learning_rate": 1.9854460093896714e-05,
"loss": 0.0384,
"step": 12850
},
{
"epoch": 3.02,
"grad_norm": 1.584816336631775,
"learning_rate": 1.9830985915492958e-05,
"loss": 0.0354,
"step": 12860
},
{
"epoch": 3.02,
"grad_norm": 2.1099088191986084,
"learning_rate": 1.9807511737089202e-05,
"loss": 0.0479,
"step": 12870
},
{
"epoch": 3.02,
"grad_norm": 2.267026424407959,
"learning_rate": 1.9784037558685446e-05,
"loss": 0.0612,
"step": 12880
},
{
"epoch": 3.03,
"grad_norm": 4.091300964355469,
"learning_rate": 1.976056338028169e-05,
"loss": 0.0298,
"step": 12890
},
{
"epoch": 3.03,
"grad_norm": 0.49155735969543457,
"learning_rate": 1.9737089201877934e-05,
"loss": 0.0341,
"step": 12900
},
{
"epoch": 3.03,
"grad_norm": 2.704821825027466,
"learning_rate": 1.971361502347418e-05,
"loss": 0.0414,
"step": 12910
},
{
"epoch": 3.03,
"grad_norm": 0.24696892499923706,
"learning_rate": 1.9690140845070422e-05,
"loss": 0.025,
"step": 12920
},
{
"epoch": 3.04,
"grad_norm": 1.6729565858840942,
"learning_rate": 1.9666666666666666e-05,
"loss": 0.0288,
"step": 12930
},
{
"epoch": 3.04,
"grad_norm": 1.8321446180343628,
"learning_rate": 1.9643192488262913e-05,
"loss": 0.0463,
"step": 12940
},
{
"epoch": 3.04,
"grad_norm": 0.1947106420993805,
"learning_rate": 1.9619718309859157e-05,
"loss": 0.0223,
"step": 12950
},
{
"epoch": 3.04,
"grad_norm": 4.319494724273682,
"learning_rate": 1.9596244131455398e-05,
"loss": 0.0379,
"step": 12960
},
{
"epoch": 3.04,
"grad_norm": 0.17218045890331268,
"learning_rate": 1.9572769953051645e-05,
"loss": 0.0214,
"step": 12970
},
{
"epoch": 3.05,
"grad_norm": 1.846875548362732,
"learning_rate": 1.954929577464789e-05,
"loss": 0.044,
"step": 12980
},
{
"epoch": 3.05,
"grad_norm": 2.1127357482910156,
"learning_rate": 1.9525821596244133e-05,
"loss": 0.0289,
"step": 12990
},
{
"epoch": 3.05,
"grad_norm": 2.4004576206207275,
"learning_rate": 1.9502347417840377e-05,
"loss": 0.0468,
"step": 13000
},
{
"epoch": 3.05,
"eval_loss": 0.06455881893634796,
"eval_macro/f1": 0.9173124672477516,
"eval_macro/precision": 0.9169905051673943,
"eval_macro/recall": 0.9183038895525318,
"eval_micro/f1": 0.9185863695399549,
"eval_micro/precision": 0.9170818262471849,
"eval_micro/recall": 0.9200958575830195,
"eval_runtime": 28.2634,
"eval_samples/accuracy": 0.9116056145155769,
"eval_samples_per_second": 516.745,
"eval_steps_per_second": 16.169,
"step": 13000
},
{
"epoch": 3.05,
"grad_norm": 2.3656160831451416,
"learning_rate": 1.947887323943662e-05,
"loss": 0.0468,
"step": 13010
},
{
"epoch": 3.06,
"grad_norm": 2.0503287315368652,
"learning_rate": 1.9455399061032865e-05,
"loss": 0.0346,
"step": 13020
},
{
"epoch": 3.06,
"grad_norm": 1.2296708822250366,
"learning_rate": 1.943192488262911e-05,
"loss": 0.043,
"step": 13030
},
{
"epoch": 3.06,
"grad_norm": 1.9218964576721191,
"learning_rate": 1.9408450704225353e-05,
"loss": 0.0504,
"step": 13040
},
{
"epoch": 3.06,
"grad_norm": 0.4588683247566223,
"learning_rate": 1.9384976525821597e-05,
"loss": 0.0339,
"step": 13050
},
{
"epoch": 3.07,
"grad_norm": 0.322399377822876,
"learning_rate": 1.9361502347417844e-05,
"loss": 0.0484,
"step": 13060
},
{
"epoch": 3.07,
"grad_norm": 1.4765088558197021,
"learning_rate": 1.9338028169014085e-05,
"loss": 0.0368,
"step": 13070
},
{
"epoch": 3.07,
"grad_norm": 2.2785489559173584,
"learning_rate": 1.931455399061033e-05,
"loss": 0.027,
"step": 13080
},
{
"epoch": 3.07,
"grad_norm": 0.05628859996795654,
"learning_rate": 1.9291079812206576e-05,
"loss": 0.0442,
"step": 13090
},
{
"epoch": 3.08,
"grad_norm": 2.550320863723755,
"learning_rate": 1.926760563380282e-05,
"loss": 0.0475,
"step": 13100
},
{
"epoch": 3.08,
"grad_norm": 1.3099887371063232,
"learning_rate": 1.924413145539906e-05,
"loss": 0.0333,
"step": 13110
},
{
"epoch": 3.08,
"grad_norm": 1.2200229167938232,
"learning_rate": 1.9220657276995304e-05,
"loss": 0.037,
"step": 13120
},
{
"epoch": 3.08,
"grad_norm": 1.5526323318481445,
"learning_rate": 1.9197183098591552e-05,
"loss": 0.0599,
"step": 13130
},
{
"epoch": 3.08,
"grad_norm": 0.1484403908252716,
"learning_rate": 1.9173708920187792e-05,
"loss": 0.035,
"step": 13140
},
{
"epoch": 3.09,
"grad_norm": 2.4299137592315674,
"learning_rate": 1.9150234741784036e-05,
"loss": 0.0423,
"step": 13150
},
{
"epoch": 3.09,
"grad_norm": 0.21701346337795258,
"learning_rate": 1.9126760563380284e-05,
"loss": 0.0282,
"step": 13160
},
{
"epoch": 3.09,
"grad_norm": 0.13961966335773468,
"learning_rate": 1.9103286384976528e-05,
"loss": 0.025,
"step": 13170
},
{
"epoch": 3.09,
"grad_norm": 0.26682791113853455,
"learning_rate": 1.9079812206572768e-05,
"loss": 0.0302,
"step": 13180
},
{
"epoch": 3.1,
"grad_norm": 0.1746547371149063,
"learning_rate": 1.9056338028169016e-05,
"loss": 0.0376,
"step": 13190
},
{
"epoch": 3.1,
"grad_norm": 0.3016582429409027,
"learning_rate": 1.903286384976526e-05,
"loss": 0.0567,
"step": 13200
},
{
"epoch": 3.1,
"grad_norm": 0.5560840964317322,
"learning_rate": 1.9009389671361503e-05,
"loss": 0.0342,
"step": 13210
},
{
"epoch": 3.1,
"grad_norm": 0.5322542786598206,
"learning_rate": 1.8985915492957747e-05,
"loss": 0.036,
"step": 13220
},
{
"epoch": 3.11,
"grad_norm": 3.8124992847442627,
"learning_rate": 1.896244131455399e-05,
"loss": 0.0345,
"step": 13230
},
{
"epoch": 3.11,
"grad_norm": 2.32499623298645,
"learning_rate": 1.8938967136150235e-05,
"loss": 0.0221,
"step": 13240
},
{
"epoch": 3.11,
"grad_norm": 0.9310886263847351,
"learning_rate": 1.891549295774648e-05,
"loss": 0.0179,
"step": 13250
},
{
"epoch": 3.11,
"grad_norm": 2.592772960662842,
"learning_rate": 1.8892018779342723e-05,
"loss": 0.0627,
"step": 13260
},
{
"epoch": 3.12,
"grad_norm": 0.32913756370544434,
"learning_rate": 1.8868544600938967e-05,
"loss": 0.064,
"step": 13270
},
{
"epoch": 3.12,
"grad_norm": 0.5899044871330261,
"learning_rate": 1.8845070422535215e-05,
"loss": 0.041,
"step": 13280
},
{
"epoch": 3.12,
"grad_norm": 4.0505170822143555,
"learning_rate": 1.8821596244131455e-05,
"loss": 0.0325,
"step": 13290
},
{
"epoch": 3.12,
"grad_norm": 2.8459744453430176,
"learning_rate": 1.87981220657277e-05,
"loss": 0.0351,
"step": 13300
},
{
"epoch": 3.12,
"grad_norm": 1.0423197746276855,
"learning_rate": 1.8774647887323946e-05,
"loss": 0.0356,
"step": 13310
},
{
"epoch": 3.13,
"grad_norm": 1.0360386371612549,
"learning_rate": 1.875117370892019e-05,
"loss": 0.0366,
"step": 13320
},
{
"epoch": 3.13,
"grad_norm": 1.490789771080017,
"learning_rate": 1.872769953051643e-05,
"loss": 0.0456,
"step": 13330
},
{
"epoch": 3.13,
"grad_norm": 2.266568422317505,
"learning_rate": 1.870422535211268e-05,
"loss": 0.0383,
"step": 13340
},
{
"epoch": 3.13,
"grad_norm": 2.135704755783081,
"learning_rate": 1.8680751173708922e-05,
"loss": 0.0539,
"step": 13350
},
{
"epoch": 3.14,
"grad_norm": 2.030789852142334,
"learning_rate": 1.8657276995305166e-05,
"loss": 0.0316,
"step": 13360
},
{
"epoch": 3.14,
"grad_norm": 2.624379873275757,
"learning_rate": 1.863380281690141e-05,
"loss": 0.0507,
"step": 13370
},
{
"epoch": 3.14,
"grad_norm": 0.05331215262413025,
"learning_rate": 1.8610328638497654e-05,
"loss": 0.058,
"step": 13380
},
{
"epoch": 3.14,
"grad_norm": 4.441781520843506,
"learning_rate": 1.8586854460093898e-05,
"loss": 0.0248,
"step": 13390
},
{
"epoch": 3.15,
"grad_norm": 3.5211195945739746,
"learning_rate": 1.8563380281690142e-05,
"loss": 0.0393,
"step": 13400
},
{
"epoch": 3.15,
"grad_norm": 1.485267996788025,
"learning_rate": 1.8539906103286386e-05,
"loss": 0.0476,
"step": 13410
},
{
"epoch": 3.15,
"grad_norm": 1.912190556526184,
"learning_rate": 1.851643192488263e-05,
"loss": 0.0334,
"step": 13420
},
{
"epoch": 3.15,
"grad_norm": 0.3232327699661255,
"learning_rate": 1.8492957746478874e-05,
"loss": 0.039,
"step": 13430
},
{
"epoch": 3.15,
"grad_norm": 3.1284468173980713,
"learning_rate": 1.8469483568075118e-05,
"loss": 0.0458,
"step": 13440
},
{
"epoch": 3.16,
"grad_norm": 0.6203511357307434,
"learning_rate": 1.8446009389671362e-05,
"loss": 0.0263,
"step": 13450
},
{
"epoch": 3.16,
"grad_norm": 0.8480092883110046,
"learning_rate": 1.8422535211267606e-05,
"loss": 0.03,
"step": 13460
},
{
"epoch": 3.16,
"grad_norm": 0.9211137890815735,
"learning_rate": 1.839906103286385e-05,
"loss": 0.0331,
"step": 13470
},
{
"epoch": 3.16,
"grad_norm": 2.1461546421051025,
"learning_rate": 1.8375586854460094e-05,
"loss": 0.0464,
"step": 13480
},
{
"epoch": 3.17,
"grad_norm": 1.1663808822631836,
"learning_rate": 1.8352112676056338e-05,
"loss": 0.051,
"step": 13490
},
{
"epoch": 3.17,
"grad_norm": 0.1547522097826004,
"learning_rate": 1.8328638497652585e-05,
"loss": 0.0327,
"step": 13500
},
{
"epoch": 3.17,
"eval_loss": 0.0615035742521286,
"eval_macro/f1": 0.9206273949130451,
"eval_macro/precision": 0.919397009262914,
"eval_macro/recall": 0.9224895351325427,
"eval_micro/f1": 0.9222305747283537,
"eval_micro/precision": 0.9204692722188118,
"eval_micro/recall": 0.9239986306059569,
"eval_runtime": 29.134,
"eval_samples/accuracy": 0.915576857240671,
"eval_samples_per_second": 501.304,
"eval_steps_per_second": 15.686,
"step": 13500
},
{
"epoch": 3.17,
"grad_norm": 0.1404021680355072,
"learning_rate": 1.8305164319248826e-05,
"loss": 0.0341,
"step": 13510
},
{
"epoch": 3.17,
"grad_norm": 1.9744393825531006,
"learning_rate": 1.828169014084507e-05,
"loss": 0.0404,
"step": 13520
},
{
"epoch": 3.18,
"grad_norm": 3.0998284816741943,
"learning_rate": 1.8258215962441317e-05,
"loss": 0.0467,
"step": 13530
},
{
"epoch": 3.18,
"grad_norm": 2.285637855529785,
"learning_rate": 1.823474178403756e-05,
"loss": 0.0409,
"step": 13540
},
{
"epoch": 3.18,
"grad_norm": 2.0926291942596436,
"learning_rate": 1.82112676056338e-05,
"loss": 0.0314,
"step": 13550
},
{
"epoch": 3.18,
"grad_norm": 1.4188313484191895,
"learning_rate": 1.818779342723005e-05,
"loss": 0.0382,
"step": 13560
},
{
"epoch": 3.19,
"grad_norm": 2.5279481410980225,
"learning_rate": 1.8164319248826293e-05,
"loss": 0.019,
"step": 13570
},
{
"epoch": 3.19,
"grad_norm": 1.5087552070617676,
"learning_rate": 1.8140845070422537e-05,
"loss": 0.0221,
"step": 13580
},
{
"epoch": 3.19,
"grad_norm": 1.3917876482009888,
"learning_rate": 1.811737089201878e-05,
"loss": 0.028,
"step": 13590
},
{
"epoch": 3.19,
"grad_norm": 1.1080641746520996,
"learning_rate": 1.8093896713615025e-05,
"loss": 0.0451,
"step": 13600
},
{
"epoch": 3.19,
"grad_norm": 0.10831775516271591,
"learning_rate": 1.807042253521127e-05,
"loss": 0.0291,
"step": 13610
},
{
"epoch": 3.2,
"grad_norm": 1.2383673191070557,
"learning_rate": 1.8046948356807513e-05,
"loss": 0.0353,
"step": 13620
},
{
"epoch": 3.2,
"grad_norm": 2.3589017391204834,
"learning_rate": 1.8023474178403756e-05,
"loss": 0.0368,
"step": 13630
},
{
"epoch": 3.2,
"grad_norm": 0.8582547307014465,
"learning_rate": 1.8e-05,
"loss": 0.043,
"step": 13640
},
{
"epoch": 3.2,
"grad_norm": 1.0989599227905273,
"learning_rate": 1.7976525821596248e-05,
"loss": 0.0699,
"step": 13650
},
{
"epoch": 3.21,
"grad_norm": 1.3760502338409424,
"learning_rate": 1.795305164319249e-05,
"loss": 0.0447,
"step": 13660
},
{
"epoch": 3.21,
"grad_norm": 1.222806692123413,
"learning_rate": 1.7929577464788732e-05,
"loss": 0.0353,
"step": 13670
},
{
"epoch": 3.21,
"grad_norm": 5.3245368003845215,
"learning_rate": 1.790610328638498e-05,
"loss": 0.039,
"step": 13680
},
{
"epoch": 3.21,
"grad_norm": 0.2474091649055481,
"learning_rate": 1.788262910798122e-05,
"loss": 0.0246,
"step": 13690
},
{
"epoch": 3.22,
"grad_norm": 0.02522147260606289,
"learning_rate": 1.7859154929577464e-05,
"loss": 0.0226,
"step": 13700
},
{
"epoch": 3.22,
"grad_norm": 0.7612372636795044,
"learning_rate": 1.7835680751173708e-05,
"loss": 0.026,
"step": 13710
},
{
"epoch": 3.22,
"grad_norm": 0.646409273147583,
"learning_rate": 1.7812206572769956e-05,
"loss": 0.0292,
"step": 13720
},
{
"epoch": 3.22,
"grad_norm": 1.2514673471450806,
"learning_rate": 1.7788732394366196e-05,
"loss": 0.0415,
"step": 13730
},
{
"epoch": 3.23,
"grad_norm": 1.112414836883545,
"learning_rate": 1.776525821596244e-05,
"loss": 0.0257,
"step": 13740
},
{
"epoch": 3.23,
"grad_norm": 1.2794502973556519,
"learning_rate": 1.7741784037558687e-05,
"loss": 0.0403,
"step": 13750
},
{
"epoch": 3.23,
"grad_norm": 2.2429542541503906,
"learning_rate": 1.771830985915493e-05,
"loss": 0.0329,
"step": 13760
},
{
"epoch": 3.23,
"grad_norm": 0.08626211434602737,
"learning_rate": 1.7694835680751172e-05,
"loss": 0.0322,
"step": 13770
},
{
"epoch": 3.23,
"grad_norm": 0.9771416783332825,
"learning_rate": 1.767136150234742e-05,
"loss": 0.0223,
"step": 13780
},
{
"epoch": 3.24,
"grad_norm": 29.060523986816406,
"learning_rate": 1.7647887323943663e-05,
"loss": 0.041,
"step": 13790
},
{
"epoch": 3.24,
"grad_norm": 0.3592706322669983,
"learning_rate": 1.7624413145539907e-05,
"loss": 0.0342,
"step": 13800
},
{
"epoch": 3.24,
"grad_norm": 1.4272717237472534,
"learning_rate": 1.760093896713615e-05,
"loss": 0.0263,
"step": 13810
},
{
"epoch": 3.24,
"grad_norm": 1.0685161352157593,
"learning_rate": 1.7577464788732395e-05,
"loss": 0.0516,
"step": 13820
},
{
"epoch": 3.25,
"grad_norm": 2.785543203353882,
"learning_rate": 1.755399061032864e-05,
"loss": 0.0533,
"step": 13830
},
{
"epoch": 3.25,
"grad_norm": 2.7001149654388428,
"learning_rate": 1.7530516431924883e-05,
"loss": 0.048,
"step": 13840
},
{
"epoch": 3.25,
"grad_norm": 1.7706950902938843,
"learning_rate": 1.7507042253521127e-05,
"loss": 0.0407,
"step": 13850
},
{
"epoch": 3.25,
"grad_norm": 2.3187646865844727,
"learning_rate": 1.748356807511737e-05,
"loss": 0.0212,
"step": 13860
},
{
"epoch": 3.26,
"grad_norm": 1.5856642723083496,
"learning_rate": 1.7460093896713618e-05,
"loss": 0.0345,
"step": 13870
},
{
"epoch": 3.26,
"grad_norm": 0.3232921361923218,
"learning_rate": 1.743661971830986e-05,
"loss": 0.0326,
"step": 13880
},
{
"epoch": 3.26,
"grad_norm": 0.44799041748046875,
"learning_rate": 1.7413145539906103e-05,
"loss": 0.0329,
"step": 13890
},
{
"epoch": 3.26,
"grad_norm": 6.359521389007568,
"learning_rate": 1.738967136150235e-05,
"loss": 0.041,
"step": 13900
},
{
"epoch": 3.27,
"grad_norm": 0.3691033720970154,
"learning_rate": 1.7366197183098594e-05,
"loss": 0.0281,
"step": 13910
},
{
"epoch": 3.27,
"grad_norm": 2.6704986095428467,
"learning_rate": 1.7342723004694835e-05,
"loss": 0.0579,
"step": 13920
},
{
"epoch": 3.27,
"grad_norm": 2.6575472354888916,
"learning_rate": 1.7319248826291082e-05,
"loss": 0.0325,
"step": 13930
},
{
"epoch": 3.27,
"grad_norm": 3.007859468460083,
"learning_rate": 1.7295774647887326e-05,
"loss": 0.0338,
"step": 13940
},
{
"epoch": 3.27,
"grad_norm": 1.2672277688980103,
"learning_rate": 1.727230046948357e-05,
"loss": 0.0242,
"step": 13950
},
{
"epoch": 3.28,
"grad_norm": 1.3012404441833496,
"learning_rate": 1.7248826291079814e-05,
"loss": 0.0577,
"step": 13960
},
{
"epoch": 3.28,
"grad_norm": 1.5620529651641846,
"learning_rate": 1.7225352112676058e-05,
"loss": 0.0474,
"step": 13970
},
{
"epoch": 3.28,
"grad_norm": 0.07768592238426208,
"learning_rate": 1.7201877934272302e-05,
"loss": 0.0414,
"step": 13980
},
{
"epoch": 3.28,
"grad_norm": 2.158637046813965,
"learning_rate": 1.7178403755868546e-05,
"loss": 0.0545,
"step": 13990
},
{
"epoch": 3.29,
"grad_norm": 0.3285406827926636,
"learning_rate": 1.715492957746479e-05,
"loss": 0.0223,
"step": 14000
},
{
"epoch": 3.29,
"eval_loss": 0.06010037660598755,
"eval_macro/f1": 0.9183699014717978,
"eval_macro/precision": 0.9199256679585665,
"eval_macro/recall": 0.9176130775774011,
"eval_micro/f1": 0.9193316441827022,
"eval_micro/precision": 0.9194575713992192,
"eval_micro/recall": 0.9192057514549812,
"eval_runtime": 28.7682,
"eval_samples/accuracy": 0.9106470386853818,
"eval_samples_per_second": 507.678,
"eval_steps_per_second": 15.886,
"step": 14000
},
{
"epoch": 3.29,
"grad_norm": 1.8858875036239624,
"learning_rate": 1.7131455399061034e-05,
"loss": 0.0265,
"step": 14010
},
{
"epoch": 3.29,
"grad_norm": 1.7713499069213867,
"learning_rate": 1.7107981220657278e-05,
"loss": 0.0581,
"step": 14020
},
{
"epoch": 3.29,
"grad_norm": 0.39804449677467346,
"learning_rate": 1.708450704225352e-05,
"loss": 0.026,
"step": 14030
},
{
"epoch": 3.3,
"grad_norm": 0.16412827372550964,
"learning_rate": 1.7061032863849766e-05,
"loss": 0.0249,
"step": 14040
},
{
"epoch": 3.3,
"grad_norm": 2.8789305686950684,
"learning_rate": 1.703755868544601e-05,
"loss": 0.0495,
"step": 14050
},
{
"epoch": 3.3,
"grad_norm": 3.439175605773926,
"learning_rate": 1.7014084507042253e-05,
"loss": 0.0444,
"step": 14060
},
{
"epoch": 3.3,
"grad_norm": 0.4579373300075531,
"learning_rate": 1.6990610328638497e-05,
"loss": 0.054,
"step": 14070
},
{
"epoch": 3.31,
"grad_norm": 2.6613855361938477,
"learning_rate": 1.696713615023474e-05,
"loss": 0.0411,
"step": 14080
},
{
"epoch": 3.31,
"grad_norm": 1.7336716651916504,
"learning_rate": 1.694366197183099e-05,
"loss": 0.0532,
"step": 14090
},
{
"epoch": 3.31,
"grad_norm": 2.034172773361206,
"learning_rate": 1.692018779342723e-05,
"loss": 0.02,
"step": 14100
},
{
"epoch": 3.31,
"grad_norm": 1.298073410987854,
"learning_rate": 1.6896713615023473e-05,
"loss": 0.0449,
"step": 14110
},
{
"epoch": 3.31,
"grad_norm": 0.09053938835859299,
"learning_rate": 1.687323943661972e-05,
"loss": 0.0132,
"step": 14120
},
{
"epoch": 3.32,
"grad_norm": 1.7225825786590576,
"learning_rate": 1.6849765258215965e-05,
"loss": 0.0345,
"step": 14130
},
{
"epoch": 3.32,
"grad_norm": 0.9660710692405701,
"learning_rate": 1.6826291079812205e-05,
"loss": 0.0331,
"step": 14140
},
{
"epoch": 3.32,
"grad_norm": 2.697035312652588,
"learning_rate": 1.6802816901408453e-05,
"loss": 0.0385,
"step": 14150
},
{
"epoch": 3.32,
"grad_norm": 3.3782408237457275,
"learning_rate": 1.6779342723004696e-05,
"loss": 0.0559,
"step": 14160
},
{
"epoch": 3.33,
"grad_norm": 2.809810161590576,
"learning_rate": 1.675586854460094e-05,
"loss": 0.0395,
"step": 14170
},
{
"epoch": 3.33,
"grad_norm": 0.35062745213508606,
"learning_rate": 1.6732394366197184e-05,
"loss": 0.0229,
"step": 14180
},
{
"epoch": 3.33,
"grad_norm": 0.4234026372432709,
"learning_rate": 1.670892018779343e-05,
"loss": 0.0468,
"step": 14190
},
{
"epoch": 3.33,
"grad_norm": 1.09328293800354,
"learning_rate": 1.6685446009389672e-05,
"loss": 0.0286,
"step": 14200
},
{
"epoch": 3.34,
"grad_norm": 0.16104546189308167,
"learning_rate": 1.6661971830985916e-05,
"loss": 0.0317,
"step": 14210
},
{
"epoch": 3.34,
"grad_norm": 1.0694267749786377,
"learning_rate": 1.663849765258216e-05,
"loss": 0.0383,
"step": 14220
},
{
"epoch": 3.34,
"grad_norm": 2.887817621231079,
"learning_rate": 1.6615023474178404e-05,
"loss": 0.0273,
"step": 14230
},
{
"epoch": 3.34,
"grad_norm": 1.8802268505096436,
"learning_rate": 1.659154929577465e-05,
"loss": 0.025,
"step": 14240
},
{
"epoch": 3.35,
"grad_norm": 0.8812164664268494,
"learning_rate": 1.6568075117370892e-05,
"loss": 0.0298,
"step": 14250
},
{
"epoch": 3.35,
"grad_norm": 0.48760342597961426,
"learning_rate": 1.6544600938967136e-05,
"loss": 0.0247,
"step": 14260
},
{
"epoch": 3.35,
"grad_norm": 1.062553882598877,
"learning_rate": 1.6521126760563383e-05,
"loss": 0.0319,
"step": 14270
},
{
"epoch": 3.35,
"grad_norm": 1.5591585636138916,
"learning_rate": 1.6497652582159624e-05,
"loss": 0.0356,
"step": 14280
},
{
"epoch": 3.35,
"grad_norm": 1.9570565223693848,
"learning_rate": 1.6474178403755868e-05,
"loss": 0.0356,
"step": 14290
},
{
"epoch": 3.36,
"grad_norm": 0.9120977520942688,
"learning_rate": 1.6450704225352112e-05,
"loss": 0.0229,
"step": 14300
},
{
"epoch": 3.36,
"grad_norm": 2.795611619949341,
"learning_rate": 1.642723004694836e-05,
"loss": 0.0505,
"step": 14310
},
{
"epoch": 3.36,
"grad_norm": 0.9692701101303101,
"learning_rate": 1.64037558685446e-05,
"loss": 0.0404,
"step": 14320
},
{
"epoch": 3.36,
"grad_norm": 0.7188575267791748,
"learning_rate": 1.6380281690140844e-05,
"loss": 0.0237,
"step": 14330
},
{
"epoch": 3.37,
"grad_norm": 0.2590070366859436,
"learning_rate": 1.635680751173709e-05,
"loss": 0.019,
"step": 14340
},
{
"epoch": 3.37,
"grad_norm": 0.07495612651109695,
"learning_rate": 1.6333333333333335e-05,
"loss": 0.0306,
"step": 14350
},
{
"epoch": 3.37,
"grad_norm": 4.267418384552002,
"learning_rate": 1.6309859154929576e-05,
"loss": 0.0445,
"step": 14360
},
{
"epoch": 3.37,
"grad_norm": 0.051151078194379807,
"learning_rate": 1.6286384976525823e-05,
"loss": 0.0346,
"step": 14370
},
{
"epoch": 3.38,
"grad_norm": 3.307426929473877,
"learning_rate": 1.6262910798122067e-05,
"loss": 0.0602,
"step": 14380
},
{
"epoch": 3.38,
"grad_norm": 0.3463471829891205,
"learning_rate": 1.623943661971831e-05,
"loss": 0.0391,
"step": 14390
},
{
"epoch": 3.38,
"grad_norm": 4.471025466918945,
"learning_rate": 1.6215962441314555e-05,
"loss": 0.0551,
"step": 14400
},
{
"epoch": 3.38,
"grad_norm": 0.8686420321464539,
"learning_rate": 1.61924882629108e-05,
"loss": 0.038,
"step": 14410
},
{
"epoch": 3.38,
"grad_norm": 0.5848603248596191,
"learning_rate": 1.6169014084507043e-05,
"loss": 0.0276,
"step": 14420
},
{
"epoch": 3.39,
"grad_norm": 1.7548906803131104,
"learning_rate": 1.6145539906103287e-05,
"loss": 0.0565,
"step": 14430
},
{
"epoch": 3.39,
"grad_norm": 0.0470554418861866,
"learning_rate": 1.612206572769953e-05,
"loss": 0.0671,
"step": 14440
},
{
"epoch": 3.39,
"grad_norm": 0.6341148614883423,
"learning_rate": 1.6098591549295775e-05,
"loss": 0.0286,
"step": 14450
},
{
"epoch": 3.39,
"grad_norm": 1.4463139772415161,
"learning_rate": 1.6075117370892022e-05,
"loss": 0.024,
"step": 14460
},
{
"epoch": 3.4,
"grad_norm": 2.489950180053711,
"learning_rate": 1.6051643192488263e-05,
"loss": 0.0512,
"step": 14470
},
{
"epoch": 3.4,
"grad_norm": 1.4375927448272705,
"learning_rate": 1.6028169014084507e-05,
"loss": 0.027,
"step": 14480
},
{
"epoch": 3.4,
"grad_norm": 2.1097233295440674,
"learning_rate": 1.6004694835680754e-05,
"loss": 0.0186,
"step": 14490
},
{
"epoch": 3.4,
"grad_norm": 3.670124053955078,
"learning_rate": 1.5981220657276998e-05,
"loss": 0.0252,
"step": 14500
},
{
"epoch": 3.4,
"eval_loss": 0.06111237779259682,
"eval_macro/f1": 0.920550554269432,
"eval_macro/precision": 0.9202120705749197,
"eval_macro/recall": 0.9219955061647757,
"eval_micro/f1": 0.9220566115137426,
"eval_micro/precision": 0.9207346214241824,
"eval_micro/recall": 0.9233824032865457,
"eval_runtime": 30.7947,
"eval_samples/accuracy": 0.9153714481342006,
"eval_samples_per_second": 474.271,
"eval_steps_per_second": 14.84,
"step": 14500
},
{
"epoch": 3.41,
"grad_norm": 0.2565593421459198,
"learning_rate": 1.595774647887324e-05,
"loss": 0.0526,
"step": 14510
},
{
"epoch": 3.41,
"grad_norm": 0.10212003439664841,
"learning_rate": 1.5934272300469486e-05,
"loss": 0.0296,
"step": 14520
},
{
"epoch": 3.41,
"grad_norm": 1.9271149635314941,
"learning_rate": 1.591079812206573e-05,
"loss": 0.0342,
"step": 14530
},
{
"epoch": 3.41,
"grad_norm": 1.296161413192749,
"learning_rate": 1.588732394366197e-05,
"loss": 0.0505,
"step": 14540
},
{
"epoch": 3.42,
"grad_norm": 0.2935797870159149,
"learning_rate": 1.5863849765258218e-05,
"loss": 0.0316,
"step": 14550
},
{
"epoch": 3.42,
"grad_norm": 0.09829127788543701,
"learning_rate": 1.584037558685446e-05,
"loss": 0.0393,
"step": 14560
},
{
"epoch": 3.42,
"grad_norm": 1.6144376993179321,
"learning_rate": 1.5816901408450706e-05,
"loss": 0.0618,
"step": 14570
},
{
"epoch": 3.42,
"grad_norm": 0.642356812953949,
"learning_rate": 1.579342723004695e-05,
"loss": 0.0553,
"step": 14580
},
{
"epoch": 3.42,
"grad_norm": 1.8286004066467285,
"learning_rate": 1.5769953051643193e-05,
"loss": 0.0349,
"step": 14590
},
{
"epoch": 3.43,
"grad_norm": 0.8908696174621582,
"learning_rate": 1.5746478873239437e-05,
"loss": 0.0424,
"step": 14600
},
{
"epoch": 3.43,
"grad_norm": 0.6125556230545044,
"learning_rate": 1.572300469483568e-05,
"loss": 0.0315,
"step": 14610
},
{
"epoch": 3.43,
"grad_norm": 0.11555056273937225,
"learning_rate": 1.5699530516431925e-05,
"loss": 0.042,
"step": 14620
},
{
"epoch": 3.43,
"grad_norm": 1.4078800678253174,
"learning_rate": 1.567605633802817e-05,
"loss": 0.0206,
"step": 14630
},
{
"epoch": 3.44,
"grad_norm": 0.036080729216337204,
"learning_rate": 1.5652582159624413e-05,
"loss": 0.0247,
"step": 14640
},
{
"epoch": 3.44,
"grad_norm": 0.10510585457086563,
"learning_rate": 1.5629107981220657e-05,
"loss": 0.0416,
"step": 14650
},
{
"epoch": 3.44,
"grad_norm": 1.879212498664856,
"learning_rate": 1.56056338028169e-05,
"loss": 0.0364,
"step": 14660
},
{
"epoch": 3.44,
"grad_norm": 10.751227378845215,
"learning_rate": 1.5582159624413145e-05,
"loss": 0.0471,
"step": 14670
},
{
"epoch": 3.45,
"grad_norm": 0.3223772346973419,
"learning_rate": 1.5558685446009392e-05,
"loss": 0.0438,
"step": 14680
},
{
"epoch": 3.45,
"grad_norm": 2.0051674842834473,
"learning_rate": 1.5535211267605633e-05,
"loss": 0.0373,
"step": 14690
},
{
"epoch": 3.45,
"grad_norm": 2.243884325027466,
"learning_rate": 1.5511737089201877e-05,
"loss": 0.0402,
"step": 14700
},
{
"epoch": 3.45,
"grad_norm": 1.409742832183838,
"learning_rate": 1.5488262910798124e-05,
"loss": 0.0452,
"step": 14710
},
{
"epoch": 3.46,
"grad_norm": 1.0107128620147705,
"learning_rate": 1.546478873239437e-05,
"loss": 0.0268,
"step": 14720
},
{
"epoch": 3.46,
"grad_norm": 0.6701316237449646,
"learning_rate": 1.544131455399061e-05,
"loss": 0.0398,
"step": 14730
},
{
"epoch": 3.46,
"grad_norm": 1.3181270360946655,
"learning_rate": 1.5417840375586856e-05,
"loss": 0.0526,
"step": 14740
},
{
"epoch": 3.46,
"grad_norm": 2.4367566108703613,
"learning_rate": 1.53943661971831e-05,
"loss": 0.0254,
"step": 14750
},
{
"epoch": 3.46,
"grad_norm": 0.1977778524160385,
"learning_rate": 1.5370892018779344e-05,
"loss": 0.0533,
"step": 14760
},
{
"epoch": 3.47,
"grad_norm": 0.5955461263656616,
"learning_rate": 1.5347417840375588e-05,
"loss": 0.0332,
"step": 14770
},
{
"epoch": 3.47,
"grad_norm": 0.13952666521072388,
"learning_rate": 1.5323943661971832e-05,
"loss": 0.0521,
"step": 14780
},
{
"epoch": 3.47,
"grad_norm": 1.3798327445983887,
"learning_rate": 1.5300469483568076e-05,
"loss": 0.066,
"step": 14790
},
{
"epoch": 3.47,
"grad_norm": 0.49183735251426697,
"learning_rate": 1.527699530516432e-05,
"loss": 0.0346,
"step": 14800
},
{
"epoch": 3.48,
"grad_norm": 1.1325535774230957,
"learning_rate": 1.5253521126760564e-05,
"loss": 0.0517,
"step": 14810
},
{
"epoch": 3.48,
"grad_norm": 1.626822829246521,
"learning_rate": 1.5230046948356808e-05,
"loss": 0.062,
"step": 14820
},
{
"epoch": 3.48,
"grad_norm": 1.8618102073669434,
"learning_rate": 1.5206572769953054e-05,
"loss": 0.0363,
"step": 14830
},
{
"epoch": 3.48,
"grad_norm": 1.8805667161941528,
"learning_rate": 1.5183098591549298e-05,
"loss": 0.0251,
"step": 14840
},
{
"epoch": 3.49,
"grad_norm": 1.918205976486206,
"learning_rate": 1.515962441314554e-05,
"loss": 0.0425,
"step": 14850
},
{
"epoch": 3.49,
"grad_norm": 2.2678277492523193,
"learning_rate": 1.5136150234741785e-05,
"loss": 0.0356,
"step": 14860
},
{
"epoch": 3.49,
"grad_norm": 2.2082948684692383,
"learning_rate": 1.511267605633803e-05,
"loss": 0.0367,
"step": 14870
},
{
"epoch": 3.49,
"grad_norm": 0.748775839805603,
"learning_rate": 1.5089201877934273e-05,
"loss": 0.0278,
"step": 14880
},
{
"epoch": 3.5,
"grad_norm": 2.2440285682678223,
"learning_rate": 1.5065727699530516e-05,
"loss": 0.0636,
"step": 14890
},
{
"epoch": 3.5,
"grad_norm": 1.5458263158798218,
"learning_rate": 1.5042253521126761e-05,
"loss": 0.0364,
"step": 14900
},
{
"epoch": 3.5,
"grad_norm": 0.6863703727722168,
"learning_rate": 1.5018779342723005e-05,
"loss": 0.0127,
"step": 14910
},
{
"epoch": 3.5,
"grad_norm": 0.5138887166976929,
"learning_rate": 1.4995305164319247e-05,
"loss": 0.0265,
"step": 14920
},
{
"epoch": 3.5,
"grad_norm": 0.6225917339324951,
"learning_rate": 1.4971830985915495e-05,
"loss": 0.047,
"step": 14930
},
{
"epoch": 3.51,
"grad_norm": 1.9543975591659546,
"learning_rate": 1.4948356807511737e-05,
"loss": 0.0324,
"step": 14940
},
{
"epoch": 3.51,
"grad_norm": 0.7699962854385376,
"learning_rate": 1.4924882629107981e-05,
"loss": 0.0357,
"step": 14950
},
{
"epoch": 3.51,
"grad_norm": 0.03310937061905861,
"learning_rate": 1.4901408450704227e-05,
"loss": 0.0138,
"step": 14960
},
{
"epoch": 3.51,
"grad_norm": 3.659010171890259,
"learning_rate": 1.487793427230047e-05,
"loss": 0.0365,
"step": 14970
},
{
"epoch": 3.52,
"grad_norm": 0.07815925031900406,
"learning_rate": 1.4854460093896713e-05,
"loss": 0.0212,
"step": 14980
},
{
"epoch": 3.52,
"grad_norm": 0.031491972506046295,
"learning_rate": 1.4830985915492959e-05,
"loss": 0.0272,
"step": 14990
},
{
"epoch": 3.52,
"grad_norm": 4.178372859954834,
"learning_rate": 1.4807511737089203e-05,
"loss": 0.0216,
"step": 15000
},
{
"epoch": 3.52,
"eval_loss": 0.058485910296440125,
"eval_macro/f1": 0.924584638738032,
"eval_macro/precision": 0.9249554071277554,
"eval_macro/recall": 0.9245806833373952,
"eval_micro/f1": 0.9260513978715396,
"eval_micro/precision": 0.925639622383363,
"eval_micro/recall": 0.9264635398836015,
"eval_runtime": 30.7706,
"eval_samples/accuracy": 0.919274221157138,
"eval_samples_per_second": 474.641,
"eval_steps_per_second": 14.852,
"step": 15000
},
{
"epoch": 3.52,
"grad_norm": 0.2273653894662857,
"learning_rate": 1.4784037558685446e-05,
"loss": 0.0458,
"step": 15010
},
{
"epoch": 3.53,
"grad_norm": 1.6062771081924438,
"learning_rate": 1.4760563380281692e-05,
"loss": 0.0333,
"step": 15020
},
{
"epoch": 3.53,
"grad_norm": 6.850545406341553,
"learning_rate": 1.4737089201877934e-05,
"loss": 0.0098,
"step": 15030
},
{
"epoch": 3.53,
"grad_norm": 0.8278430104255676,
"learning_rate": 1.4713615023474178e-05,
"loss": 0.0311,
"step": 15040
},
{
"epoch": 3.53,
"grad_norm": 0.17055070400238037,
"learning_rate": 1.4690140845070424e-05,
"loss": 0.0377,
"step": 15050
},
{
"epoch": 3.54,
"grad_norm": 2.156358242034912,
"learning_rate": 1.4666666666666668e-05,
"loss": 0.0533,
"step": 15060
},
{
"epoch": 3.54,
"grad_norm": 2.780056953430176,
"learning_rate": 1.464319248826291e-05,
"loss": 0.0318,
"step": 15070
},
{
"epoch": 3.54,
"grad_norm": 1.9434489011764526,
"learning_rate": 1.4619718309859156e-05,
"loss": 0.0388,
"step": 15080
},
{
"epoch": 3.54,
"grad_norm": 0.6632909178733826,
"learning_rate": 1.45962441314554e-05,
"loss": 0.0197,
"step": 15090
},
{
"epoch": 3.54,
"grad_norm": 4.248073577880859,
"learning_rate": 1.4572769953051644e-05,
"loss": 0.0538,
"step": 15100
},
{
"epoch": 3.55,
"grad_norm": 2.3261492252349854,
"learning_rate": 1.454929577464789e-05,
"loss": 0.0356,
"step": 15110
},
{
"epoch": 3.55,
"grad_norm": 2.4972357749938965,
"learning_rate": 1.4525821596244132e-05,
"loss": 0.0505,
"step": 15120
},
{
"epoch": 3.55,
"grad_norm": 2.1739044189453125,
"learning_rate": 1.4502347417840376e-05,
"loss": 0.0451,
"step": 15130
},
{
"epoch": 3.55,
"grad_norm": 0.1503843516111374,
"learning_rate": 1.4478873239436621e-05,
"loss": 0.0455,
"step": 15140
},
{
"epoch": 3.56,
"grad_norm": 0.8404219150543213,
"learning_rate": 1.4455399061032865e-05,
"loss": 0.0345,
"step": 15150
},
{
"epoch": 3.56,
"grad_norm": 0.3419097065925598,
"learning_rate": 1.4431924882629108e-05,
"loss": 0.0443,
"step": 15160
},
{
"epoch": 3.56,
"grad_norm": 0.6726291179656982,
"learning_rate": 1.4408450704225355e-05,
"loss": 0.0221,
"step": 15170
},
{
"epoch": 3.56,
"grad_norm": 0.051636260002851486,
"learning_rate": 1.4384976525821597e-05,
"loss": 0.0381,
"step": 15180
},
{
"epoch": 3.57,
"grad_norm": 1.508140206336975,
"learning_rate": 1.4361502347417841e-05,
"loss": 0.0427,
"step": 15190
},
{
"epoch": 3.57,
"grad_norm": 2.3500542640686035,
"learning_rate": 1.4338028169014083e-05,
"loss": 0.0426,
"step": 15200
},
{
"epoch": 3.57,
"grad_norm": 0.3720274567604065,
"learning_rate": 1.4314553990610329e-05,
"loss": 0.0211,
"step": 15210
},
{
"epoch": 3.57,
"grad_norm": 0.3215431272983551,
"learning_rate": 1.4291079812206573e-05,
"loss": 0.025,
"step": 15220
},
{
"epoch": 3.58,
"grad_norm": 4.001272678375244,
"learning_rate": 1.4267605633802817e-05,
"loss": 0.0434,
"step": 15230
},
{
"epoch": 3.58,
"grad_norm": 0.29120638966560364,
"learning_rate": 1.4244131455399063e-05,
"loss": 0.0309,
"step": 15240
},
{
"epoch": 3.58,
"grad_norm": 0.6027160882949829,
"learning_rate": 1.4220657276995305e-05,
"loss": 0.0304,
"step": 15250
},
{
"epoch": 3.58,
"grad_norm": 0.1958913952112198,
"learning_rate": 1.4197183098591549e-05,
"loss": 0.0381,
"step": 15260
},
{
"epoch": 3.58,
"grad_norm": 0.306017130613327,
"learning_rate": 1.4173708920187794e-05,
"loss": 0.0402,
"step": 15270
},
{
"epoch": 3.59,
"grad_norm": 3.3421289920806885,
"learning_rate": 1.4150234741784038e-05,
"loss": 0.0313,
"step": 15280
},
{
"epoch": 3.59,
"grad_norm": 0.6600431799888611,
"learning_rate": 1.412676056338028e-05,
"loss": 0.0408,
"step": 15290
},
{
"epoch": 3.59,
"grad_norm": 1.1957746744155884,
"learning_rate": 1.4103286384976528e-05,
"loss": 0.0367,
"step": 15300
},
{
"epoch": 3.59,
"grad_norm": 2.5639190673828125,
"learning_rate": 1.407981220657277e-05,
"loss": 0.0337,
"step": 15310
},
{
"epoch": 3.6,
"grad_norm": 1.6588279008865356,
"learning_rate": 1.4056338028169014e-05,
"loss": 0.0521,
"step": 15320
},
{
"epoch": 3.6,
"grad_norm": 0.6526573896408081,
"learning_rate": 1.403286384976526e-05,
"loss": 0.0238,
"step": 15330
},
{
"epoch": 3.6,
"grad_norm": 1.4112497568130493,
"learning_rate": 1.4009389671361502e-05,
"loss": 0.0274,
"step": 15340
},
{
"epoch": 3.6,
"grad_norm": 0.15742766857147217,
"learning_rate": 1.3985915492957746e-05,
"loss": 0.0424,
"step": 15350
},
{
"epoch": 3.61,
"grad_norm": 2.976287841796875,
"learning_rate": 1.3962441314553992e-05,
"loss": 0.0314,
"step": 15360
},
{
"epoch": 3.61,
"grad_norm": 1.613516926765442,
"learning_rate": 1.3938967136150236e-05,
"loss": 0.0352,
"step": 15370
},
{
"epoch": 3.61,
"grad_norm": 2.304145336151123,
"learning_rate": 1.3915492957746478e-05,
"loss": 0.0329,
"step": 15380
},
{
"epoch": 3.61,
"grad_norm": 2.545947551727295,
"learning_rate": 1.3892018779342725e-05,
"loss": 0.0279,
"step": 15390
},
{
"epoch": 3.62,
"grad_norm": 0.5519721508026123,
"learning_rate": 1.3868544600938968e-05,
"loss": 0.0393,
"step": 15400
},
{
"epoch": 3.62,
"grad_norm": 0.135732501745224,
"learning_rate": 1.3845070422535212e-05,
"loss": 0.0287,
"step": 15410
},
{
"epoch": 3.62,
"grad_norm": 0.8303437829017639,
"learning_rate": 1.3821596244131457e-05,
"loss": 0.0371,
"step": 15420
},
{
"epoch": 3.62,
"grad_norm": 2.1903645992279053,
"learning_rate": 1.3798122065727701e-05,
"loss": 0.0334,
"step": 15430
},
{
"epoch": 3.62,
"grad_norm": 2.582015037536621,
"learning_rate": 1.3774647887323943e-05,
"loss": 0.0433,
"step": 15440
},
{
"epoch": 3.63,
"grad_norm": 0.15312707424163818,
"learning_rate": 1.3751173708920189e-05,
"loss": 0.0527,
"step": 15450
},
{
"epoch": 3.63,
"grad_norm": 2.6689505577087402,
"learning_rate": 1.3727699530516433e-05,
"loss": 0.0503,
"step": 15460
},
{
"epoch": 3.63,
"grad_norm": 2.9017531871795654,
"learning_rate": 1.3704225352112677e-05,
"loss": 0.0719,
"step": 15470
},
{
"epoch": 3.63,
"grad_norm": 0.6398376822471619,
"learning_rate": 1.3680751173708923e-05,
"loss": 0.0221,
"step": 15480
},
{
"epoch": 3.64,
"grad_norm": 1.0251731872558594,
"learning_rate": 1.3657276995305165e-05,
"loss": 0.0326,
"step": 15490
},
{
"epoch": 3.64,
"grad_norm": 1.954060435295105,
"learning_rate": 1.3633802816901409e-05,
"loss": 0.0346,
"step": 15500
},
{
"epoch": 3.64,
"eval_loss": 0.05969972163438797,
"eval_macro/f1": 0.9216894852226254,
"eval_macro/precision": 0.9223725171238232,
"eval_macro/recall": 0.9214463830733233,
"eval_micro/f1": 0.9234086242299795,
"eval_micro/precision": 0.9230927129661307,
"eval_micro/recall": 0.9237247517973297,
"eval_runtime": 27.8691,
"eval_samples/accuracy": 0.9168777815816501,
"eval_samples_per_second": 524.057,
"eval_steps_per_second": 16.398,
"step": 15500
},
{
"epoch": 3.64,
"grad_norm": 2.410942792892456,
"learning_rate": 1.3610328638497651e-05,
"loss": 0.0299,
"step": 15510
},
{
"epoch": 3.64,
"grad_norm": 0.29106903076171875,
"learning_rate": 1.3586854460093899e-05,
"loss": 0.0311,
"step": 15520
},
{
"epoch": 3.65,
"grad_norm": 1.473370909690857,
"learning_rate": 1.356338028169014e-05,
"loss": 0.0221,
"step": 15530
},
{
"epoch": 3.65,
"grad_norm": 3.0741446018218994,
"learning_rate": 1.3539906103286385e-05,
"loss": 0.0407,
"step": 15540
},
{
"epoch": 3.65,
"grad_norm": 1.4671398401260376,
"learning_rate": 1.351643192488263e-05,
"loss": 0.0318,
"step": 15550
},
{
"epoch": 3.65,
"grad_norm": 1.6664915084838867,
"learning_rate": 1.3492957746478874e-05,
"loss": 0.0484,
"step": 15560
},
{
"epoch": 3.65,
"grad_norm": 3.2199301719665527,
"learning_rate": 1.3469483568075117e-05,
"loss": 0.0558,
"step": 15570
},
{
"epoch": 3.66,
"grad_norm": 1.3700677156448364,
"learning_rate": 1.3446009389671362e-05,
"loss": 0.0346,
"step": 15580
},
{
"epoch": 3.66,
"grad_norm": 0.7739179730415344,
"learning_rate": 1.3422535211267606e-05,
"loss": 0.0504,
"step": 15590
},
{
"epoch": 3.66,
"grad_norm": 0.4577118754386902,
"learning_rate": 1.339906103286385e-05,
"loss": 0.0258,
"step": 15600
},
{
"epoch": 3.66,
"grad_norm": 0.44119253754615784,
"learning_rate": 1.3375586854460096e-05,
"loss": 0.0405,
"step": 15610
},
{
"epoch": 3.67,
"grad_norm": 4.451091766357422,
"learning_rate": 1.3352112676056338e-05,
"loss": 0.0683,
"step": 15620
},
{
"epoch": 3.67,
"grad_norm": 3.0964515209198,
"learning_rate": 1.3328638497652582e-05,
"loss": 0.0373,
"step": 15630
},
{
"epoch": 3.67,
"grad_norm": 2.7771458625793457,
"learning_rate": 1.3305164319248828e-05,
"loss": 0.0745,
"step": 15640
},
{
"epoch": 3.67,
"grad_norm": 2.804838180541992,
"learning_rate": 1.3281690140845072e-05,
"loss": 0.0397,
"step": 15650
},
{
"epoch": 3.68,
"grad_norm": 1.006893277168274,
"learning_rate": 1.3258215962441314e-05,
"loss": 0.0428,
"step": 15660
},
{
"epoch": 3.68,
"grad_norm": 0.8771440982818604,
"learning_rate": 1.323474178403756e-05,
"loss": 0.0282,
"step": 15670
},
{
"epoch": 3.68,
"grad_norm": 0.4935603737831116,
"learning_rate": 1.3211267605633804e-05,
"loss": 0.0293,
"step": 15680
},
{
"epoch": 3.68,
"grad_norm": 0.03460874408483505,
"learning_rate": 1.3187793427230048e-05,
"loss": 0.0509,
"step": 15690
},
{
"epoch": 3.69,
"grad_norm": 0.2056482583284378,
"learning_rate": 1.3164319248826293e-05,
"loss": 0.0358,
"step": 15700
},
{
"epoch": 3.69,
"grad_norm": 3.385071039199829,
"learning_rate": 1.3140845070422535e-05,
"loss": 0.043,
"step": 15710
},
{
"epoch": 3.69,
"grad_norm": 1.2948657274246216,
"learning_rate": 1.311737089201878e-05,
"loss": 0.0186,
"step": 15720
},
{
"epoch": 3.69,
"grad_norm": 0.2789139151573181,
"learning_rate": 1.3093896713615025e-05,
"loss": 0.0441,
"step": 15730
},
{
"epoch": 3.69,
"grad_norm": 2.7602319717407227,
"learning_rate": 1.3070422535211269e-05,
"loss": 0.0468,
"step": 15740
},
{
"epoch": 3.7,
"grad_norm": 0.09470643103122711,
"learning_rate": 1.3046948356807511e-05,
"loss": 0.0173,
"step": 15750
},
{
"epoch": 3.7,
"grad_norm": 2.7615208625793457,
"learning_rate": 1.3023474178403759e-05,
"loss": 0.0472,
"step": 15760
},
{
"epoch": 3.7,
"grad_norm": 1.1640163660049438,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.0328,
"step": 15770
},
{
"epoch": 3.7,
"grad_norm": 0.09145846962928772,
"learning_rate": 1.2976525821596245e-05,
"loss": 0.0282,
"step": 15780
},
{
"epoch": 3.71,
"grad_norm": 0.874416172504425,
"learning_rate": 1.2953051643192487e-05,
"loss": 0.0524,
"step": 15790
},
{
"epoch": 3.71,
"grad_norm": 1.1191848516464233,
"learning_rate": 1.2929577464788733e-05,
"loss": 0.0483,
"step": 15800
},
{
"epoch": 3.71,
"grad_norm": 0.3583777844905853,
"learning_rate": 1.2906103286384977e-05,
"loss": 0.0227,
"step": 15810
},
{
"epoch": 3.71,
"grad_norm": 4.477143287658691,
"learning_rate": 1.288262910798122e-05,
"loss": 0.0354,
"step": 15820
},
{
"epoch": 3.72,
"grad_norm": 1.5109944343566895,
"learning_rate": 1.2859154929577466e-05,
"loss": 0.0257,
"step": 15830
},
{
"epoch": 3.72,
"grad_norm": 0.3320276141166687,
"learning_rate": 1.2835680751173709e-05,
"loss": 0.0173,
"step": 15840
},
{
"epoch": 3.72,
"grad_norm": 1.1068660020828247,
"learning_rate": 1.2812206572769953e-05,
"loss": 0.0243,
"step": 15850
},
{
"epoch": 3.72,
"grad_norm": 4.6560773849487305,
"learning_rate": 1.2788732394366198e-05,
"loss": 0.0407,
"step": 15860
},
{
"epoch": 3.73,
"grad_norm": 1.1727485656738281,
"learning_rate": 1.2765258215962442e-05,
"loss": 0.0409,
"step": 15870
},
{
"epoch": 3.73,
"grad_norm": 1.807950735092163,
"learning_rate": 1.2741784037558684e-05,
"loss": 0.0177,
"step": 15880
},
{
"epoch": 3.73,
"grad_norm": 0.8868434429168701,
"learning_rate": 1.2718309859154932e-05,
"loss": 0.0377,
"step": 15890
},
{
"epoch": 3.73,
"grad_norm": 0.8664716482162476,
"learning_rate": 1.2694835680751174e-05,
"loss": 0.0251,
"step": 15900
},
{
"epoch": 3.73,
"grad_norm": 0.5700992345809937,
"learning_rate": 1.2671361502347418e-05,
"loss": 0.045,
"step": 15910
},
{
"epoch": 3.74,
"grad_norm": 2.4590039253234863,
"learning_rate": 1.2647887323943664e-05,
"loss": 0.0379,
"step": 15920
},
{
"epoch": 3.74,
"grad_norm": 0.2064366489648819,
"learning_rate": 1.2624413145539906e-05,
"loss": 0.0275,
"step": 15930
},
{
"epoch": 3.74,
"grad_norm": 0.28647980093955994,
"learning_rate": 1.260093896713615e-05,
"loss": 0.0333,
"step": 15940
},
{
"epoch": 3.74,
"grad_norm": 0.4945986568927765,
"learning_rate": 1.2577464788732396e-05,
"loss": 0.0267,
"step": 15950
},
{
"epoch": 3.75,
"grad_norm": 3.3648037910461426,
"learning_rate": 1.255399061032864e-05,
"loss": 0.0359,
"step": 15960
},
{
"epoch": 3.75,
"grad_norm": 2.6217010021209717,
"learning_rate": 1.2530516431924882e-05,
"loss": 0.035,
"step": 15970
},
{
"epoch": 3.75,
"grad_norm": 1.323452115058899,
"learning_rate": 1.2507042253521129e-05,
"loss": 0.0249,
"step": 15980
},
{
"epoch": 3.75,
"grad_norm": 0.818280816078186,
"learning_rate": 1.2483568075117371e-05,
"loss": 0.046,
"step": 15990
},
{
"epoch": 3.76,
"grad_norm": 1.3233391046524048,
"learning_rate": 1.2460093896713615e-05,
"loss": 0.027,
"step": 16000
},
{
"epoch": 3.76,
"eval_loss": 0.060248348861932755,
"eval_macro/f1": 0.9231307675203939,
"eval_macro/precision": 0.9238576142304777,
"eval_macro/recall": 0.9234269768144926,
"eval_micro/f1": 0.9243082395594623,
"eval_micro/precision": 0.9234554401312193,
"eval_micro/recall": 0.9251626155426224,
"eval_runtime": 29.6965,
"eval_samples/accuracy": 0.9184525847312565,
"eval_samples_per_second": 491.809,
"eval_steps_per_second": 15.389,
"step": 16000
},
{
"epoch": 3.76,
"grad_norm": 0.4075230360031128,
"learning_rate": 1.243661971830986e-05,
"loss": 0.0353,
"step": 16010
},
{
"epoch": 3.76,
"grad_norm": 1.8514373302459717,
"learning_rate": 1.2413145539906105e-05,
"loss": 0.0317,
"step": 16020
},
{
"epoch": 3.76,
"grad_norm": 0.3068770170211792,
"learning_rate": 1.2389671361502347e-05,
"loss": 0.0262,
"step": 16030
},
{
"epoch": 3.77,
"grad_norm": 1.1337624788284302,
"learning_rate": 1.2366197183098593e-05,
"loss": 0.0446,
"step": 16040
},
{
"epoch": 3.77,
"grad_norm": 0.8808763027191162,
"learning_rate": 1.2342723004694837e-05,
"loss": 0.0255,
"step": 16050
},
{
"epoch": 3.77,
"grad_norm": 2.083991527557373,
"learning_rate": 1.2319248826291079e-05,
"loss": 0.0243,
"step": 16060
},
{
"epoch": 3.77,
"grad_norm": 2.384486198425293,
"learning_rate": 1.2295774647887325e-05,
"loss": 0.0348,
"step": 16070
},
{
"epoch": 3.77,
"grad_norm": 1.3079556226730347,
"learning_rate": 1.2272300469483569e-05,
"loss": 0.0574,
"step": 16080
},
{
"epoch": 3.78,
"grad_norm": 0.15147972106933594,
"learning_rate": 1.2248826291079813e-05,
"loss": 0.0221,
"step": 16090
},
{
"epoch": 3.78,
"grad_norm": 2.0932321548461914,
"learning_rate": 1.2225352112676057e-05,
"loss": 0.0345,
"step": 16100
},
{
"epoch": 3.78,
"grad_norm": 1.885918140411377,
"learning_rate": 1.2201877934272302e-05,
"loss": 0.0303,
"step": 16110
},
{
"epoch": 3.78,
"grad_norm": 1.7613309621810913,
"learning_rate": 1.2178403755868545e-05,
"loss": 0.0449,
"step": 16120
},
{
"epoch": 3.79,
"grad_norm": 0.49504420161247253,
"learning_rate": 1.215492957746479e-05,
"loss": 0.0318,
"step": 16130
},
{
"epoch": 3.79,
"grad_norm": 1.404921054840088,
"learning_rate": 1.2131455399061034e-05,
"loss": 0.0368,
"step": 16140
},
{
"epoch": 3.79,
"grad_norm": 1.087222933769226,
"learning_rate": 1.2107981220657278e-05,
"loss": 0.0251,
"step": 16150
},
{
"epoch": 3.79,
"grad_norm": 0.017098503187298775,
"learning_rate": 1.2084507042253522e-05,
"loss": 0.0265,
"step": 16160
},
{
"epoch": 3.8,
"grad_norm": 2.125025987625122,
"learning_rate": 1.2061032863849766e-05,
"loss": 0.0361,
"step": 16170
},
{
"epoch": 3.8,
"grad_norm": 1.9100775718688965,
"learning_rate": 1.203755868544601e-05,
"loss": 0.0572,
"step": 16180
},
{
"epoch": 3.8,
"grad_norm": 0.4402145445346832,
"learning_rate": 1.2014084507042254e-05,
"loss": 0.0463,
"step": 16190
},
{
"epoch": 3.8,
"grad_norm": 0.9230586886405945,
"learning_rate": 1.1990610328638498e-05,
"loss": 0.0328,
"step": 16200
},
{
"epoch": 3.81,
"grad_norm": 0.3850882053375244,
"learning_rate": 1.1967136150234742e-05,
"loss": 0.0488,
"step": 16210
},
{
"epoch": 3.81,
"grad_norm": 0.6668741703033447,
"learning_rate": 1.1943661971830987e-05,
"loss": 0.0282,
"step": 16220
},
{
"epoch": 3.81,
"grad_norm": 0.18775498867034912,
"learning_rate": 1.192018779342723e-05,
"loss": 0.0257,
"step": 16230
},
{
"epoch": 3.81,
"grad_norm": 4.097379684448242,
"learning_rate": 1.1896713615023475e-05,
"loss": 0.0369,
"step": 16240
},
{
"epoch": 3.81,
"grad_norm": 3.4568750858306885,
"learning_rate": 1.187323943661972e-05,
"loss": 0.0487,
"step": 16250
},
{
"epoch": 3.82,
"grad_norm": 1.5609110593795776,
"learning_rate": 1.1849765258215963e-05,
"loss": 0.0358,
"step": 16260
},
{
"epoch": 3.82,
"grad_norm": 1.8959105014801025,
"learning_rate": 1.1826291079812207e-05,
"loss": 0.0333,
"step": 16270
},
{
"epoch": 3.82,
"grad_norm": 1.7780373096466064,
"learning_rate": 1.1802816901408451e-05,
"loss": 0.0389,
"step": 16280
},
{
"epoch": 3.82,
"grad_norm": 0.4024539291858673,
"learning_rate": 1.1779342723004695e-05,
"loss": 0.0317,
"step": 16290
},
{
"epoch": 3.83,
"grad_norm": 2.487910747528076,
"learning_rate": 1.175586854460094e-05,
"loss": 0.0473,
"step": 16300
},
{
"epoch": 3.83,
"grad_norm": 0.5158886313438416,
"learning_rate": 1.1732394366197183e-05,
"loss": 0.0493,
"step": 16310
},
{
"epoch": 3.83,
"grad_norm": 1.9161232709884644,
"learning_rate": 1.1708920187793427e-05,
"loss": 0.0421,
"step": 16320
},
{
"epoch": 3.83,
"grad_norm": 1.4477870464324951,
"learning_rate": 1.1685446009389673e-05,
"loss": 0.0442,
"step": 16330
},
{
"epoch": 3.84,
"grad_norm": 3.5894601345062256,
"learning_rate": 1.1661971830985915e-05,
"loss": 0.0483,
"step": 16340
},
{
"epoch": 3.84,
"grad_norm": 0.6837550401687622,
"learning_rate": 1.163849765258216e-05,
"loss": 0.0396,
"step": 16350
},
{
"epoch": 3.84,
"grad_norm": 0.9177005290985107,
"learning_rate": 1.1615023474178405e-05,
"loss": 0.0159,
"step": 16360
},
{
"epoch": 3.84,
"grad_norm": 1.36934494972229,
"learning_rate": 1.1591549295774649e-05,
"loss": 0.02,
"step": 16370
},
{
"epoch": 3.85,
"grad_norm": 0.41758379340171814,
"learning_rate": 1.1568075117370893e-05,
"loss": 0.0305,
"step": 16380
},
{
"epoch": 3.85,
"grad_norm": 0.9088613390922546,
"learning_rate": 1.1544600938967136e-05,
"loss": 0.0282,
"step": 16390
},
{
"epoch": 3.85,
"grad_norm": 0.19039836525917053,
"learning_rate": 1.152112676056338e-05,
"loss": 0.0276,
"step": 16400
},
{
"epoch": 3.85,
"grad_norm": 3.383873462677002,
"learning_rate": 1.1497652582159624e-05,
"loss": 0.0529,
"step": 16410
},
{
"epoch": 3.85,
"grad_norm": 0.04074535891413689,
"learning_rate": 1.147417840375587e-05,
"loss": 0.0341,
"step": 16420
},
{
"epoch": 3.86,
"grad_norm": 2.3034849166870117,
"learning_rate": 1.1450704225352112e-05,
"loss": 0.068,
"step": 16430
},
{
"epoch": 3.86,
"grad_norm": 1.1354528665542603,
"learning_rate": 1.1427230046948358e-05,
"loss": 0.0133,
"step": 16440
},
{
"epoch": 3.86,
"grad_norm": 1.0032627582550049,
"learning_rate": 1.14037558685446e-05,
"loss": 0.0327,
"step": 16450
},
{
"epoch": 3.86,
"grad_norm": 1.8904528617858887,
"learning_rate": 1.1380281690140846e-05,
"loss": 0.039,
"step": 16460
},
{
"epoch": 3.87,
"grad_norm": 2.2675487995147705,
"learning_rate": 1.135680751173709e-05,
"loss": 0.0538,
"step": 16470
},
{
"epoch": 3.87,
"grad_norm": 1.8295910358428955,
"learning_rate": 1.1333333333333334e-05,
"loss": 0.0551,
"step": 16480
},
{
"epoch": 3.87,
"grad_norm": 1.6022156476974487,
"learning_rate": 1.1309859154929578e-05,
"loss": 0.0266,
"step": 16490
},
{
"epoch": 3.87,
"grad_norm": 1.1555670499801636,
"learning_rate": 1.1286384976525822e-05,
"loss": 0.0454,
"step": 16500
},
{
"epoch": 3.87,
"eval_loss": 0.05463627353310585,
"eval_macro/f1": 0.9259127675842223,
"eval_macro/precision": 0.9267482357187499,
"eval_macro/recall": 0.92584653320531,
"eval_micro/f1": 0.9269678302532512,
"eval_micro/precision": 0.9266507013342457,
"eval_micro/recall": 0.927285176309483,
"eval_runtime": 29.5895,
"eval_samples/accuracy": 0.9209859637110579,
"eval_samples_per_second": 493.587,
"eval_steps_per_second": 15.445,
"step": 16500
},
{
"epoch": 3.88,
"grad_norm": 1.2500646114349365,
"learning_rate": 1.1262910798122066e-05,
"loss": 0.0334,
"step": 16510
},
{
"epoch": 3.88,
"grad_norm": 1.2711575031280518,
"learning_rate": 1.123943661971831e-05,
"loss": 0.0318,
"step": 16520
},
{
"epoch": 3.88,
"grad_norm": 1.3326259851455688,
"learning_rate": 1.1215962441314555e-05,
"loss": 0.014,
"step": 16530
},
{
"epoch": 3.88,
"grad_norm": 0.792326033115387,
"learning_rate": 1.1192488262910798e-05,
"loss": 0.0391,
"step": 16540
},
{
"epoch": 3.88,
"grad_norm": 0.5718660354614258,
"learning_rate": 1.1169014084507043e-05,
"loss": 0.0382,
"step": 16550
},
{
"epoch": 3.89,
"grad_norm": 0.5356886982917786,
"learning_rate": 1.1145539906103287e-05,
"loss": 0.0388,
"step": 16560
},
{
"epoch": 3.89,
"grad_norm": 1.5403183698654175,
"learning_rate": 1.1122065727699531e-05,
"loss": 0.0301,
"step": 16570
},
{
"epoch": 3.89,
"grad_norm": 4.721304893493652,
"learning_rate": 1.1098591549295775e-05,
"loss": 0.0655,
"step": 16580
},
{
"epoch": 3.89,
"grad_norm": 1.5049669742584229,
"learning_rate": 1.1075117370892019e-05,
"loss": 0.0249,
"step": 16590
},
{
"epoch": 3.9,
"grad_norm": 0.16248831152915955,
"learning_rate": 1.1051643192488263e-05,
"loss": 0.0391,
"step": 16600
},
{
"epoch": 3.9,
"grad_norm": 1.5317431688308716,
"learning_rate": 1.1028169014084509e-05,
"loss": 0.0336,
"step": 16610
},
{
"epoch": 3.9,
"grad_norm": 4.947154998779297,
"learning_rate": 1.1004694835680751e-05,
"loss": 0.0512,
"step": 16620
},
{
"epoch": 3.9,
"grad_norm": 2.626268148422241,
"learning_rate": 1.0981220657276995e-05,
"loss": 0.0394,
"step": 16630
},
{
"epoch": 3.91,
"grad_norm": 3.1185550689697266,
"learning_rate": 1.095774647887324e-05,
"loss": 0.036,
"step": 16640
},
{
"epoch": 3.91,
"grad_norm": 1.6116607189178467,
"learning_rate": 1.0934272300469483e-05,
"loss": 0.0396,
"step": 16650
},
{
"epoch": 3.91,
"grad_norm": 0.3400775194168091,
"learning_rate": 1.0910798122065728e-05,
"loss": 0.0316,
"step": 16660
},
{
"epoch": 3.91,
"grad_norm": 0.26096880435943604,
"learning_rate": 1.0887323943661972e-05,
"loss": 0.026,
"step": 16670
},
{
"epoch": 3.92,
"grad_norm": 2.0492286682128906,
"learning_rate": 1.0863849765258216e-05,
"loss": 0.0336,
"step": 16680
},
{
"epoch": 3.92,
"grad_norm": 2.7371644973754883,
"learning_rate": 1.084037558685446e-05,
"loss": 0.0474,
"step": 16690
},
{
"epoch": 3.92,
"grad_norm": 0.3516647219657898,
"learning_rate": 1.0816901408450706e-05,
"loss": 0.0233,
"step": 16700
},
{
"epoch": 3.92,
"grad_norm": 1.8178682327270508,
"learning_rate": 1.0793427230046948e-05,
"loss": 0.0429,
"step": 16710
},
{
"epoch": 3.92,
"grad_norm": 0.6841596961021423,
"learning_rate": 1.0769953051643194e-05,
"loss": 0.0171,
"step": 16720
},
{
"epoch": 3.93,
"grad_norm": 0.06779265403747559,
"learning_rate": 1.0746478873239438e-05,
"loss": 0.0233,
"step": 16730
},
{
"epoch": 3.93,
"grad_norm": 1.7634962797164917,
"learning_rate": 1.0723004694835682e-05,
"loss": 0.0191,
"step": 16740
},
{
"epoch": 3.93,
"grad_norm": 1.6839202642440796,
"learning_rate": 1.0699530516431926e-05,
"loss": 0.0508,
"step": 16750
},
{
"epoch": 3.93,
"grad_norm": 0.2583700716495514,
"learning_rate": 1.067605633802817e-05,
"loss": 0.0283,
"step": 16760
},
{
"epoch": 3.94,
"grad_norm": 2.4837722778320312,
"learning_rate": 1.0652582159624414e-05,
"loss": 0.0249,
"step": 16770
},
{
"epoch": 3.94,
"grad_norm": 3.755295753479004,
"learning_rate": 1.0629107981220658e-05,
"loss": 0.0399,
"step": 16780
},
{
"epoch": 3.94,
"grad_norm": 1.3216301202774048,
"learning_rate": 1.0605633802816902e-05,
"loss": 0.0495,
"step": 16790
},
{
"epoch": 3.94,
"grad_norm": 0.964759886264801,
"learning_rate": 1.0582159624413146e-05,
"loss": 0.0277,
"step": 16800
},
{
"epoch": 3.95,
"grad_norm": 0.13690391182899475,
"learning_rate": 1.0558685446009391e-05,
"loss": 0.0453,
"step": 16810
},
{
"epoch": 3.95,
"grad_norm": 0.9749215245246887,
"learning_rate": 1.0535211267605633e-05,
"loss": 0.0202,
"step": 16820
},
{
"epoch": 3.95,
"grad_norm": 1.2910761833190918,
"learning_rate": 1.0511737089201879e-05,
"loss": 0.0255,
"step": 16830
},
{
"epoch": 3.95,
"grad_norm": 1.0894033908843994,
"learning_rate": 1.0488262910798123e-05,
"loss": 0.0286,
"step": 16840
},
{
"epoch": 3.96,
"grad_norm": 0.8152934908866882,
"learning_rate": 1.0464788732394367e-05,
"loss": 0.0375,
"step": 16850
},
{
"epoch": 3.96,
"grad_norm": 2.546592950820923,
"learning_rate": 1.0441314553990611e-05,
"loss": 0.0217,
"step": 16860
},
{
"epoch": 3.96,
"grad_norm": 2.69392466545105,
"learning_rate": 1.0417840375586855e-05,
"loss": 0.0374,
"step": 16870
},
{
"epoch": 3.96,
"grad_norm": 2.5685863494873047,
"learning_rate": 1.0394366197183099e-05,
"loss": 0.039,
"step": 16880
},
{
"epoch": 3.96,
"grad_norm": 2.294158697128296,
"learning_rate": 1.0370892018779343e-05,
"loss": 0.0268,
"step": 16890
},
{
"epoch": 3.97,
"grad_norm": 1.6547008752822876,
"learning_rate": 1.0347417840375587e-05,
"loss": 0.0163,
"step": 16900
},
{
"epoch": 3.97,
"grad_norm": 2.758500576019287,
"learning_rate": 1.032394366197183e-05,
"loss": 0.0382,
"step": 16910
},
{
"epoch": 3.97,
"grad_norm": 0.08954645693302155,
"learning_rate": 1.0300469483568076e-05,
"loss": 0.0409,
"step": 16920
},
{
"epoch": 3.97,
"grad_norm": 1.6631460189819336,
"learning_rate": 1.0276995305164319e-05,
"loss": 0.0429,
"step": 16930
},
{
"epoch": 3.98,
"grad_norm": 0.3024432063102722,
"learning_rate": 1.0253521126760564e-05,
"loss": 0.0466,
"step": 16940
},
{
"epoch": 3.98,
"grad_norm": 2.2123658657073975,
"learning_rate": 1.0230046948356808e-05,
"loss": 0.0389,
"step": 16950
},
{
"epoch": 3.98,
"grad_norm": 2.56610369682312,
"learning_rate": 1.0206572769953052e-05,
"loss": 0.0516,
"step": 16960
},
{
"epoch": 3.98,
"grad_norm": 2.63765549659729,
"learning_rate": 1.0183098591549296e-05,
"loss": 0.0288,
"step": 16970
},
{
"epoch": 3.99,
"grad_norm": 1.9523321390151978,
"learning_rate": 1.015962441314554e-05,
"loss": 0.0274,
"step": 16980
},
{
"epoch": 3.99,
"grad_norm": 1.6789344549179077,
"learning_rate": 1.0136150234741784e-05,
"loss": 0.0283,
"step": 16990
},
{
"epoch": 3.99,
"grad_norm": 2.195377826690674,
"learning_rate": 1.0112676056338028e-05,
"loss": 0.0362,
"step": 17000
},
{
"epoch": 3.99,
"eval_loss": 0.05907091498374939,
"eval_macro/f1": 0.9221598708411745,
"eval_macro/precision": 0.9243403631803216,
"eval_macro/recall": 0.9208585140851037,
"eval_micro/f1": 0.9233931752775113,
"eval_micro/precision": 0.9240896934787081,
"eval_micro/recall": 0.9226977062649777,
"eval_runtime": 28.8245,
"eval_samples/accuracy": 0.915576857240671,
"eval_samples_per_second": 506.687,
"eval_steps_per_second": 15.855,
"step": 17000
},
{
"epoch": 3.99,
"grad_norm": 1.6797447204589844,
"learning_rate": 1.0089201877934274e-05,
"loss": 0.0542,
"step": 17010
},
{
"epoch": 4.0,
"grad_norm": 1.0108383893966675,
"learning_rate": 1.0065727699530516e-05,
"loss": 0.0444,
"step": 17020
},
{
"epoch": 4.0,
"grad_norm": 4.3218183517456055,
"learning_rate": 1.0042253521126762e-05,
"loss": 0.0333,
"step": 17030
},
{
"epoch": 4.0,
"grad_norm": 0.9301080703735352,
"learning_rate": 1.0018779342723004e-05,
"loss": 0.0269,
"step": 17040
},
{
"epoch": 4.0,
"grad_norm": 1.6253621578216553,
"learning_rate": 9.99530516431925e-06,
"loss": 0.0384,
"step": 17050
},
{
"epoch": 4.0,
"grad_norm": 2.4339439868927,
"learning_rate": 9.971830985915494e-06,
"loss": 0.0321,
"step": 17060
},
{
"epoch": 4.01,
"grad_norm": 3.879636287689209,
"learning_rate": 9.948356807511738e-06,
"loss": 0.023,
"step": 17070
},
{
"epoch": 4.01,
"grad_norm": 2.669905424118042,
"learning_rate": 9.924882629107981e-06,
"loss": 0.0337,
"step": 17080
},
{
"epoch": 4.01,
"grad_norm": 0.813624382019043,
"learning_rate": 9.901408450704225e-06,
"loss": 0.0326,
"step": 17090
},
{
"epoch": 4.01,
"grad_norm": 1.2216477394104004,
"learning_rate": 9.87793427230047e-06,
"loss": 0.0185,
"step": 17100
},
{
"epoch": 4.02,
"grad_norm": 1.9044301509857178,
"learning_rate": 9.854460093896713e-06,
"loss": 0.0439,
"step": 17110
},
{
"epoch": 4.02,
"grad_norm": 1.891352653503418,
"learning_rate": 9.830985915492959e-06,
"loss": 0.0384,
"step": 17120
},
{
"epoch": 4.02,
"grad_norm": 1.359215259552002,
"learning_rate": 9.807511737089201e-06,
"loss": 0.0127,
"step": 17130
},
{
"epoch": 4.02,
"grad_norm": 0.31286099553108215,
"learning_rate": 9.784037558685447e-06,
"loss": 0.0182,
"step": 17140
},
{
"epoch": 4.03,
"grad_norm": 1.1283832788467407,
"learning_rate": 9.760563380281691e-06,
"loss": 0.0183,
"step": 17150
},
{
"epoch": 4.03,
"grad_norm": 0.06705621629953384,
"learning_rate": 9.737089201877935e-06,
"loss": 0.0259,
"step": 17160
},
{
"epoch": 4.03,
"grad_norm": 0.1862085461616516,
"learning_rate": 9.713615023474179e-06,
"loss": 0.0183,
"step": 17170
},
{
"epoch": 4.03,
"grad_norm": 2.7586522102355957,
"learning_rate": 9.690140845070424e-06,
"loss": 0.0405,
"step": 17180
},
{
"epoch": 4.04,
"grad_norm": 4.007384777069092,
"learning_rate": 9.666666666666667e-06,
"loss": 0.0303,
"step": 17190
},
{
"epoch": 4.04,
"grad_norm": 1.2727338075637817,
"learning_rate": 9.643192488262912e-06,
"loss": 0.0324,
"step": 17200
},
{
"epoch": 4.04,
"grad_norm": 0.08033546805381775,
"learning_rate": 9.619718309859155e-06,
"loss": 0.031,
"step": 17210
},
{
"epoch": 4.04,
"grad_norm": 4.440250873565674,
"learning_rate": 9.596244131455399e-06,
"loss": 0.04,
"step": 17220
},
{
"epoch": 4.04,
"grad_norm": 0.09834202378988266,
"learning_rate": 9.572769953051644e-06,
"loss": 0.0302,
"step": 17230
},
{
"epoch": 4.05,
"grad_norm": 1.6602915525436401,
"learning_rate": 9.549295774647887e-06,
"loss": 0.0329,
"step": 17240
},
{
"epoch": 4.05,
"grad_norm": 0.055043309926986694,
"learning_rate": 9.525821596244132e-06,
"loss": 0.0307,
"step": 17250
},
{
"epoch": 4.05,
"grad_norm": 1.081846833229065,
"learning_rate": 9.502347417840376e-06,
"loss": 0.0381,
"step": 17260
},
{
"epoch": 4.05,
"grad_norm": 0.8104713559150696,
"learning_rate": 9.47887323943662e-06,
"loss": 0.0359,
"step": 17270
},
{
"epoch": 4.06,
"grad_norm": 0.40220922231674194,
"learning_rate": 9.455399061032864e-06,
"loss": 0.0247,
"step": 17280
},
{
"epoch": 4.06,
"grad_norm": 4.490058898925781,
"learning_rate": 9.43192488262911e-06,
"loss": 0.0616,
"step": 17290
},
{
"epoch": 4.06,
"grad_norm": 1.7871899604797363,
"learning_rate": 9.408450704225352e-06,
"loss": 0.0212,
"step": 17300
},
{
"epoch": 4.06,
"grad_norm": 3.075772523880005,
"learning_rate": 9.384976525821598e-06,
"loss": 0.0312,
"step": 17310
},
{
"epoch": 4.07,
"grad_norm": 3.081249237060547,
"learning_rate": 9.361502347417842e-06,
"loss": 0.0294,
"step": 17320
},
{
"epoch": 4.07,
"grad_norm": 0.10270259529352188,
"learning_rate": 9.338028169014086e-06,
"loss": 0.0138,
"step": 17330
},
{
"epoch": 4.07,
"grad_norm": 2.901857614517212,
"learning_rate": 9.31455399061033e-06,
"loss": 0.046,
"step": 17340
},
{
"epoch": 4.07,
"grad_norm": 1.5097789764404297,
"learning_rate": 9.291079812206572e-06,
"loss": 0.034,
"step": 17350
},
{
"epoch": 4.08,
"grad_norm": 3.355926513671875,
"learning_rate": 9.267605633802817e-06,
"loss": 0.0209,
"step": 17360
},
{
"epoch": 4.08,
"grad_norm": 0.142938494682312,
"learning_rate": 9.244131455399061e-06,
"loss": 0.0305,
"step": 17370
},
{
"epoch": 4.08,
"grad_norm": 1.179494023323059,
"learning_rate": 9.220657276995305e-06,
"loss": 0.0341,
"step": 17380
},
{
"epoch": 4.08,
"grad_norm": 0.5627864599227905,
"learning_rate": 9.19718309859155e-06,
"loss": 0.0428,
"step": 17390
},
{
"epoch": 4.08,
"grad_norm": 1.4507837295532227,
"learning_rate": 9.173708920187795e-06,
"loss": 0.0387,
"step": 17400
},
{
"epoch": 4.09,
"grad_norm": 0.5361911058425903,
"learning_rate": 9.150234741784037e-06,
"loss": 0.0198,
"step": 17410
},
{
"epoch": 4.09,
"grad_norm": 0.12988106906414032,
"learning_rate": 9.126760563380283e-06,
"loss": 0.0191,
"step": 17420
},
{
"epoch": 4.09,
"grad_norm": 0.06474238634109497,
"learning_rate": 9.103286384976527e-06,
"loss": 0.0303,
"step": 17430
},
{
"epoch": 4.09,
"grad_norm": 1.0567501783370972,
"learning_rate": 9.07981220657277e-06,
"loss": 0.0416,
"step": 17440
},
{
"epoch": 4.1,
"grad_norm": 0.2987545132637024,
"learning_rate": 9.056338028169015e-06,
"loss": 0.0256,
"step": 17450
},
{
"epoch": 4.1,
"grad_norm": 4.924028396606445,
"learning_rate": 9.032863849765259e-06,
"loss": 0.0165,
"step": 17460
},
{
"epoch": 4.1,
"grad_norm": 2.6408851146698,
"learning_rate": 9.009389671361503e-06,
"loss": 0.0278,
"step": 17470
},
{
"epoch": 4.1,
"grad_norm": 0.41200679540634155,
"learning_rate": 8.985915492957747e-06,
"loss": 0.0392,
"step": 17480
},
{
"epoch": 4.11,
"grad_norm": 3.0683581829071045,
"learning_rate": 8.96244131455399e-06,
"loss": 0.0311,
"step": 17490
},
{
"epoch": 4.11,
"grad_norm": 0.34089791774749756,
"learning_rate": 8.938967136150235e-06,
"loss": 0.0174,
"step": 17500
},
{
"epoch": 4.11,
"eval_loss": 0.0591842420399189,
"eval_macro/f1": 0.9271865348802009,
"eval_macro/precision": 0.9263216362517599,
"eval_macro/recall": 0.9283185590241645,
"eval_micro/f1": 0.9284102564102564,
"eval_micro/precision": 0.9271423694093547,
"eval_micro/recall": 0.9296816158849709,
"eval_runtime": 28.0804,
"eval_samples/accuracy": 0.9222184183498802,
"eval_samples_per_second": 520.115,
"eval_steps_per_second": 16.275,
"step": 17500
},
{
"epoch": 4.11,
"grad_norm": 0.23726418614387512,
"learning_rate": 8.91549295774648e-06,
"loss": 0.0348,
"step": 17510
},
{
"epoch": 4.11,
"grad_norm": 0.15017840266227722,
"learning_rate": 8.892018779342722e-06,
"loss": 0.0207,
"step": 17520
},
{
"epoch": 4.12,
"grad_norm": 2.7328882217407227,
"learning_rate": 8.868544600938968e-06,
"loss": 0.0293,
"step": 17530
},
{
"epoch": 4.12,
"grad_norm": 0.2770448923110962,
"learning_rate": 8.845070422535212e-06,
"loss": 0.027,
"step": 17540
},
{
"epoch": 4.12,
"grad_norm": 1.4732258319854736,
"learning_rate": 8.821596244131456e-06,
"loss": 0.0481,
"step": 17550
},
{
"epoch": 4.12,
"grad_norm": 2.0700390338897705,
"learning_rate": 8.7981220657277e-06,
"loss": 0.0226,
"step": 17560
},
{
"epoch": 4.12,
"grad_norm": 0.1949704885482788,
"learning_rate": 8.774647887323944e-06,
"loss": 0.0369,
"step": 17570
},
{
"epoch": 4.13,
"grad_norm": 1.3958638906478882,
"learning_rate": 8.751173708920188e-06,
"loss": 0.0326,
"step": 17580
},
{
"epoch": 4.13,
"grad_norm": 2.3745298385620117,
"learning_rate": 8.727699530516432e-06,
"loss": 0.0277,
"step": 17590
},
{
"epoch": 4.13,
"grad_norm": 0.185311421751976,
"learning_rate": 8.704225352112677e-06,
"loss": 0.0203,
"step": 17600
},
{
"epoch": 4.13,
"grad_norm": 2.408078670501709,
"learning_rate": 8.68075117370892e-06,
"loss": 0.0268,
"step": 17610
},
{
"epoch": 4.14,
"grad_norm": 0.8216488361358643,
"learning_rate": 8.657276995305165e-06,
"loss": 0.0319,
"step": 17620
},
{
"epoch": 4.14,
"grad_norm": 0.5919995307922363,
"learning_rate": 8.633802816901408e-06,
"loss": 0.0339,
"step": 17630
},
{
"epoch": 4.14,
"grad_norm": 2.0640909671783447,
"learning_rate": 8.610328638497653e-06,
"loss": 0.0238,
"step": 17640
},
{
"epoch": 4.14,
"grad_norm": 0.6420736908912659,
"learning_rate": 8.586854460093897e-06,
"loss": 0.0241,
"step": 17650
},
{
"epoch": 4.15,
"grad_norm": 2.002295732498169,
"learning_rate": 8.563380281690141e-06,
"loss": 0.0363,
"step": 17660
},
{
"epoch": 4.15,
"grad_norm": 0.12747663259506226,
"learning_rate": 8.539906103286385e-06,
"loss": 0.0212,
"step": 17670
},
{
"epoch": 4.15,
"grad_norm": 1.799019694328308,
"learning_rate": 8.51643192488263e-06,
"loss": 0.0369,
"step": 17680
},
{
"epoch": 4.15,
"grad_norm": 0.7480834722518921,
"learning_rate": 8.492957746478873e-06,
"loss": 0.0383,
"step": 17690
},
{
"epoch": 4.15,
"grad_norm": 1.7629315853118896,
"learning_rate": 8.469483568075117e-06,
"loss": 0.0385,
"step": 17700
},
{
"epoch": 4.16,
"grad_norm": 2.2089579105377197,
"learning_rate": 8.446009389671363e-06,
"loss": 0.0287,
"step": 17710
},
{
"epoch": 4.16,
"grad_norm": 0.43293681740760803,
"learning_rate": 8.422535211267605e-06,
"loss": 0.0293,
"step": 17720
},
{
"epoch": 4.16,
"grad_norm": 0.13332372903823853,
"learning_rate": 8.39906103286385e-06,
"loss": 0.0307,
"step": 17730
},
{
"epoch": 4.16,
"grad_norm": 0.055583804845809937,
"learning_rate": 8.375586854460095e-06,
"loss": 0.0256,
"step": 17740
},
{
"epoch": 4.17,
"grad_norm": 3.303671360015869,
"learning_rate": 8.352112676056339e-06,
"loss": 0.0207,
"step": 17750
},
{
"epoch": 4.17,
"grad_norm": 3.193225383758545,
"learning_rate": 8.328638497652583e-06,
"loss": 0.0242,
"step": 17760
},
{
"epoch": 4.17,
"grad_norm": 0.5734906196594238,
"learning_rate": 8.305164319248828e-06,
"loss": 0.0206,
"step": 17770
},
{
"epoch": 4.17,
"grad_norm": 0.11326544731855392,
"learning_rate": 8.28169014084507e-06,
"loss": 0.0364,
"step": 17780
},
{
"epoch": 4.18,
"grad_norm": 0.0245682280510664,
"learning_rate": 8.258215962441314e-06,
"loss": 0.0187,
"step": 17790
},
{
"epoch": 4.18,
"grad_norm": 0.11930684745311737,
"learning_rate": 8.234741784037558e-06,
"loss": 0.0085,
"step": 17800
},
{
"epoch": 4.18,
"grad_norm": 0.308901309967041,
"learning_rate": 8.211267605633802e-06,
"loss": 0.0146,
"step": 17810
},
{
"epoch": 4.18,
"grad_norm": 0.14321398735046387,
"learning_rate": 8.187793427230048e-06,
"loss": 0.0226,
"step": 17820
},
{
"epoch": 4.19,
"grad_norm": 0.437814325094223,
"learning_rate": 8.16431924882629e-06,
"loss": 0.0427,
"step": 17830
},
{
"epoch": 4.19,
"grad_norm": 4.8848876953125,
"learning_rate": 8.140845070422536e-06,
"loss": 0.0292,
"step": 17840
},
{
"epoch": 4.19,
"grad_norm": 1.6253747940063477,
"learning_rate": 8.11737089201878e-06,
"loss": 0.0385,
"step": 17850
},
{
"epoch": 4.19,
"grad_norm": 0.15667077898979187,
"learning_rate": 8.093896713615024e-06,
"loss": 0.0305,
"step": 17860
},
{
"epoch": 4.19,
"grad_norm": 0.06896039843559265,
"learning_rate": 8.070422535211268e-06,
"loss": 0.0158,
"step": 17870
},
{
"epoch": 4.2,
"grad_norm": 0.7833272218704224,
"learning_rate": 8.046948356807513e-06,
"loss": 0.0332,
"step": 17880
},
{
"epoch": 4.2,
"grad_norm": 2.794677495956421,
"learning_rate": 8.023474178403756e-06,
"loss": 0.0243,
"step": 17890
},
{
"epoch": 4.2,
"grad_norm": 0.9475429654121399,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0249,
"step": 17900
},
{
"epoch": 4.2,
"grad_norm": 4.429962158203125,
"learning_rate": 7.976525821596245e-06,
"loss": 0.0311,
"step": 17910
},
{
"epoch": 4.21,
"grad_norm": 0.1972973346710205,
"learning_rate": 7.95305164319249e-06,
"loss": 0.0299,
"step": 17920
},
{
"epoch": 4.21,
"grad_norm": 0.13434672355651855,
"learning_rate": 7.929577464788733e-06,
"loss": 0.0255,
"step": 17930
},
{
"epoch": 4.21,
"grad_norm": 0.22750389575958252,
"learning_rate": 7.906103286384975e-06,
"loss": 0.0242,
"step": 17940
},
{
"epoch": 4.21,
"grad_norm": 1.5128264427185059,
"learning_rate": 7.882629107981221e-06,
"loss": 0.0158,
"step": 17950
},
{
"epoch": 4.22,
"grad_norm": 0.3299107551574707,
"learning_rate": 7.859154929577465e-06,
"loss": 0.0355,
"step": 17960
},
{
"epoch": 4.22,
"grad_norm": 1.590198040008545,
"learning_rate": 7.835680751173709e-06,
"loss": 0.0358,
"step": 17970
},
{
"epoch": 4.22,
"grad_norm": 0.29909548163414,
"learning_rate": 7.812206572769953e-06,
"loss": 0.0238,
"step": 17980
},
{
"epoch": 4.22,
"grad_norm": 0.14423009753227234,
"learning_rate": 7.788732394366199e-06,
"loss": 0.0218,
"step": 17990
},
{
"epoch": 4.23,
"grad_norm": 1.6336926221847534,
"learning_rate": 7.765258215962441e-06,
"loss": 0.0149,
"step": 18000
},
{
"epoch": 4.23,
"eval_loss": 0.06184852123260498,
"eval_macro/f1": 0.9275533285987433,
"eval_macro/precision": 0.927621636500257,
"eval_macro/recall": 0.9283999031694936,
"eval_micro/f1": 0.9285030103995622,
"eval_micro/precision": 0.9278047446503043,
"eval_micro/recall": 0.9292023279698733,
"eval_runtime": 29.8927,
"eval_samples/accuracy": 0.9221499486477234,
"eval_samples_per_second": 488.58,
"eval_steps_per_second": 15.288,
"step": 18000
},
{
"epoch": 4.23,
"grad_norm": 7.740451335906982,
"learning_rate": 7.741784037558687e-06,
"loss": 0.0434,
"step": 18010
},
{
"epoch": 4.23,
"grad_norm": 3.4866607189178467,
"learning_rate": 7.71830985915493e-06,
"loss": 0.0362,
"step": 18020
},
{
"epoch": 4.23,
"grad_norm": 3.1101765632629395,
"learning_rate": 7.694835680751174e-06,
"loss": 0.0091,
"step": 18030
},
{
"epoch": 4.23,
"grad_norm": 2.40651798248291,
"learning_rate": 7.671361502347418e-06,
"loss": 0.0197,
"step": 18040
},
{
"epoch": 4.24,
"grad_norm": 4.570631980895996,
"learning_rate": 7.647887323943662e-06,
"loss": 0.0127,
"step": 18050
},
{
"epoch": 4.24,
"grad_norm": 1.8891502618789673,
"learning_rate": 7.624413145539906e-06,
"loss": 0.0386,
"step": 18060
},
{
"epoch": 4.24,
"grad_norm": 2.7779672145843506,
"learning_rate": 7.600938967136151e-06,
"loss": 0.0221,
"step": 18070
},
{
"epoch": 4.24,
"grad_norm": 1.5719915628433228,
"learning_rate": 7.577464788732394e-06,
"loss": 0.024,
"step": 18080
},
{
"epoch": 4.25,
"grad_norm": 2.32023286819458,
"learning_rate": 7.553990610328639e-06,
"loss": 0.0352,
"step": 18090
},
{
"epoch": 4.25,
"grad_norm": 0.3694073259830475,
"learning_rate": 7.530516431924883e-06,
"loss": 0.021,
"step": 18100
},
{
"epoch": 4.25,
"grad_norm": 1.49236261844635,
"learning_rate": 7.507042253521126e-06,
"loss": 0.032,
"step": 18110
},
{
"epoch": 4.25,
"grad_norm": 0.02265837788581848,
"learning_rate": 7.483568075117371e-06,
"loss": 0.0375,
"step": 18120
},
{
"epoch": 4.26,
"grad_norm": 0.5602453947067261,
"learning_rate": 7.460093896713616e-06,
"loss": 0.0295,
"step": 18130
},
{
"epoch": 4.26,
"grad_norm": 0.8204029202461243,
"learning_rate": 7.436619718309859e-06,
"loss": 0.0221,
"step": 18140
},
{
"epoch": 4.26,
"grad_norm": 0.03510039299726486,
"learning_rate": 7.413145539906104e-06,
"loss": 0.0392,
"step": 18150
},
{
"epoch": 4.26,
"grad_norm": 0.16863106191158295,
"learning_rate": 7.3896713615023485e-06,
"loss": 0.0119,
"step": 18160
},
{
"epoch": 4.27,
"grad_norm": 0.08002981543540955,
"learning_rate": 7.366197183098592e-06,
"loss": 0.0129,
"step": 18170
},
{
"epoch": 4.27,
"grad_norm": 2.261359214782715,
"learning_rate": 7.342723004694836e-06,
"loss": 0.0292,
"step": 18180
},
{
"epoch": 4.27,
"grad_norm": 3.8216466903686523,
"learning_rate": 7.31924882629108e-06,
"loss": 0.0416,
"step": 18190
},
{
"epoch": 4.27,
"grad_norm": 0.11059773713350296,
"learning_rate": 7.295774647887324e-06,
"loss": 0.0225,
"step": 18200
},
{
"epoch": 4.27,
"grad_norm": 2.422400712966919,
"learning_rate": 7.272300469483568e-06,
"loss": 0.0331,
"step": 18210
},
{
"epoch": 4.28,
"grad_norm": 1.4523348808288574,
"learning_rate": 7.248826291079812e-06,
"loss": 0.0205,
"step": 18220
},
{
"epoch": 4.28,
"grad_norm": 2.5690364837646484,
"learning_rate": 7.225352112676056e-06,
"loss": 0.0328,
"step": 18230
},
{
"epoch": 4.28,
"grad_norm": 1.9039969444274902,
"learning_rate": 7.201877934272301e-06,
"loss": 0.0307,
"step": 18240
},
{
"epoch": 4.28,
"grad_norm": 3.139665365219116,
"learning_rate": 7.178403755868544e-06,
"loss": 0.0299,
"step": 18250
},
{
"epoch": 4.29,
"grad_norm": 0.07591520249843597,
"learning_rate": 7.154929577464789e-06,
"loss": 0.0228,
"step": 18260
},
{
"epoch": 4.29,
"grad_norm": 0.07442035526037216,
"learning_rate": 7.131455399061034e-06,
"loss": 0.0245,
"step": 18270
},
{
"epoch": 4.29,
"grad_norm": 0.11448445916175842,
"learning_rate": 7.107981220657277e-06,
"loss": 0.0265,
"step": 18280
},
{
"epoch": 4.29,
"grad_norm": 2.6437320709228516,
"learning_rate": 7.084507042253522e-06,
"loss": 0.0238,
"step": 18290
},
{
"epoch": 4.3,
"grad_norm": 1.5147851705551147,
"learning_rate": 7.0610328638497664e-06,
"loss": 0.0258,
"step": 18300
},
{
"epoch": 4.3,
"grad_norm": 0.25056353211402893,
"learning_rate": 7.0375586854460096e-06,
"loss": 0.0129,
"step": 18310
},
{
"epoch": 4.3,
"grad_norm": 2.1855599880218506,
"learning_rate": 7.014084507042254e-06,
"loss": 0.0317,
"step": 18320
},
{
"epoch": 4.3,
"grad_norm": 3.4039382934570312,
"learning_rate": 6.990610328638498e-06,
"loss": 0.0191,
"step": 18330
},
{
"epoch": 4.31,
"grad_norm": 0.2336108237504959,
"learning_rate": 6.9671361502347414e-06,
"loss": 0.0323,
"step": 18340
},
{
"epoch": 4.31,
"grad_norm": 0.5614345669746399,
"learning_rate": 6.943661971830986e-06,
"loss": 0.0338,
"step": 18350
},
{
"epoch": 4.31,
"grad_norm": 2.2648632526397705,
"learning_rate": 6.920187793427231e-06,
"loss": 0.0264,
"step": 18360
},
{
"epoch": 4.31,
"grad_norm": 1.8318125009536743,
"learning_rate": 6.896713615023474e-06,
"loss": 0.0356,
"step": 18370
},
{
"epoch": 4.31,
"grad_norm": 0.24477006494998932,
"learning_rate": 6.873239436619719e-06,
"loss": 0.011,
"step": 18380
},
{
"epoch": 4.32,
"grad_norm": 0.3729543685913086,
"learning_rate": 6.849765258215962e-06,
"loss": 0.04,
"step": 18390
},
{
"epoch": 4.32,
"grad_norm": 2.3714241981506348,
"learning_rate": 6.826291079812207e-06,
"loss": 0.0194,
"step": 18400
},
{
"epoch": 4.32,
"grad_norm": 8.464815139770508,
"learning_rate": 6.802816901408452e-06,
"loss": 0.0304,
"step": 18410
},
{
"epoch": 4.32,
"grad_norm": 0.38195595145225525,
"learning_rate": 6.779342723004695e-06,
"loss": 0.0253,
"step": 18420
},
{
"epoch": 4.33,
"grad_norm": 2.6723573207855225,
"learning_rate": 6.75586854460094e-06,
"loss": 0.047,
"step": 18430
},
{
"epoch": 4.33,
"grad_norm": 0.3452346920967102,
"learning_rate": 6.7323943661971836e-06,
"loss": 0.0235,
"step": 18440
},
{
"epoch": 4.33,
"grad_norm": 0.28022298216819763,
"learning_rate": 6.7089201877934275e-06,
"loss": 0.0353,
"step": 18450
},
{
"epoch": 4.33,
"grad_norm": 0.5109660625457764,
"learning_rate": 6.6854460093896715e-06,
"loss": 0.0281,
"step": 18460
},
{
"epoch": 4.34,
"grad_norm": 3.8708913326263428,
"learning_rate": 6.661971830985916e-06,
"loss": 0.0285,
"step": 18470
},
{
"epoch": 4.34,
"grad_norm": 1.4034435749053955,
"learning_rate": 6.638497652582159e-06,
"loss": 0.0391,
"step": 18480
},
{
"epoch": 4.34,
"grad_norm": 1.097798466682434,
"learning_rate": 6.615023474178404e-06,
"loss": 0.0405,
"step": 18490
},
{
"epoch": 4.34,
"grad_norm": 3.1356542110443115,
"learning_rate": 6.591549295774649e-06,
"loss": 0.018,
"step": 18500
},
{
"epoch": 4.34,
"eval_loss": 0.060135386884212494,
"eval_macro/f1": 0.927526156864245,
"eval_macro/precision": 0.9275914439000685,
"eval_macro/recall": 0.927618083612261,
"eval_micro/f1": 0.9285934238888699,
"eval_micro/precision": 0.9280536178361374,
"eval_micro/recall": 0.9291338582677166,
"eval_runtime": 28.74,
"eval_samples/accuracy": 0.922286888052037,
"eval_samples_per_second": 508.177,
"eval_steps_per_second": 15.901,
"step": 18500
},
{
"epoch": 4.35,
"grad_norm": 2.763146162033081,
"learning_rate": 6.568075117370892e-06,
"loss": 0.0352,
"step": 18510
},
{
"epoch": 4.35,
"grad_norm": 0.015551486052572727,
"learning_rate": 6.544600938967137e-06,
"loss": 0.0163,
"step": 18520
},
{
"epoch": 4.35,
"grad_norm": 2.646799325942993,
"learning_rate": 6.52112676056338e-06,
"loss": 0.0225,
"step": 18530
},
{
"epoch": 4.35,
"grad_norm": 0.29067888855934143,
"learning_rate": 6.497652582159625e-06,
"loss": 0.0174,
"step": 18540
},
{
"epoch": 4.35,
"grad_norm": 0.14220267534255981,
"learning_rate": 6.474178403755869e-06,
"loss": 0.0196,
"step": 18550
},
{
"epoch": 4.36,
"grad_norm": 0.1823500692844391,
"learning_rate": 6.450704225352113e-06,
"loss": 0.0115,
"step": 18560
},
{
"epoch": 4.36,
"grad_norm": 0.6547729969024658,
"learning_rate": 6.427230046948357e-06,
"loss": 0.037,
"step": 18570
},
{
"epoch": 4.36,
"grad_norm": 0.19114431738853455,
"learning_rate": 6.4037558685446015e-06,
"loss": 0.0221,
"step": 18580
},
{
"epoch": 4.36,
"grad_norm": 4.823287010192871,
"learning_rate": 6.380281690140845e-06,
"loss": 0.0166,
"step": 18590
},
{
"epoch": 4.37,
"grad_norm": 0.13954699039459229,
"learning_rate": 6.3568075117370894e-06,
"loss": 0.0455,
"step": 18600
},
{
"epoch": 4.37,
"grad_norm": 0.08931886404752731,
"learning_rate": 6.333333333333334e-06,
"loss": 0.0354,
"step": 18610
},
{
"epoch": 4.37,
"grad_norm": 3.5649523735046387,
"learning_rate": 6.309859154929577e-06,
"loss": 0.024,
"step": 18620
},
{
"epoch": 4.37,
"grad_norm": 2.356419324874878,
"learning_rate": 6.286384976525822e-06,
"loss": 0.0164,
"step": 18630
},
{
"epoch": 4.38,
"grad_norm": 1.0465301275253296,
"learning_rate": 6.262910798122067e-06,
"loss": 0.0258,
"step": 18640
},
{
"epoch": 4.38,
"grad_norm": 0.01890731416642666,
"learning_rate": 6.23943661971831e-06,
"loss": 0.0142,
"step": 18650
},
{
"epoch": 4.38,
"grad_norm": 2.5224521160125732,
"learning_rate": 6.215962441314555e-06,
"loss": 0.032,
"step": 18660
},
{
"epoch": 4.38,
"grad_norm": 1.1495766639709473,
"learning_rate": 6.192488262910799e-06,
"loss": 0.0314,
"step": 18670
},
{
"epoch": 4.38,
"grad_norm": 0.7142943143844604,
"learning_rate": 6.169014084507042e-06,
"loss": 0.0272,
"step": 18680
},
{
"epoch": 4.39,
"grad_norm": 0.24826817214488983,
"learning_rate": 6.145539906103287e-06,
"loss": 0.0187,
"step": 18690
},
{
"epoch": 4.39,
"grad_norm": 0.063688263297081,
"learning_rate": 6.122065727699531e-06,
"loss": 0.0208,
"step": 18700
},
{
"epoch": 4.39,
"grad_norm": 2.3652358055114746,
"learning_rate": 6.098591549295775e-06,
"loss": 0.035,
"step": 18710
},
{
"epoch": 4.39,
"grad_norm": 0.7242918014526367,
"learning_rate": 6.075117370892019e-06,
"loss": 0.0352,
"step": 18720
},
{
"epoch": 4.4,
"grad_norm": 4.603453636169434,
"learning_rate": 6.0516431924882634e-06,
"loss": 0.0543,
"step": 18730
},
{
"epoch": 4.4,
"grad_norm": 0.45003360509872437,
"learning_rate": 6.028169014084507e-06,
"loss": 0.0233,
"step": 18740
},
{
"epoch": 4.4,
"grad_norm": 2.918506383895874,
"learning_rate": 6.004694835680751e-06,
"loss": 0.0357,
"step": 18750
},
{
"epoch": 4.4,
"grad_norm": 2.2364795207977295,
"learning_rate": 5.981220657276996e-06,
"loss": 0.0345,
"step": 18760
},
{
"epoch": 4.41,
"grad_norm": 1.1748833656311035,
"learning_rate": 5.95774647887324e-06,
"loss": 0.031,
"step": 18770
},
{
"epoch": 4.41,
"grad_norm": 0.20631471276283264,
"learning_rate": 5.934272300469484e-06,
"loss": 0.0411,
"step": 18780
},
{
"epoch": 4.41,
"grad_norm": 2.653383255004883,
"learning_rate": 5.910798122065728e-06,
"loss": 0.0313,
"step": 18790
},
{
"epoch": 4.41,
"grad_norm": 3.3780014514923096,
"learning_rate": 5.887323943661972e-06,
"loss": 0.0299,
"step": 18800
},
{
"epoch": 4.42,
"grad_norm": 0.190080463886261,
"learning_rate": 5.863849765258216e-06,
"loss": 0.0246,
"step": 18810
},
{
"epoch": 4.42,
"grad_norm": 3.175036907196045,
"learning_rate": 5.84037558685446e-06,
"loss": 0.0198,
"step": 18820
},
{
"epoch": 4.42,
"grad_norm": 1.293860912322998,
"learning_rate": 5.816901408450705e-06,
"loss": 0.0413,
"step": 18830
},
{
"epoch": 4.42,
"grad_norm": 3.4616804122924805,
"learning_rate": 5.793427230046949e-06,
"loss": 0.0294,
"step": 18840
},
{
"epoch": 4.42,
"grad_norm": 0.1567220538854599,
"learning_rate": 5.769953051643193e-06,
"loss": 0.0348,
"step": 18850
},
{
"epoch": 4.43,
"grad_norm": 0.14279071986675262,
"learning_rate": 5.746478873239437e-06,
"loss": 0.0112,
"step": 18860
},
{
"epoch": 4.43,
"grad_norm": 4.581559181213379,
"learning_rate": 5.723004694835681e-06,
"loss": 0.0398,
"step": 18870
},
{
"epoch": 4.43,
"grad_norm": 1.2094709873199463,
"learning_rate": 5.699530516431925e-06,
"loss": 0.0274,
"step": 18880
},
{
"epoch": 4.43,
"grad_norm": 1.6614265441894531,
"learning_rate": 5.676056338028169e-06,
"loss": 0.0513,
"step": 18890
},
{
"epoch": 4.44,
"grad_norm": 1.3133680820465088,
"learning_rate": 5.652582159624414e-06,
"loss": 0.0221,
"step": 18900
},
{
"epoch": 4.44,
"grad_norm": 1.085091233253479,
"learning_rate": 5.629107981220657e-06,
"loss": 0.0297,
"step": 18910
},
{
"epoch": 4.44,
"grad_norm": 0.05348065122961998,
"learning_rate": 5.605633802816901e-06,
"loss": 0.0119,
"step": 18920
},
{
"epoch": 4.44,
"grad_norm": 0.11949329078197479,
"learning_rate": 5.582159624413145e-06,
"loss": 0.023,
"step": 18930
},
{
"epoch": 4.45,
"grad_norm": 0.890052080154419,
"learning_rate": 5.55868544600939e-06,
"loss": 0.0343,
"step": 18940
},
{
"epoch": 4.45,
"grad_norm": 2.084806442260742,
"learning_rate": 5.535211267605634e-06,
"loss": 0.0165,
"step": 18950
},
{
"epoch": 4.45,
"grad_norm": 0.42421096563339233,
"learning_rate": 5.511737089201878e-06,
"loss": 0.0273,
"step": 18960
},
{
"epoch": 4.45,
"grad_norm": 4.482558250427246,
"learning_rate": 5.488262910798123e-06,
"loss": 0.0347,
"step": 18970
},
{
"epoch": 4.46,
"grad_norm": 1.6675491333007812,
"learning_rate": 5.464788732394367e-06,
"loss": 0.0365,
"step": 18980
},
{
"epoch": 4.46,
"grad_norm": 0.973112940788269,
"learning_rate": 5.441314553990611e-06,
"loss": 0.0287,
"step": 18990
},
{
"epoch": 4.46,
"grad_norm": 1.0578975677490234,
"learning_rate": 5.4178403755868546e-06,
"loss": 0.0237,
"step": 19000
},
{
"epoch": 4.46,
"eval_loss": 0.05804529786109924,
"eval_macro/f1": 0.9300467382231257,
"eval_macro/precision": 0.9304321443218644,
"eval_macro/recall": 0.9299328529924998,
"eval_micro/f1": 0.9309872655073258,
"eval_micro/precision": 0.9309235298144725,
"eval_micro/recall": 0.9310510099281069,
"eval_runtime": 29.2424,
"eval_samples/accuracy": 0.9250256761383088,
"eval_samples_per_second": 499.446,
"eval_steps_per_second": 15.628,
"step": 19000
},
{
"epoch": 4.46,
"grad_norm": 0.13687680661678314,
"learning_rate": 5.394366197183099e-06,
"loss": 0.0209,
"step": 19010
},
{
"epoch": 4.46,
"grad_norm": 0.7590184807777405,
"learning_rate": 5.370892018779343e-06,
"loss": 0.0328,
"step": 19020
},
{
"epoch": 4.47,
"grad_norm": 2.366809606552124,
"learning_rate": 5.347417840375587e-06,
"loss": 0.0288,
"step": 19030
},
{
"epoch": 4.47,
"grad_norm": 0.280494749546051,
"learning_rate": 5.323943661971831e-06,
"loss": 0.0253,
"step": 19040
},
{
"epoch": 4.47,
"grad_norm": 2.3025805950164795,
"learning_rate": 5.300469483568075e-06,
"loss": 0.014,
"step": 19050
},
{
"epoch": 4.47,
"grad_norm": 1.3467135429382324,
"learning_rate": 5.276995305164319e-06,
"loss": 0.0291,
"step": 19060
},
{
"epoch": 4.48,
"grad_norm": 0.5404326915740967,
"learning_rate": 5.253521126760563e-06,
"loss": 0.0359,
"step": 19070
},
{
"epoch": 4.48,
"grad_norm": 1.894284725189209,
"learning_rate": 5.230046948356808e-06,
"loss": 0.0294,
"step": 19080
},
{
"epoch": 4.48,
"grad_norm": 0.44013938307762146,
"learning_rate": 5.206572769953052e-06,
"loss": 0.0294,
"step": 19090
},
{
"epoch": 4.48,
"grad_norm": 0.9481649398803711,
"learning_rate": 5.183098591549296e-06,
"loss": 0.016,
"step": 19100
},
{
"epoch": 4.49,
"grad_norm": 2.767650604248047,
"learning_rate": 5.159624413145541e-06,
"loss": 0.0107,
"step": 19110
},
{
"epoch": 4.49,
"grad_norm": 0.19418075680732727,
"learning_rate": 5.136150234741785e-06,
"loss": 0.0288,
"step": 19120
},
{
"epoch": 4.49,
"grad_norm": 3.5764660835266113,
"learning_rate": 5.1126760563380286e-06,
"loss": 0.0385,
"step": 19130
},
{
"epoch": 4.49,
"grad_norm": 0.20306912064552307,
"learning_rate": 5.0892018779342725e-06,
"loss": 0.0276,
"step": 19140
},
{
"epoch": 4.5,
"grad_norm": 0.028659334406256676,
"learning_rate": 5.0657276995305165e-06,
"loss": 0.021,
"step": 19150
},
{
"epoch": 4.5,
"grad_norm": 3.5171916484832764,
"learning_rate": 5.0422535211267604e-06,
"loss": 0.0216,
"step": 19160
},
{
"epoch": 4.5,
"grad_norm": 0.7346154451370239,
"learning_rate": 5.018779342723004e-06,
"loss": 0.0361,
"step": 19170
},
{
"epoch": 4.5,
"grad_norm": 0.5785157680511475,
"learning_rate": 4.995305164319249e-06,
"loss": 0.0248,
"step": 19180
},
{
"epoch": 4.5,
"grad_norm": 1.10182785987854,
"learning_rate": 4.971830985915493e-06,
"loss": 0.0284,
"step": 19190
},
{
"epoch": 4.51,
"grad_norm": 0.02137897163629532,
"learning_rate": 4.948356807511737e-06,
"loss": 0.0266,
"step": 19200
},
{
"epoch": 4.51,
"grad_norm": 0.47869399189949036,
"learning_rate": 4.924882629107982e-06,
"loss": 0.0344,
"step": 19210
},
{
"epoch": 4.51,
"grad_norm": 3.633575916290283,
"learning_rate": 4.901408450704226e-06,
"loss": 0.0205,
"step": 19220
},
{
"epoch": 4.51,
"grad_norm": 3.7721774578094482,
"learning_rate": 4.87793427230047e-06,
"loss": 0.038,
"step": 19230
},
{
"epoch": 4.52,
"grad_norm": 2.5708956718444824,
"learning_rate": 4.854460093896714e-06,
"loss": 0.0364,
"step": 19240
},
{
"epoch": 4.52,
"grad_norm": 0.7661235332489014,
"learning_rate": 4.830985915492959e-06,
"loss": 0.0297,
"step": 19250
},
{
"epoch": 4.52,
"grad_norm": 0.4676876366138458,
"learning_rate": 4.807511737089202e-06,
"loss": 0.026,
"step": 19260
},
{
"epoch": 4.52,
"grad_norm": 1.8789290189743042,
"learning_rate": 4.784037558685446e-06,
"loss": 0.0343,
"step": 19270
},
{
"epoch": 4.53,
"grad_norm": 4.177642822265625,
"learning_rate": 4.7605633802816905e-06,
"loss": 0.0264,
"step": 19280
},
{
"epoch": 4.53,
"grad_norm": 1.572719931602478,
"learning_rate": 4.7370892018779344e-06,
"loss": 0.0475,
"step": 19290
},
{
"epoch": 4.53,
"grad_norm": 2.2648651599884033,
"learning_rate": 4.713615023474178e-06,
"loss": 0.0262,
"step": 19300
},
{
"epoch": 4.53,
"grad_norm": 0.05855144187808037,
"learning_rate": 4.690140845070422e-06,
"loss": 0.0078,
"step": 19310
},
{
"epoch": 4.54,
"grad_norm": 0.0676891878247261,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0285,
"step": 19320
},
{
"epoch": 4.54,
"grad_norm": 2.633129596710205,
"learning_rate": 4.643192488262911e-06,
"loss": 0.0277,
"step": 19330
},
{
"epoch": 4.54,
"grad_norm": 4.038257598876953,
"learning_rate": 4.619718309859155e-06,
"loss": 0.026,
"step": 19340
},
{
"epoch": 4.54,
"grad_norm": 1.5584205389022827,
"learning_rate": 4.5962441314554e-06,
"loss": 0.0154,
"step": 19350
},
{
"epoch": 4.54,
"grad_norm": 2.5847558975219727,
"learning_rate": 4.572769953051644e-06,
"loss": 0.0115,
"step": 19360
},
{
"epoch": 4.55,
"grad_norm": 0.19932971894741058,
"learning_rate": 4.549295774647888e-06,
"loss": 0.03,
"step": 19370
},
{
"epoch": 4.55,
"grad_norm": 2.389786720275879,
"learning_rate": 4.525821596244132e-06,
"loss": 0.0281,
"step": 19380
},
{
"epoch": 4.55,
"grad_norm": 5.445662975311279,
"learning_rate": 4.502347417840376e-06,
"loss": 0.0157,
"step": 19390
},
{
"epoch": 4.55,
"grad_norm": 0.8284715414047241,
"learning_rate": 4.47887323943662e-06,
"loss": 0.0417,
"step": 19400
},
{
"epoch": 4.56,
"grad_norm": 0.39633145928382874,
"learning_rate": 4.455399061032864e-06,
"loss": 0.0254,
"step": 19410
},
{
"epoch": 4.56,
"grad_norm": 3.238429069519043,
"learning_rate": 4.4319248826291084e-06,
"loss": 0.0439,
"step": 19420
},
{
"epoch": 4.56,
"grad_norm": 0.1122848391532898,
"learning_rate": 4.408450704225352e-06,
"loss": 0.0215,
"step": 19430
},
{
"epoch": 4.56,
"grad_norm": 2.4247405529022217,
"learning_rate": 4.384976525821596e-06,
"loss": 0.0115,
"step": 19440
},
{
"epoch": 4.57,
"grad_norm": 2.5062484741210938,
"learning_rate": 4.36150234741784e-06,
"loss": 0.037,
"step": 19450
},
{
"epoch": 4.57,
"grad_norm": 2.332712411880493,
"learning_rate": 4.338028169014085e-06,
"loss": 0.0327,
"step": 19460
},
{
"epoch": 4.57,
"grad_norm": 2.1124820709228516,
"learning_rate": 4.314553990610329e-06,
"loss": 0.0244,
"step": 19470
},
{
"epoch": 4.57,
"grad_norm": 0.824579656124115,
"learning_rate": 4.291079812206573e-06,
"loss": 0.024,
"step": 19480
},
{
"epoch": 4.58,
"grad_norm": 1.3833364248275757,
"learning_rate": 4.267605633802817e-06,
"loss": 0.0387,
"step": 19490
},
{
"epoch": 4.58,
"grad_norm": 0.021480737254023552,
"learning_rate": 4.244131455399061e-06,
"loss": 0.0085,
"step": 19500
},
{
"epoch": 4.58,
"eval_loss": 0.056427136063575745,
"eval_macro/f1": 0.9295635716052294,
"eval_macro/precision": 0.9300451915383702,
"eval_macro/recall": 0.9294580707864561,
"eval_micro/f1": 0.9307310392056155,
"eval_micro/precision": 0.9308904109589041,
"eval_micro/recall": 0.9305717220130092,
"eval_runtime": 29.8549,
"eval_samples/accuracy": 0.923313933584389,
"eval_samples_per_second": 489.2,
"eval_steps_per_second": 15.307,
"step": 19500
},
{
"epoch": 4.58,
"grad_norm": 0.8990393280982971,
"learning_rate": 4.220657276995305e-06,
"loss": 0.0254,
"step": 19510
},
{
"epoch": 4.58,
"grad_norm": 3.251713514328003,
"learning_rate": 4.197183098591549e-06,
"loss": 0.0367,
"step": 19520
},
{
"epoch": 4.58,
"grad_norm": 1.872117280960083,
"learning_rate": 4.173708920187794e-06,
"loss": 0.0244,
"step": 19530
},
{
"epoch": 4.59,
"grad_norm": 0.2993544340133667,
"learning_rate": 4.150234741784038e-06,
"loss": 0.0209,
"step": 19540
},
{
"epoch": 4.59,
"grad_norm": 2.650440216064453,
"learning_rate": 4.126760563380282e-06,
"loss": 0.019,
"step": 19550
},
{
"epoch": 4.59,
"grad_norm": 3.6848528385162354,
"learning_rate": 4.103286384976526e-06,
"loss": 0.0194,
"step": 19560
},
{
"epoch": 4.59,
"grad_norm": 2.082089424133301,
"learning_rate": 4.07981220657277e-06,
"loss": 0.0293,
"step": 19570
},
{
"epoch": 4.6,
"grad_norm": 2.0846469402313232,
"learning_rate": 4.056338028169014e-06,
"loss": 0.0336,
"step": 19580
},
{
"epoch": 4.6,
"grad_norm": 1.2705551385879517,
"learning_rate": 4.032863849765258e-06,
"loss": 0.0237,
"step": 19590
},
{
"epoch": 4.6,
"grad_norm": 1.0715053081512451,
"learning_rate": 4.009389671361503e-06,
"loss": 0.0205,
"step": 19600
},
{
"epoch": 4.6,
"grad_norm": 2.3154516220092773,
"learning_rate": 3.985915492957747e-06,
"loss": 0.0349,
"step": 19610
},
{
"epoch": 4.61,
"grad_norm": 2.9410946369171143,
"learning_rate": 3.96244131455399e-06,
"loss": 0.0232,
"step": 19620
},
{
"epoch": 4.61,
"grad_norm": 0.5876528024673462,
"learning_rate": 3.938967136150235e-06,
"loss": 0.0327,
"step": 19630
},
{
"epoch": 4.61,
"grad_norm": 0.43626344203948975,
"learning_rate": 3.915492957746479e-06,
"loss": 0.0124,
"step": 19640
},
{
"epoch": 4.61,
"grad_norm": 2.820923328399658,
"learning_rate": 3.892018779342723e-06,
"loss": 0.0136,
"step": 19650
},
{
"epoch": 4.62,
"grad_norm": 3.3606836795806885,
"learning_rate": 3.868544600938967e-06,
"loss": 0.0295,
"step": 19660
},
{
"epoch": 4.62,
"grad_norm": 3.3373594284057617,
"learning_rate": 3.845070422535212e-06,
"loss": 0.0289,
"step": 19670
},
{
"epoch": 4.62,
"grad_norm": 0.04505685716867447,
"learning_rate": 3.821596244131456e-06,
"loss": 0.0103,
"step": 19680
},
{
"epoch": 4.62,
"grad_norm": 0.7895365357398987,
"learning_rate": 3.7981220657276996e-06,
"loss": 0.0145,
"step": 19690
},
{
"epoch": 4.62,
"grad_norm": 0.8932519555091858,
"learning_rate": 3.774647887323944e-06,
"loss": 0.0316,
"step": 19700
},
{
"epoch": 4.63,
"grad_norm": 0.3159659504890442,
"learning_rate": 3.751173708920188e-06,
"loss": 0.0216,
"step": 19710
},
{
"epoch": 4.63,
"grad_norm": 2.177913188934326,
"learning_rate": 3.727699530516432e-06,
"loss": 0.0212,
"step": 19720
},
{
"epoch": 4.63,
"grad_norm": 1.621932864189148,
"learning_rate": 3.7042253521126767e-06,
"loss": 0.0316,
"step": 19730
},
{
"epoch": 4.63,
"grad_norm": 0.0901319831609726,
"learning_rate": 3.6807511737089206e-06,
"loss": 0.0254,
"step": 19740
},
{
"epoch": 4.64,
"grad_norm": 2.3561832904815674,
"learning_rate": 3.6572769953051646e-06,
"loss": 0.043,
"step": 19750
},
{
"epoch": 4.64,
"grad_norm": 1.95290207862854,
"learning_rate": 3.6338028169014085e-06,
"loss": 0.0276,
"step": 19760
},
{
"epoch": 4.64,
"grad_norm": 2.1155362129211426,
"learning_rate": 3.610328638497653e-06,
"loss": 0.0475,
"step": 19770
},
{
"epoch": 4.64,
"grad_norm": 0.561687707901001,
"learning_rate": 3.586854460093897e-06,
"loss": 0.0227,
"step": 19780
},
{
"epoch": 4.65,
"grad_norm": 1.620744228363037,
"learning_rate": 3.563380281690141e-06,
"loss": 0.042,
"step": 19790
},
{
"epoch": 4.65,
"grad_norm": 3.074017286300659,
"learning_rate": 3.5399061032863856e-06,
"loss": 0.0436,
"step": 19800
},
{
"epoch": 4.65,
"grad_norm": 0.6212040781974792,
"learning_rate": 3.5164319248826296e-06,
"loss": 0.0421,
"step": 19810
},
{
"epoch": 4.65,
"grad_norm": 5.314796447753906,
"learning_rate": 3.492957746478873e-06,
"loss": 0.0372,
"step": 19820
},
{
"epoch": 4.65,
"grad_norm": 0.4144018292427063,
"learning_rate": 3.469483568075117e-06,
"loss": 0.025,
"step": 19830
},
{
"epoch": 4.66,
"grad_norm": 3.012010097503662,
"learning_rate": 3.446009389671362e-06,
"loss": 0.0127,
"step": 19840
},
{
"epoch": 4.66,
"grad_norm": 1.6481027603149414,
"learning_rate": 3.422535211267606e-06,
"loss": 0.0379,
"step": 19850
},
{
"epoch": 4.66,
"grad_norm": 1.0246399641036987,
"learning_rate": 3.39906103286385e-06,
"loss": 0.0243,
"step": 19860
},
{
"epoch": 4.66,
"grad_norm": 2.079977512359619,
"learning_rate": 3.375586854460094e-06,
"loss": 0.0382,
"step": 19870
},
{
"epoch": 4.67,
"grad_norm": 2.253732204437256,
"learning_rate": 3.352112676056338e-06,
"loss": 0.0308,
"step": 19880
},
{
"epoch": 4.67,
"grad_norm": 2.0721428394317627,
"learning_rate": 3.328638497652582e-06,
"loss": 0.018,
"step": 19890
},
{
"epoch": 4.67,
"grad_norm": 1.23079252243042,
"learning_rate": 3.305164319248826e-06,
"loss": 0.0317,
"step": 19900
},
{
"epoch": 4.67,
"grad_norm": 1.093954086303711,
"learning_rate": 3.281690140845071e-06,
"loss": 0.0226,
"step": 19910
},
{
"epoch": 4.68,
"grad_norm": 1.5708380937576294,
"learning_rate": 3.258215962441315e-06,
"loss": 0.03,
"step": 19920
},
{
"epoch": 4.68,
"grad_norm": 0.7118130922317505,
"learning_rate": 3.234741784037559e-06,
"loss": 0.02,
"step": 19930
},
{
"epoch": 4.68,
"grad_norm": 3.4391887187957764,
"learning_rate": 3.211267605633803e-06,
"loss": 0.0383,
"step": 19940
},
{
"epoch": 4.68,
"grad_norm": 0.6392399072647095,
"learning_rate": 3.187793427230047e-06,
"loss": 0.0525,
"step": 19950
},
{
"epoch": 4.69,
"grad_norm": 2.6325254440307617,
"learning_rate": 3.164319248826291e-06,
"loss": 0.0307,
"step": 19960
},
{
"epoch": 4.69,
"grad_norm": 1.9415335655212402,
"learning_rate": 3.140845070422535e-06,
"loss": 0.0145,
"step": 19970
},
{
"epoch": 4.69,
"grad_norm": 0.5345707535743713,
"learning_rate": 3.1173708920187794e-06,
"loss": 0.0371,
"step": 19980
},
{
"epoch": 4.69,
"grad_norm": 0.3850795030593872,
"learning_rate": 3.093896713615024e-06,
"loss": 0.011,
"step": 19990
},
{
"epoch": 4.69,
"grad_norm": 3.576430082321167,
"learning_rate": 3.0704225352112678e-06,
"loss": 0.032,
"step": 20000
},
{
"epoch": 4.69,
"eval_loss": 0.0599445179104805,
"eval_macro/f1": 0.9280226732438912,
"eval_macro/precision": 0.929679536317452,
"eval_macro/recall": 0.9267250235346973,
"eval_micro/f1": 0.9290442059055524,
"eval_micro/precision": 0.9306719802116257,
"eval_micro/recall": 0.9274221157137966,
"eval_runtime": 29.022,
"eval_samples/accuracy": 0.9212598425196851,
"eval_samples_per_second": 503.24,
"eval_steps_per_second": 15.747,
"step": 20000
},
{
"epoch": 4.7,
"grad_norm": 0.2423238307237625,
"learning_rate": 3.0469483568075117e-06,
"loss": 0.0142,
"step": 20010
},
{
"epoch": 4.7,
"grad_norm": 1.0201228857040405,
"learning_rate": 3.023474178403756e-06,
"loss": 0.0293,
"step": 20020
},
{
"epoch": 4.7,
"grad_norm": 0.4529697895050049,
"learning_rate": 3e-06,
"loss": 0.0359,
"step": 20030
},
{
"epoch": 4.7,
"grad_norm": 0.8453211784362793,
"learning_rate": 2.9765258215962445e-06,
"loss": 0.032,
"step": 20040
},
{
"epoch": 4.71,
"grad_norm": 0.892306923866272,
"learning_rate": 2.9530516431924884e-06,
"loss": 0.0281,
"step": 20050
},
{
"epoch": 4.71,
"grad_norm": 1.364424467086792,
"learning_rate": 2.9295774647887324e-06,
"loss": 0.0364,
"step": 20060
},
{
"epoch": 4.71,
"grad_norm": 0.5051470398902893,
"learning_rate": 2.9061032863849768e-06,
"loss": 0.0311,
"step": 20070
},
{
"epoch": 4.71,
"grad_norm": 2.131049156188965,
"learning_rate": 2.8826291079812207e-06,
"loss": 0.0251,
"step": 20080
},
{
"epoch": 4.72,
"grad_norm": 0.45606279373168945,
"learning_rate": 2.859154929577465e-06,
"loss": 0.0228,
"step": 20090
},
{
"epoch": 4.72,
"grad_norm": 3.494180202484131,
"learning_rate": 2.835680751173709e-06,
"loss": 0.0205,
"step": 20100
},
{
"epoch": 4.72,
"grad_norm": 0.07420266419649124,
"learning_rate": 2.812206572769953e-06,
"loss": 0.0189,
"step": 20110
},
{
"epoch": 4.72,
"grad_norm": 1.7509255409240723,
"learning_rate": 2.7887323943661974e-06,
"loss": 0.0237,
"step": 20120
},
{
"epoch": 4.73,
"grad_norm": 3.2737362384796143,
"learning_rate": 2.7652582159624414e-06,
"loss": 0.0323,
"step": 20130
},
{
"epoch": 4.73,
"grad_norm": 1.4535030126571655,
"learning_rate": 2.7417840375586857e-06,
"loss": 0.025,
"step": 20140
},
{
"epoch": 4.73,
"grad_norm": 0.9434374570846558,
"learning_rate": 2.7183098591549297e-06,
"loss": 0.0309,
"step": 20150
},
{
"epoch": 4.73,
"grad_norm": 0.07359094172716141,
"learning_rate": 2.694835680751174e-06,
"loss": 0.0291,
"step": 20160
},
{
"epoch": 4.73,
"grad_norm": 0.20729191601276398,
"learning_rate": 2.671361502347418e-06,
"loss": 0.012,
"step": 20170
},
{
"epoch": 4.74,
"grad_norm": 0.7258203625679016,
"learning_rate": 2.647887323943662e-06,
"loss": 0.0233,
"step": 20180
},
{
"epoch": 4.74,
"grad_norm": 0.2592746913433075,
"learning_rate": 2.6244131455399064e-06,
"loss": 0.032,
"step": 20190
},
{
"epoch": 4.74,
"grad_norm": 1.7571505308151245,
"learning_rate": 2.6009389671361503e-06,
"loss": 0.0433,
"step": 20200
},
{
"epoch": 4.74,
"grad_norm": 0.1218971237540245,
"learning_rate": 2.5774647887323947e-06,
"loss": 0.0263,
"step": 20210
},
{
"epoch": 4.75,
"grad_norm": 1.3359134197235107,
"learning_rate": 2.5539906103286387e-06,
"loss": 0.0245,
"step": 20220
},
{
"epoch": 4.75,
"grad_norm": 0.33871036767959595,
"learning_rate": 2.5305164319248826e-06,
"loss": 0.0397,
"step": 20230
},
{
"epoch": 4.75,
"grad_norm": 2.2899630069732666,
"learning_rate": 2.5070422535211266e-06,
"loss": 0.0327,
"step": 20240
},
{
"epoch": 4.75,
"grad_norm": 0.3078570067882538,
"learning_rate": 2.483568075117371e-06,
"loss": 0.0254,
"step": 20250
},
{
"epoch": 4.76,
"grad_norm": 0.25189924240112305,
"learning_rate": 2.4600938967136154e-06,
"loss": 0.0242,
"step": 20260
},
{
"epoch": 4.76,
"grad_norm": 0.4151531755924225,
"learning_rate": 2.4366197183098593e-06,
"loss": 0.0303,
"step": 20270
},
{
"epoch": 4.76,
"grad_norm": 0.4815005958080292,
"learning_rate": 2.4131455399061037e-06,
"loss": 0.0203,
"step": 20280
},
{
"epoch": 4.76,
"grad_norm": 3.175389289855957,
"learning_rate": 2.3896713615023472e-06,
"loss": 0.0359,
"step": 20290
},
{
"epoch": 4.77,
"grad_norm": 0.38591518998146057,
"learning_rate": 2.3661971830985916e-06,
"loss": 0.027,
"step": 20300
},
{
"epoch": 4.77,
"grad_norm": 3.8292973041534424,
"learning_rate": 2.3427230046948356e-06,
"loss": 0.0432,
"step": 20310
},
{
"epoch": 4.77,
"grad_norm": 0.09365264326334,
"learning_rate": 2.31924882629108e-06,
"loss": 0.0236,
"step": 20320
},
{
"epoch": 4.77,
"grad_norm": 0.9030979871749878,
"learning_rate": 2.2957746478873243e-06,
"loss": 0.0224,
"step": 20330
},
{
"epoch": 4.77,
"grad_norm": 5.00977087020874,
"learning_rate": 2.2723004694835683e-06,
"loss": 0.0375,
"step": 20340
},
{
"epoch": 4.78,
"grad_norm": 0.6654548048973083,
"learning_rate": 2.2488262910798123e-06,
"loss": 0.0454,
"step": 20350
},
{
"epoch": 4.78,
"grad_norm": 0.40298229455947876,
"learning_rate": 2.2253521126760562e-06,
"loss": 0.0323,
"step": 20360
},
{
"epoch": 4.78,
"grad_norm": 0.07516142725944519,
"learning_rate": 2.2018779342723006e-06,
"loss": 0.0265,
"step": 20370
},
{
"epoch": 4.78,
"grad_norm": 7.442012786865234,
"learning_rate": 2.1784037558685446e-06,
"loss": 0.0236,
"step": 20380
},
{
"epoch": 4.79,
"grad_norm": 0.4781692624092102,
"learning_rate": 2.154929577464789e-06,
"loss": 0.0392,
"step": 20390
},
{
"epoch": 4.79,
"grad_norm": 0.02968725562095642,
"learning_rate": 2.131455399061033e-06,
"loss": 0.0345,
"step": 20400
},
{
"epoch": 4.79,
"grad_norm": 2.6985020637512207,
"learning_rate": 2.107981220657277e-06,
"loss": 0.0171,
"step": 20410
},
{
"epoch": 4.79,
"grad_norm": 0.02516743168234825,
"learning_rate": 2.0845070422535212e-06,
"loss": 0.0232,
"step": 20420
},
{
"epoch": 4.8,
"grad_norm": 2.1955928802490234,
"learning_rate": 2.061032863849765e-06,
"loss": 0.0251,
"step": 20430
},
{
"epoch": 4.8,
"grad_norm": 2.6991193294525146,
"learning_rate": 2.0375586854460096e-06,
"loss": 0.024,
"step": 20440
},
{
"epoch": 4.8,
"grad_norm": 1.7629855871200562,
"learning_rate": 2.014084507042254e-06,
"loss": 0.0395,
"step": 20450
},
{
"epoch": 4.8,
"grad_norm": 0.22058716416358948,
"learning_rate": 1.990610328638498e-06,
"loss": 0.0065,
"step": 20460
},
{
"epoch": 4.81,
"grad_norm": 1.8587628602981567,
"learning_rate": 1.967136150234742e-06,
"loss": 0.0259,
"step": 20470
},
{
"epoch": 4.81,
"grad_norm": 0.107975073158741,
"learning_rate": 1.943661971830986e-06,
"loss": 0.0279,
"step": 20480
},
{
"epoch": 4.81,
"grad_norm": 0.9777230620384216,
"learning_rate": 1.9201877934272302e-06,
"loss": 0.0225,
"step": 20490
},
{
"epoch": 4.81,
"grad_norm": 0.5380791425704956,
"learning_rate": 1.8967136150234742e-06,
"loss": 0.0399,
"step": 20500
},
{
"epoch": 4.81,
"eval_loss": 0.05661395192146301,
"eval_macro/f1": 0.9310132935790502,
"eval_macro/precision": 0.931598994557163,
"eval_macro/recall": 0.9305680939976558,
"eval_micro/f1": 0.932027531418005,
"eval_micro/precision": 0.9322509932867517,
"eval_micro/recall": 0.9318041766518316,
"eval_runtime": 29.363,
"eval_samples/accuracy": 0.9252995549469359,
"eval_samples_per_second": 497.394,
"eval_steps_per_second": 15.564,
"step": 20500
},
{
"epoch": 4.81,
"grad_norm": 2.409182071685791,
"learning_rate": 1.8732394366197183e-06,
"loss": 0.0139,
"step": 20510
},
{
"epoch": 4.82,
"grad_norm": 0.6202594041824341,
"learning_rate": 1.8497652582159627e-06,
"loss": 0.0259,
"step": 20520
},
{
"epoch": 4.82,
"grad_norm": 1.8184819221496582,
"learning_rate": 1.8262910798122067e-06,
"loss": 0.0322,
"step": 20530
},
{
"epoch": 4.82,
"grad_norm": 0.19951772689819336,
"learning_rate": 1.8028169014084509e-06,
"loss": 0.0269,
"step": 20540
},
{
"epoch": 4.82,
"grad_norm": 1.1214886903762817,
"learning_rate": 1.7793427230046948e-06,
"loss": 0.0308,
"step": 20550
},
{
"epoch": 4.83,
"grad_norm": 1.819677472114563,
"learning_rate": 1.7558685446009392e-06,
"loss": 0.033,
"step": 20560
},
{
"epoch": 4.83,
"grad_norm": 0.6370648145675659,
"learning_rate": 1.732394366197183e-06,
"loss": 0.0211,
"step": 20570
},
{
"epoch": 4.83,
"grad_norm": 0.18035587668418884,
"learning_rate": 1.7089201877934273e-06,
"loss": 0.0328,
"step": 20580
},
{
"epoch": 4.83,
"grad_norm": 1.0719821453094482,
"learning_rate": 1.6854460093896715e-06,
"loss": 0.0298,
"step": 20590
},
{
"epoch": 4.84,
"grad_norm": 0.222098246216774,
"learning_rate": 1.6619718309859155e-06,
"loss": 0.0465,
"step": 20600
},
{
"epoch": 4.84,
"grad_norm": 4.420154094696045,
"learning_rate": 1.6384976525821598e-06,
"loss": 0.0252,
"step": 20610
},
{
"epoch": 4.84,
"grad_norm": 1.6321355104446411,
"learning_rate": 1.6150234741784038e-06,
"loss": 0.0306,
"step": 20620
},
{
"epoch": 4.84,
"grad_norm": 0.06074954941868782,
"learning_rate": 1.591549295774648e-06,
"loss": 0.0375,
"step": 20630
},
{
"epoch": 4.85,
"grad_norm": 4.066643238067627,
"learning_rate": 1.568075117370892e-06,
"loss": 0.0352,
"step": 20640
},
{
"epoch": 4.85,
"grad_norm": 0.1419898420572281,
"learning_rate": 1.5446009389671363e-06,
"loss": 0.0258,
"step": 20650
},
{
"epoch": 4.85,
"grad_norm": 0.1670309603214264,
"learning_rate": 1.5211267605633803e-06,
"loss": 0.0137,
"step": 20660
},
{
"epoch": 4.85,
"grad_norm": 0.8675075173377991,
"learning_rate": 1.4976525821596244e-06,
"loss": 0.022,
"step": 20670
},
{
"epoch": 4.85,
"grad_norm": 0.5059126615524292,
"learning_rate": 1.4741784037558686e-06,
"loss": 0.045,
"step": 20680
},
{
"epoch": 4.86,
"grad_norm": 1.8234846591949463,
"learning_rate": 1.4507042253521128e-06,
"loss": 0.0285,
"step": 20690
},
{
"epoch": 4.86,
"grad_norm": 0.09939184039831161,
"learning_rate": 1.427230046948357e-06,
"loss": 0.0313,
"step": 20700
},
{
"epoch": 4.86,
"grad_norm": 1.7367419004440308,
"learning_rate": 1.4061032863849765e-06,
"loss": 0.0307,
"step": 20710
},
{
"epoch": 4.86,
"grad_norm": 0.7291856408119202,
"learning_rate": 1.3826291079812207e-06,
"loss": 0.025,
"step": 20720
},
{
"epoch": 4.87,
"grad_norm": 0.6269216537475586,
"learning_rate": 1.3591549295774648e-06,
"loss": 0.0377,
"step": 20730
},
{
"epoch": 4.87,
"grad_norm": 0.03957618027925491,
"learning_rate": 1.335680751173709e-06,
"loss": 0.0235,
"step": 20740
},
{
"epoch": 4.87,
"grad_norm": 0.7715898752212524,
"learning_rate": 1.3122065727699532e-06,
"loss": 0.0339,
"step": 20750
},
{
"epoch": 4.87,
"grad_norm": 0.8872296810150146,
"learning_rate": 1.2887323943661974e-06,
"loss": 0.0167,
"step": 20760
},
{
"epoch": 4.88,
"grad_norm": 0.6660736799240112,
"learning_rate": 1.2652582159624413e-06,
"loss": 0.0384,
"step": 20770
},
{
"epoch": 4.88,
"grad_norm": 1.6280009746551514,
"learning_rate": 1.2417840375586855e-06,
"loss": 0.0247,
"step": 20780
},
{
"epoch": 4.88,
"grad_norm": 0.618594229221344,
"learning_rate": 1.2183098591549297e-06,
"loss": 0.0327,
"step": 20790
},
{
"epoch": 4.88,
"grad_norm": 1.0769779682159424,
"learning_rate": 1.1948356807511736e-06,
"loss": 0.024,
"step": 20800
},
{
"epoch": 4.88,
"grad_norm": 1.8020055294036865,
"learning_rate": 1.1713615023474178e-06,
"loss": 0.0316,
"step": 20810
},
{
"epoch": 4.89,
"grad_norm": 0.07099230587482452,
"learning_rate": 1.1478873239436622e-06,
"loss": 0.0189,
"step": 20820
},
{
"epoch": 4.89,
"grad_norm": 0.5301911234855652,
"learning_rate": 1.1244131455399061e-06,
"loss": 0.0248,
"step": 20830
},
{
"epoch": 4.89,
"grad_norm": 2.444852113723755,
"learning_rate": 1.1009389671361503e-06,
"loss": 0.0317,
"step": 20840
},
{
"epoch": 4.89,
"grad_norm": 1.6938620805740356,
"learning_rate": 1.0774647887323945e-06,
"loss": 0.0216,
"step": 20850
},
{
"epoch": 4.9,
"grad_norm": 1.9859988689422607,
"learning_rate": 1.0539906103286384e-06,
"loss": 0.0323,
"step": 20860
},
{
"epoch": 4.9,
"grad_norm": 0.8528733849525452,
"learning_rate": 1.0305164319248826e-06,
"loss": 0.0214,
"step": 20870
},
{
"epoch": 4.9,
"grad_norm": 8.605490684509277,
"learning_rate": 1.007042253521127e-06,
"loss": 0.017,
"step": 20880
},
{
"epoch": 4.9,
"grad_norm": 0.17768090963363647,
"learning_rate": 9.83568075117371e-07,
"loss": 0.0289,
"step": 20890
},
{
"epoch": 4.91,
"grad_norm": 0.3029472529888153,
"learning_rate": 9.600938967136151e-07,
"loss": 0.0165,
"step": 20900
},
{
"epoch": 4.91,
"grad_norm": 4.979481220245361,
"learning_rate": 9.366197183098592e-07,
"loss": 0.0133,
"step": 20910
},
{
"epoch": 4.91,
"grad_norm": 0.1385805457830429,
"learning_rate": 9.131455399061033e-07,
"loss": 0.0207,
"step": 20920
},
{
"epoch": 4.91,
"grad_norm": 2.0309174060821533,
"learning_rate": 8.896713615023474e-07,
"loss": 0.0274,
"step": 20930
},
{
"epoch": 4.92,
"grad_norm": 0.3579356372356415,
"learning_rate": 8.661971830985915e-07,
"loss": 0.03,
"step": 20940
},
{
"epoch": 4.92,
"grad_norm": 1.9798082113265991,
"learning_rate": 8.427230046948357e-07,
"loss": 0.0183,
"step": 20950
},
{
"epoch": 4.92,
"grad_norm": 3.616589069366455,
"learning_rate": 8.192488262910799e-07,
"loss": 0.029,
"step": 20960
},
{
"epoch": 4.92,
"grad_norm": 0.07770455628633499,
"learning_rate": 7.95774647887324e-07,
"loss": 0.016,
"step": 20970
},
{
"epoch": 4.92,
"grad_norm": 0.17096202075481415,
"learning_rate": 7.723004694835682e-07,
"loss": 0.0204,
"step": 20980
},
{
"epoch": 4.93,
"grad_norm": 0.9039891362190247,
"learning_rate": 7.488262910798122e-07,
"loss": 0.0286,
"step": 20990
},
{
"epoch": 4.93,
"grad_norm": 0.06406420469284058,
"learning_rate": 7.253521126760564e-07,
"loss": 0.0261,
"step": 21000
},
{
"epoch": 4.93,
"eval_loss": 0.057017967104911804,
"eval_macro/f1": 0.9299257465348796,
"eval_macro/precision": 0.9302157930278783,
"eval_macro/recall": 0.9298030271698045,
"eval_micro/f1": 0.9309872655073258,
"eval_micro/precision": 0.9309235298144725,
"eval_micro/recall": 0.9310510099281069,
"eval_runtime": 28.8502,
"eval_samples/accuracy": 0.9243409791167408,
"eval_samples_per_second": 506.236,
"eval_steps_per_second": 15.84,
"step": 21000
}
],
"logging_steps": 10,
"max_steps": 21300,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 2.2102454595032064e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}