{
"best_global_step": 1800,
"best_metric": 0.9762746087834427,
"best_model_checkpoint": "/workspace/AI/Trend_Primus-FineWeb_Filtering-pipeline/securebert_finetuned/defensive_vs_rest/checkpoint-1800",
"epoch": 3.8181818181818183,
"eval_steps": 300,
"global_step": 2100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01818181818181818,
"grad_norm": 0.4759802222251892,
"learning_rate": 5.714285714285715e-07,
"loss": 0.1542,
"step": 10
},
{
"epoch": 0.03636363636363636,
"grad_norm": 0.3740444481372833,
"learning_rate": 1.142857142857143e-06,
"loss": 0.1623,
"step": 20
},
{
"epoch": 0.05454545454545454,
"grad_norm": 0.5404930710792542,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.1582,
"step": 30
},
{
"epoch": 0.07272727272727272,
"grad_norm": 0.3705693781375885,
"learning_rate": 2.285714285714286e-06,
"loss": 0.1422,
"step": 40
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.4376075565814972,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.1403,
"step": 50
},
{
"epoch": 0.10909090909090909,
"grad_norm": 0.46200892329216003,
"learning_rate": 3.428571428571429e-06,
"loss": 0.142,
"step": 60
},
{
"epoch": 0.12727272727272726,
"grad_norm": 0.2855919599533081,
"learning_rate": 4.000000000000001e-06,
"loss": 0.144,
"step": 70
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.33721357583999634,
"learning_rate": 4.571428571428572e-06,
"loss": 0.1401,
"step": 80
},
{
"epoch": 0.16363636363636364,
"grad_norm": 0.6751205921173096,
"learning_rate": 5.142857142857142e-06,
"loss": 0.132,
"step": 90
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.1521556377410889,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.1277,
"step": 100
},
{
"epoch": 0.2,
"grad_norm": 0.6443154811859131,
"learning_rate": 6.285714285714286e-06,
"loss": 0.1079,
"step": 110
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.6698077321052551,
"learning_rate": 6.857142857142858e-06,
"loss": 0.1066,
"step": 120
},
{
"epoch": 0.23636363636363636,
"grad_norm": 0.8653299808502197,
"learning_rate": 7.428571428571429e-06,
"loss": 0.0943,
"step": 130
},
{
"epoch": 0.2545454545454545,
"grad_norm": 1.1476327180862427,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0924,
"step": 140
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.5096330642700195,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0844,
"step": 150
},
{
"epoch": 0.2909090909090909,
"grad_norm": 1.1907318830490112,
"learning_rate": 9.142857142857144e-06,
"loss": 0.0813,
"step": 160
},
{
"epoch": 0.3090909090909091,
"grad_norm": 1.299401879310608,
"learning_rate": 9.714285714285715e-06,
"loss": 0.0749,
"step": 170
},
{
"epoch": 0.32727272727272727,
"grad_norm": 0.8548530340194702,
"learning_rate": 1.0285714285714285e-05,
"loss": 0.0661,
"step": 180
},
{
"epoch": 0.34545454545454546,
"grad_norm": 0.8947266936302185,
"learning_rate": 1.0857142857142858e-05,
"loss": 0.0638,
"step": 190
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.172971725463867,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.0624,
"step": 200
},
{
"epoch": 0.38181818181818183,
"grad_norm": 4.725502967834473,
"learning_rate": 1.2e-05,
"loss": 0.0504,
"step": 210
},
{
"epoch": 0.4,
"grad_norm": 1.3992273807525635,
"learning_rate": 1.2571428571428572e-05,
"loss": 0.0417,
"step": 220
},
{
"epoch": 0.41818181818181815,
"grad_norm": 0.9457690119743347,
"learning_rate": 1.3142857142857145e-05,
"loss": 0.0377,
"step": 230
},
{
"epoch": 0.43636363636363634,
"grad_norm": 1.1091430187225342,
"learning_rate": 1.3714285714285716e-05,
"loss": 0.0323,
"step": 240
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.7338578701019287,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.0269,
"step": 250
},
{
"epoch": 0.4727272727272727,
"grad_norm": 1.0748590230941772,
"learning_rate": 1.4857142857142858e-05,
"loss": 0.0275,
"step": 260
},
{
"epoch": 0.4909090909090909,
"grad_norm": 1.0615975856781006,
"learning_rate": 1.542857142857143e-05,
"loss": 0.0214,
"step": 270
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.8980767130851746,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0224,
"step": 280
},
{
"epoch": 0.5272727272727272,
"grad_norm": 0.8434118628501892,
"learning_rate": 1.6571428571428574e-05,
"loss": 0.0271,
"step": 290
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.7067236304283142,
"learning_rate": 1.7142857142857142e-05,
"loss": 0.0225,
"step": 300
},
{
"epoch": 0.5454545454545454,
"eval_f1": 0.886688162137264,
"eval_f2": 0.9429803076320172,
"eval_loss": 0.01607716828584671,
"eval_precision": 0.8064516129032258,
"eval_recall": 0.9846547314578005,
"eval_runtime": 10.5831,
"eval_samples_per_second": 738.914,
"eval_steps_per_second": 11.622,
"step": 300
},
{
"epoch": 0.5636363636363636,
"grad_norm": 1.111824631690979,
"learning_rate": 1.7714285714285717e-05,
"loss": 0.0186,
"step": 310
},
{
"epoch": 0.5818181818181818,
"grad_norm": 1.0324747562408447,
"learning_rate": 1.8285714285714288e-05,
"loss": 0.0218,
"step": 320
},
{
"epoch": 0.6,
"grad_norm": 1.2407127618789673,
"learning_rate": 1.885714285714286e-05,
"loss": 0.0223,
"step": 330
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.6641681790351868,
"learning_rate": 1.942857142857143e-05,
"loss": 0.022,
"step": 340
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.8924217224121094,
"learning_rate": 2e-05,
"loss": 0.0178,
"step": 350
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.49054834246635437,
"learning_rate": 1.991666666666667e-05,
"loss": 0.0139,
"step": 360
},
{
"epoch": 0.6727272727272727,
"grad_norm": 1.4703093767166138,
"learning_rate": 1.9833333333333335e-05,
"loss": 0.0211,
"step": 370
},
{
"epoch": 0.6909090909090909,
"grad_norm": 2.7741594314575195,
"learning_rate": 1.9750000000000002e-05,
"loss": 0.0122,
"step": 380
},
{
"epoch": 0.7090909090909091,
"grad_norm": 1.8091500997543335,
"learning_rate": 1.9666666666666666e-05,
"loss": 0.0191,
"step": 390
},
{
"epoch": 0.7272727272727273,
"grad_norm": 3.808887243270874,
"learning_rate": 1.9583333333333333e-05,
"loss": 0.0168,
"step": 400
},
{
"epoch": 0.7454545454545455,
"grad_norm": 0.5149283409118652,
"learning_rate": 1.95e-05,
"loss": 0.0145,
"step": 410
},
{
"epoch": 0.7636363636363637,
"grad_norm": 0.45617809891700745,
"learning_rate": 1.9416666666666667e-05,
"loss": 0.0088,
"step": 420
},
{
"epoch": 0.7818181818181819,
"grad_norm": 1.402259111404419,
"learning_rate": 1.9333333333333333e-05,
"loss": 0.0126,
"step": 430
},
{
"epoch": 0.8,
"grad_norm": 1.5392917394638062,
"learning_rate": 1.925e-05,
"loss": 0.0138,
"step": 440
},
{
"epoch": 0.8181818181818182,
"grad_norm": 2.108272075653076,
"learning_rate": 1.916666666666667e-05,
"loss": 0.0165,
"step": 450
},
{
"epoch": 0.8363636363636363,
"grad_norm": 0.6225730776786804,
"learning_rate": 1.9083333333333338e-05,
"loss": 0.012,
"step": 460
},
{
"epoch": 0.8545454545454545,
"grad_norm": 1.8889803886413574,
"learning_rate": 1.9e-05,
"loss": 0.0097,
"step": 470
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.1781783550977707,
"learning_rate": 1.8916666666666668e-05,
"loss": 0.012,
"step": 480
},
{
"epoch": 0.8909090909090909,
"grad_norm": 0.7276476621627808,
"learning_rate": 1.8833333333333335e-05,
"loss": 0.0151,
"step": 490
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.302780032157898,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.0124,
"step": 500
},
{
"epoch": 0.9272727272727272,
"grad_norm": 0.8764067888259888,
"learning_rate": 1.866666666666667e-05,
"loss": 0.0127,
"step": 510
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.3675801157951355,
"learning_rate": 1.8583333333333336e-05,
"loss": 0.011,
"step": 520
},
{
"epoch": 0.9636363636363636,
"grad_norm": 0.413601279258728,
"learning_rate": 1.8500000000000002e-05,
"loss": 0.0119,
"step": 530
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.3606299161911011,
"learning_rate": 1.8416666666666666e-05,
"loss": 0.0096,
"step": 540
},
{
"epoch": 1.0,
"grad_norm": 0.6626041531562805,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.0091,
"step": 550
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.34816664457321167,
"learning_rate": 1.825e-05,
"loss": 0.0064,
"step": 560
},
{
"epoch": 1.0363636363636364,
"grad_norm": 0.7120109796524048,
"learning_rate": 1.8166666666666667e-05,
"loss": 0.007,
"step": 570
},
{
"epoch": 1.0545454545454545,
"grad_norm": 0.34991776943206787,
"learning_rate": 1.8083333333333334e-05,
"loss": 0.0156,
"step": 580
},
{
"epoch": 1.0727272727272728,
"grad_norm": 0.4325370788574219,
"learning_rate": 1.8e-05,
"loss": 0.0116,
"step": 590
},
{
"epoch": 1.0909090909090908,
"grad_norm": 1.3302485942840576,
"learning_rate": 1.7916666666666667e-05,
"loss": 0.007,
"step": 600
},
{
"epoch": 1.0909090909090908,
"eval_f1": 0.9619118745332338,
"eval_f2": 0.9775349119611415,
"eval_loss": 0.009453566744923592,
"eval_precision": 0.9369544131910766,
"eval_recall": 0.9882352941176471,
"eval_runtime": 10.7346,
"eval_samples_per_second": 728.488,
"eval_steps_per_second": 11.458,
"step": 600
},
{
"epoch": 1.1090909090909091,
"grad_norm": 0.5350901484489441,
"learning_rate": 1.7833333333333334e-05,
"loss": 0.0038,
"step": 610
},
{
"epoch": 1.1272727272727272,
"grad_norm": 0.19123613834381104,
"learning_rate": 1.775e-05,
"loss": 0.0075,
"step": 620
},
{
"epoch": 1.1454545454545455,
"grad_norm": 0.2627851963043213,
"learning_rate": 1.7666666666666668e-05,
"loss": 0.0069,
"step": 630
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.49250972270965576,
"learning_rate": 1.7583333333333335e-05,
"loss": 0.009,
"step": 640
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.2556400299072266,
"learning_rate": 1.7500000000000002e-05,
"loss": 0.0106,
"step": 650
},
{
"epoch": 1.2,
"grad_norm": 0.23302438855171204,
"learning_rate": 1.741666666666667e-05,
"loss": 0.006,
"step": 660
},
{
"epoch": 1.2181818181818183,
"grad_norm": 0.22926795482635498,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.0053,
"step": 670
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.41634848713874817,
"learning_rate": 1.7250000000000003e-05,
"loss": 0.0096,
"step": 680
},
{
"epoch": 1.2545454545454544,
"grad_norm": 0.7806673049926758,
"learning_rate": 1.7166666666666666e-05,
"loss": 0.0077,
"step": 690
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.6627803444862366,
"learning_rate": 1.7083333333333333e-05,
"loss": 0.008,
"step": 700
},
{
"epoch": 1.290909090909091,
"grad_norm": 0.33546727895736694,
"learning_rate": 1.7e-05,
"loss": 0.0074,
"step": 710
},
{
"epoch": 1.309090909090909,
"grad_norm": 1.327726125717163,
"learning_rate": 1.6916666666666667e-05,
"loss": 0.0044,
"step": 720
},
{
"epoch": 1.3272727272727272,
"grad_norm": 0.4449763894081116,
"learning_rate": 1.6833333333333334e-05,
"loss": 0.0046,
"step": 730
},
{
"epoch": 1.3454545454545455,
"grad_norm": 0.2766354978084564,
"learning_rate": 1.675e-05,
"loss": 0.0034,
"step": 740
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.830558180809021,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0074,
"step": 750
},
{
"epoch": 1.3818181818181818,
"grad_norm": 1.0488086938858032,
"learning_rate": 1.659166666666667e-05,
"loss": 0.0065,
"step": 760
},
{
"epoch": 1.4,
"grad_norm": 0.5093031525611877,
"learning_rate": 1.6508333333333336e-05,
"loss": 0.0053,
"step": 770
},
{
"epoch": 1.4181818181818182,
"grad_norm": 0.3070843517780304,
"learning_rate": 1.6425000000000003e-05,
"loss": 0.0045,
"step": 780
},
{
"epoch": 1.4363636363636363,
"grad_norm": 1.093131422996521,
"learning_rate": 1.634166666666667e-05,
"loss": 0.0058,
"step": 790
},
{
"epoch": 1.4545454545454546,
"grad_norm": 2.2531373500823975,
"learning_rate": 1.6258333333333333e-05,
"loss": 0.0065,
"step": 800
},
{
"epoch": 1.4727272727272727,
"grad_norm": 0.27250564098358154,
"learning_rate": 1.6175e-05,
"loss": 0.005,
"step": 810
},
{
"epoch": 1.490909090909091,
"grad_norm": 2.3462181091308594,
"learning_rate": 1.6091666666666667e-05,
"loss": 0.0077,
"step": 820
},
{
"epoch": 1.509090909090909,
"grad_norm": 0.5783445835113525,
"learning_rate": 1.6008333333333334e-05,
"loss": 0.0065,
"step": 830
},
{
"epoch": 1.5272727272727273,
"grad_norm": 2.6000328063964844,
"learning_rate": 1.5925e-05,
"loss": 0.0073,
"step": 840
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.27279505133628845,
"learning_rate": 1.5841666666666668e-05,
"loss": 0.0054,
"step": 850
},
{
"epoch": 1.5636363636363635,
"grad_norm": 0.5557974576950073,
"learning_rate": 1.5758333333333335e-05,
"loss": 0.0054,
"step": 860
},
{
"epoch": 1.5818181818181818,
"grad_norm": 0.4363366365432739,
"learning_rate": 1.5675e-05,
"loss": 0.0064,
"step": 870
},
{
"epoch": 1.6,
"grad_norm": 0.055520545691251755,
"learning_rate": 1.559166666666667e-05,
"loss": 0.0036,
"step": 880
},
{
"epoch": 1.6181818181818182,
"grad_norm": 0.31246721744537354,
"learning_rate": 1.5508333333333335e-05,
"loss": 0.0035,
"step": 890
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.3083712160587311,
"learning_rate": 1.5425000000000002e-05,
"loss": 0.0043,
"step": 900
},
{
"epoch": 1.6363636363636362,
"eval_f1": 0.9619188921859545,
"eval_f2": 0.9814310223029569,
"eval_loss": 0.007550612557679415,
"eval_precision": 0.9310674964097654,
"eval_recall": 0.9948849104859335,
"eval_runtime": 11.1497,
"eval_samples_per_second": 701.366,
"eval_steps_per_second": 11.032,
"step": 900
},
{
"epoch": 1.6545454545454545,
"grad_norm": 1.0350213050842285,
"learning_rate": 1.534166666666667e-05,
"loss": 0.0062,
"step": 910
},
{
"epoch": 1.6727272727272728,
"grad_norm": 1.7585707902908325,
"learning_rate": 1.5258333333333334e-05,
"loss": 0.006,
"step": 920
},
{
"epoch": 1.690909090909091,
"grad_norm": 0.35897427797317505,
"learning_rate": 1.5175000000000001e-05,
"loss": 0.0041,
"step": 930
},
{
"epoch": 1.709090909090909,
"grad_norm": 0.2970544993877411,
"learning_rate": 1.5091666666666668e-05,
"loss": 0.0058,
"step": 940
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.10311456769704819,
"learning_rate": 1.5008333333333333e-05,
"loss": 0.0057,
"step": 950
},
{
"epoch": 1.7454545454545456,
"grad_norm": 1.624154806137085,
"learning_rate": 1.4925e-05,
"loss": 0.0088,
"step": 960
},
{
"epoch": 1.7636363636363637,
"grad_norm": 0.22448480129241943,
"learning_rate": 1.4841666666666667e-05,
"loss": 0.0038,
"step": 970
},
{
"epoch": 1.7818181818181817,
"grad_norm": 0.9474364519119263,
"learning_rate": 1.4758333333333334e-05,
"loss": 0.0044,
"step": 980
},
{
"epoch": 1.8,
"grad_norm": 0.05209196358919144,
"learning_rate": 1.4675000000000001e-05,
"loss": 0.0064,
"step": 990
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.3231663405895233,
"learning_rate": 1.4591666666666668e-05,
"loss": 0.005,
"step": 1000
},
{
"epoch": 1.8363636363636364,
"grad_norm": 0.4507773220539093,
"learning_rate": 1.4508333333333335e-05,
"loss": 0.0047,
"step": 1010
},
{
"epoch": 1.8545454545454545,
"grad_norm": 0.28643473982810974,
"learning_rate": 1.4425e-05,
"loss": 0.006,
"step": 1020
},
{
"epoch": 1.8727272727272726,
"grad_norm": 0.30528539419174194,
"learning_rate": 1.4341666666666667e-05,
"loss": 0.0035,
"step": 1030
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.7955114245414734,
"learning_rate": 1.4258333333333334e-05,
"loss": 0.0049,
"step": 1040
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.6773081421852112,
"learning_rate": 1.4175e-05,
"loss": 0.0058,
"step": 1050
},
{
"epoch": 1.9272727272727272,
"grad_norm": 0.5149025917053223,
"learning_rate": 1.4091666666666668e-05,
"loss": 0.0036,
"step": 1060
},
{
"epoch": 1.9454545454545453,
"grad_norm": 0.5025485754013062,
"learning_rate": 1.4008333333333334e-05,
"loss": 0.0073,
"step": 1070
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.6183115839958191,
"learning_rate": 1.3925000000000001e-05,
"loss": 0.0038,
"step": 1080
},
{
"epoch": 1.981818181818182,
"grad_norm": 0.33286458253860474,
"learning_rate": 1.3841666666666668e-05,
"loss": 0.0047,
"step": 1090
},
{
"epoch": 2.0,
"grad_norm": 0.766334056854248,
"learning_rate": 1.3758333333333333e-05,
"loss": 0.0083,
"step": 1100
},
{
"epoch": 2.018181818181818,
"grad_norm": 0.23794743418693542,
"learning_rate": 1.3675e-05,
"loss": 0.0027,
"step": 1110
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.32537227869033813,
"learning_rate": 1.3591666666666667e-05,
"loss": 0.0019,
"step": 1120
},
{
"epoch": 2.0545454545454547,
"grad_norm": 0.3572939932346344,
"learning_rate": 1.3508333333333334e-05,
"loss": 0.0023,
"step": 1130
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.2717416286468506,
"learning_rate": 1.3425000000000001e-05,
"loss": 0.001,
"step": 1140
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.20052389800548553,
"learning_rate": 1.3341666666666668e-05,
"loss": 0.0017,
"step": 1150
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.1603120118379593,
"learning_rate": 1.3258333333333335e-05,
"loss": 0.0031,
"step": 1160
},
{
"epoch": 2.1272727272727274,
"grad_norm": 0.15461675822734833,
"learning_rate": 1.3175e-05,
"loss": 0.0021,
"step": 1170
},
{
"epoch": 2.1454545454545455,
"grad_norm": 0.7481666803359985,
"learning_rate": 1.3091666666666667e-05,
"loss": 0.0033,
"step": 1180
},
{
"epoch": 2.1636363636363636,
"grad_norm": 0.3837297558784485,
"learning_rate": 1.3008333333333334e-05,
"loss": 0.0018,
"step": 1190
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.09467964619398117,
"learning_rate": 1.2925e-05,
"loss": 0.0017,
"step": 1200
},
{
"epoch": 2.1818181818181817,
"eval_f1": 0.9751068644707066,
"eval_f2": 0.9850640113798008,
"eval_loss": 0.009348779916763306,
"eval_precision": 0.9589515331355094,
"eval_recall": 0.9918158567774936,
"eval_runtime": 10.6749,
"eval_samples_per_second": 732.56,
"eval_steps_per_second": 11.522,
"step": 1200
},
{
"epoch": 2.2,
"grad_norm": 0.2888661324977875,
"learning_rate": 1.2841666666666668e-05,
"loss": 0.0017,
"step": 1210
},
{
"epoch": 2.2181818181818183,
"grad_norm": 1.4225064516067505,
"learning_rate": 1.2758333333333335e-05,
"loss": 0.0017,
"step": 1220
},
{
"epoch": 2.2363636363636363,
"grad_norm": 0.5475151538848877,
"learning_rate": 1.2675000000000001e-05,
"loss": 0.0022,
"step": 1230
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.2563498914241791,
"learning_rate": 1.2591666666666668e-05,
"loss": 0.0026,
"step": 1240
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.09335105866193771,
"learning_rate": 1.2508333333333334e-05,
"loss": 0.0013,
"step": 1250
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.08890422433614731,
"learning_rate": 1.2425e-05,
"loss": 0.0015,
"step": 1260
},
{
"epoch": 2.309090909090909,
"grad_norm": 0.0670776441693306,
"learning_rate": 1.2341666666666667e-05,
"loss": 0.0015,
"step": 1270
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.09385448694229126,
"learning_rate": 1.2258333333333334e-05,
"loss": 0.0022,
"step": 1280
},
{
"epoch": 2.3454545454545457,
"grad_norm": 0.31550052762031555,
"learning_rate": 1.2175000000000001e-05,
"loss": 0.0082,
"step": 1290
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.14805318415164948,
"learning_rate": 1.2091666666666668e-05,
"loss": 0.003,
"step": 1300
},
{
"epoch": 2.381818181818182,
"grad_norm": 0.5259885787963867,
"learning_rate": 1.2008333333333335e-05,
"loss": 0.0038,
"step": 1310
},
{
"epoch": 2.4,
"grad_norm": 0.37176281213760376,
"learning_rate": 1.1925e-05,
"loss": 0.0017,
"step": 1320
},
{
"epoch": 2.418181818181818,
"grad_norm": 0.33867013454437256,
"learning_rate": 1.1841666666666667e-05,
"loss": 0.0026,
"step": 1330
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.04352513328194618,
"learning_rate": 1.1758333333333334e-05,
"loss": 0.002,
"step": 1340
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.35843801498413086,
"learning_rate": 1.1675000000000001e-05,
"loss": 0.0028,
"step": 1350
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.5411182045936584,
"learning_rate": 1.1591666666666668e-05,
"loss": 0.0019,
"step": 1360
},
{
"epoch": 2.4909090909090907,
"grad_norm": 0.08427491784095764,
"learning_rate": 1.1508333333333335e-05,
"loss": 0.0017,
"step": 1370
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.27736711502075195,
"learning_rate": 1.1425000000000002e-05,
"loss": 0.0023,
"step": 1380
},
{
"epoch": 2.5272727272727273,
"grad_norm": 0.06100330501794815,
"learning_rate": 1.1341666666666668e-05,
"loss": 0.0017,
"step": 1390
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.07736339420080185,
"learning_rate": 1.1258333333333334e-05,
"loss": 0.0038,
"step": 1400
},
{
"epoch": 2.5636363636363635,
"grad_norm": 0.15126390755176544,
"learning_rate": 1.1175e-05,
"loss": 0.0009,
"step": 1410
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.0258785467594862,
"learning_rate": 1.1091666666666667e-05,
"loss": 0.001,
"step": 1420
},
{
"epoch": 2.6,
"grad_norm": 0.8827760815620422,
"learning_rate": 1.1008333333333334e-05,
"loss": 0.003,
"step": 1430
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.14176161587238312,
"learning_rate": 1.0925000000000001e-05,
"loss": 0.0023,
"step": 1440
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.17485152184963226,
"learning_rate": 1.0841666666666668e-05,
"loss": 0.0023,
"step": 1450
},
{
"epoch": 2.6545454545454543,
"grad_norm": 0.1624346673488617,
"learning_rate": 1.0758333333333335e-05,
"loss": 0.0016,
"step": 1460
},
{
"epoch": 2.672727272727273,
"grad_norm": 0.32750600576400757,
"learning_rate": 1.0675e-05,
"loss": 0.0018,
"step": 1470
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.3369393050670624,
"learning_rate": 1.0591666666666667e-05,
"loss": 0.0047,
"step": 1480
},
{
"epoch": 2.709090909090909,
"grad_norm": 0.19913850724697113,
"learning_rate": 1.0508333333333334e-05,
"loss": 0.0029,
"step": 1490
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.6039556264877319,
"learning_rate": 1.0425000000000001e-05,
"loss": 0.0029,
"step": 1500
},
{
"epoch": 2.7272727272727275,
"eval_f1": 0.9707426856714179,
"eval_f2": 0.9838807785888077,
"eval_loss": 0.008097349666059017,
"eval_precision": 0.9496086105675147,
"eval_recall": 0.992838874680307,
"eval_runtime": 10.7985,
"eval_samples_per_second": 724.178,
"eval_steps_per_second": 11.391,
"step": 1500
},
{
"epoch": 2.7454545454545456,
"grad_norm": 0.15625803172588348,
"learning_rate": 1.0341666666666668e-05,
"loss": 0.001,
"step": 1510
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.5355175733566284,
"learning_rate": 1.0258333333333335e-05,
"loss": 0.0015,
"step": 1520
},
{
"epoch": 2.7818181818181817,
"grad_norm": 0.054884154349565506,
"learning_rate": 1.0175000000000002e-05,
"loss": 0.0043,
"step": 1530
},
{
"epoch": 2.8,
"grad_norm": 0.14347773790359497,
"learning_rate": 1.0091666666666669e-05,
"loss": 0.0012,
"step": 1540
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.09393730759620667,
"learning_rate": 1.0008333333333334e-05,
"loss": 0.0026,
"step": 1550
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.2671602964401245,
"learning_rate": 9.925e-06,
"loss": 0.0006,
"step": 1560
},
{
"epoch": 2.8545454545454545,
"grad_norm": 0.04782993346452713,
"learning_rate": 9.841666666666668e-06,
"loss": 0.0011,
"step": 1570
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.5545538067817688,
"learning_rate": 9.758333333333334e-06,
"loss": 0.0034,
"step": 1580
},
{
"epoch": 2.8909090909090907,
"grad_norm": 0.18771076202392578,
"learning_rate": 9.675000000000001e-06,
"loss": 0.0014,
"step": 1590
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.022369615733623505,
"learning_rate": 9.591666666666667e-06,
"loss": 0.0007,
"step": 1600
},
{
"epoch": 2.9272727272727272,
"grad_norm": 0.569296658039093,
"learning_rate": 9.508333333333333e-06,
"loss": 0.0016,
"step": 1610
},
{
"epoch": 2.9454545454545453,
"grad_norm": 0.07517626136541367,
"learning_rate": 9.425e-06,
"loss": 0.0012,
"step": 1620
},
{
"epoch": 2.963636363636364,
"grad_norm": 0.4265158772468567,
"learning_rate": 9.341666666666667e-06,
"loss": 0.0013,
"step": 1630
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.31167715787887573,
"learning_rate": 9.258333333333334e-06,
"loss": 0.0014,
"step": 1640
},
{
"epoch": 3.0,
"grad_norm": 0.6325229406356812,
"learning_rate": 9.175000000000001e-06,
"loss": 0.0007,
"step": 1650
},
{
"epoch": 3.018181818181818,
"grad_norm": 0.04929906874895096,
"learning_rate": 9.091666666666668e-06,
"loss": 0.0006,
"step": 1660
},
{
"epoch": 3.036363636363636,
"grad_norm": 0.22075557708740234,
"learning_rate": 9.008333333333335e-06,
"loss": 0.0006,
"step": 1670
},
{
"epoch": 3.0545454545454547,
"grad_norm": 0.2008703649044037,
"learning_rate": 8.925e-06,
"loss": 0.0018,
"step": 1680
},
{
"epoch": 3.0727272727272728,
"grad_norm": 0.15318256616592407,
"learning_rate": 8.841666666666667e-06,
"loss": 0.0011,
"step": 1690
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.19851188361644745,
"learning_rate": 8.758333333333334e-06,
"loss": 0.0005,
"step": 1700
},
{
"epoch": 3.109090909090909,
"grad_norm": 0.02609218843281269,
"learning_rate": 8.675e-06,
"loss": 0.0014,
"step": 1710
},
{
"epoch": 3.1272727272727274,
"grad_norm": 0.02781720645725727,
"learning_rate": 8.591666666666668e-06,
"loss": 0.0004,
"step": 1720
},
{
"epoch": 3.1454545454545455,
"grad_norm": 0.17195935547351837,
"learning_rate": 8.508333333333335e-06,
"loss": 0.0011,
"step": 1730
},
{
"epoch": 3.1636363636363636,
"grad_norm": 0.04604584723711014,
"learning_rate": 8.425000000000001e-06,
"loss": 0.0017,
"step": 1740
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.01334014069288969,
"learning_rate": 8.341666666666667e-06,
"loss": 0.0005,
"step": 1750
},
{
"epoch": 3.2,
"grad_norm": 0.10181070119142532,
"learning_rate": 8.258333333333334e-06,
"loss": 0.0003,
"step": 1760
},
{
"epoch": 3.2181818181818183,
"grad_norm": 0.029040852561593056,
"learning_rate": 8.175e-06,
"loss": 0.0002,
"step": 1770
},
{
"epoch": 3.2363636363636363,
"grad_norm": 1.0948010683059692,
"learning_rate": 8.091666666666667e-06,
"loss": 0.0006,
"step": 1780
},
{
"epoch": 3.2545454545454544,
"grad_norm": 0.19002945721149445,
"learning_rate": 8.008333333333334e-06,
"loss": 0.0008,
"step": 1790
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.02836296707391739,
"learning_rate": 7.925000000000001e-06,
"loss": 0.0006,
"step": 1800
},
{
"epoch": 3.2727272727272725,
"eval_f1": 0.9827235772357723,
"eval_f2": 0.9866340169370472,
"eval_loss": 0.0109314676374197,
"eval_precision": 0.9762746087834427,
"eval_recall": 0.9892583120204603,
"eval_runtime": 10.4766,
"eval_samples_per_second": 746.427,
"eval_steps_per_second": 11.74,
"step": 1800
},
{
"epoch": 3.290909090909091,
"grad_norm": 0.018972614780068398,
"learning_rate": 7.841666666666668e-06,
"loss": 0.001,
"step": 1810
},
{
"epoch": 3.309090909090909,
"grad_norm": 0.003141665132716298,
"learning_rate": 7.758333333333335e-06,
"loss": 0.001,
"step": 1820
},
{
"epoch": 3.327272727272727,
"grad_norm": 0.029703687876462936,
"learning_rate": 7.675e-06,
"loss": 0.0007,
"step": 1830
},
{
"epoch": 3.3454545454545457,
"grad_norm": 0.18382185697555542,
"learning_rate": 7.591666666666667e-06,
"loss": 0.0004,
"step": 1840
},
{
"epoch": 3.3636363636363638,
"grad_norm": 0.05236556753516197,
"learning_rate": 7.508333333333334e-06,
"loss": 0.002,
"step": 1850
},
{
"epoch": 3.381818181818182,
"grad_norm": 0.17387185990810394,
"learning_rate": 7.425000000000001e-06,
"loss": 0.0009,
"step": 1860
},
{
"epoch": 3.4,
"grad_norm": 0.008212663233280182,
"learning_rate": 7.341666666666667e-06,
"loss": 0.0007,
"step": 1870
},
{
"epoch": 3.418181818181818,
"grad_norm": 0.22597701847553253,
"learning_rate": 7.258333333333334e-06,
"loss": 0.001,
"step": 1880
},
{
"epoch": 3.4363636363636365,
"grad_norm": 0.07276669144630432,
"learning_rate": 7.175000000000001e-06,
"loss": 0.0017,
"step": 1890
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.29078298807144165,
"learning_rate": 7.091666666666667e-06,
"loss": 0.0004,
"step": 1900
},
{
"epoch": 3.4727272727272727,
"grad_norm": 0.11019200086593628,
"learning_rate": 7.008333333333334e-06,
"loss": 0.0005,
"step": 1910
},
{
"epoch": 3.4909090909090907,
"grad_norm": 0.017450423911213875,
"learning_rate": 6.925000000000001e-06,
"loss": 0.0003,
"step": 1920
},
{
"epoch": 3.509090909090909,
"grad_norm": 0.023930951952934265,
"learning_rate": 6.8416666666666675e-06,
"loss": 0.0013,
"step": 1930
},
{
"epoch": 3.5272727272727273,
"grad_norm": 0.1692740023136139,
"learning_rate": 6.7583333333333336e-06,
"loss": 0.0033,
"step": 1940
},
{
"epoch": 3.5454545454545454,
"grad_norm": 0.031825270503759384,
"learning_rate": 6.6750000000000005e-06,
"loss": 0.0008,
"step": 1950
},
{
"epoch": 3.5636363636363635,
"grad_norm": 0.004583127796649933,
"learning_rate": 6.591666666666667e-06,
"loss": 0.0005,
"step": 1960
},
{
"epoch": 3.581818181818182,
"grad_norm": 0.19434763491153717,
"learning_rate": 6.508333333333334e-06,
"loss": 0.0003,
"step": 1970
},
{
"epoch": 3.6,
"grad_norm": 0.007167825475335121,
"learning_rate": 6.425e-06,
"loss": 0.0002,
"step": 1980
},
{
"epoch": 3.618181818181818,
"grad_norm": 0.24422968924045563,
"learning_rate": 6.341666666666667e-06,
"loss": 0.0003,
"step": 1990
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.00559116480872035,
"learning_rate": 6.258333333333334e-06,
"loss": 0.0001,
"step": 2000
},
{
"epoch": 3.6545454545454543,
"grad_norm": 0.3058757185935974,
"learning_rate": 6.175000000000001e-06,
"loss": 0.0015,
"step": 2010
},
{
"epoch": 3.672727272727273,
"grad_norm": 0.008120411075651646,
"learning_rate": 6.091666666666667e-06,
"loss": 0.0008,
"step": 2020
},
{
"epoch": 3.690909090909091,
"grad_norm": 0.007178381085395813,
"learning_rate": 6.008333333333334e-06,
"loss": 0.0003,
"step": 2030
},
{
"epoch": 3.709090909090909,
"grad_norm": 0.12139607220888138,
"learning_rate": 5.925000000000001e-06,
"loss": 0.0001,
"step": 2040
},
{
"epoch": 3.7272727272727275,
"grad_norm": 0.16555677354335785,
"learning_rate": 5.841666666666667e-06,
"loss": 0.0004,
"step": 2050
},
{
"epoch": 3.7454545454545456,
"grad_norm": 0.08208701014518738,
"learning_rate": 5.758333333333334e-06,
"loss": 0.0002,
"step": 2060
},
{
"epoch": 3.7636363636363637,
"grad_norm": 0.0696110725402832,
"learning_rate": 5.675000000000001e-06,
"loss": 0.0008,
"step": 2070
},
{
"epoch": 3.7818181818181817,
"grad_norm": 0.019171856343746185,
"learning_rate": 5.591666666666668e-06,
"loss": 0.0009,
"step": 2080
},
{
"epoch": 3.8,
"grad_norm": 0.011577253229916096,
"learning_rate": 5.508333333333334e-06,
"loss": 0.0007,
"step": 2090
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.1947954148054123,
"learning_rate": 5.4250000000000006e-06,
"loss": 0.0005,
"step": 2100
},
{
"epoch": 3.8181818181818183,
"eval_f1": 0.9789500380420999,
"eval_f2": 0.9838907014681892,
"eval_loss": 0.012196212075650692,
"eval_precision": 0.9708249496981891,
"eval_recall": 0.9872122762148338,
"eval_runtime": 10.6686,
"eval_samples_per_second": 732.992,
"eval_steps_per_second": 11.529,
"step": 2100
}
],
"logging_steps": 10,
"max_steps": 2750,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 300,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.07084650174464e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}