{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 5478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00547645125958379,
"grad_norm": 5.172877788543701,
"learning_rate": 2e-05,
"loss": 0.9408,
"step": 10
},
{
"epoch": 0.01095290251916758,
"grad_norm": 3.29821515083313,
"learning_rate": 2e-05,
"loss": 0.4667,
"step": 20
},
{
"epoch": 0.01642935377875137,
"grad_norm": 8.682180404663086,
"learning_rate": 2e-05,
"loss": 0.4354,
"step": 30
},
{
"epoch": 0.02190580503833516,
"grad_norm": 2.988210439682007,
"learning_rate": 2e-05,
"loss": 0.3187,
"step": 40
},
{
"epoch": 0.027382256297918947,
"grad_norm": 2.8658511638641357,
"learning_rate": 2e-05,
"loss": 0.2634,
"step": 50
},
{
"epoch": 0.03285870755750274,
"grad_norm": 4.159895896911621,
"learning_rate": 2e-05,
"loss": 0.2714,
"step": 60
},
{
"epoch": 0.038335158817086525,
"grad_norm": 4.234712600708008,
"learning_rate": 2e-05,
"loss": 0.2401,
"step": 70
},
{
"epoch": 0.04381161007667032,
"grad_norm": 4.615501403808594,
"learning_rate": 2e-05,
"loss": 0.2764,
"step": 80
},
{
"epoch": 0.04928806133625411,
"grad_norm": 2.7115259170532227,
"learning_rate": 2e-05,
"loss": 0.2515,
"step": 90
},
{
"epoch": 0.054764512595837894,
"grad_norm": 4.8792500495910645,
"learning_rate": 2e-05,
"loss": 0.248,
"step": 100
},
{
"epoch": 0.060240963855421686,
"grad_norm": 3.3276002407073975,
"learning_rate": 2e-05,
"loss": 0.2377,
"step": 110
},
{
"epoch": 0.06571741511500548,
"grad_norm": 3.494762897491455,
"learning_rate": 2e-05,
"loss": 0.2317,
"step": 120
},
{
"epoch": 0.07119386637458927,
"grad_norm": 3.085296392440796,
"learning_rate": 2e-05,
"loss": 0.2164,
"step": 130
},
{
"epoch": 0.07667031763417305,
"grad_norm": 2.1236257553100586,
"learning_rate": 2e-05,
"loss": 0.2612,
"step": 140
},
{
"epoch": 0.08214676889375684,
"grad_norm": 2.76648211479187,
"learning_rate": 2e-05,
"loss": 0.2113,
"step": 150
},
{
"epoch": 0.08762322015334063,
"grad_norm": 5.641284942626953,
"learning_rate": 2e-05,
"loss": 0.2361,
"step": 160
},
{
"epoch": 0.09309967141292443,
"grad_norm": 2.8492302894592285,
"learning_rate": 2e-05,
"loss": 0.2273,
"step": 170
},
{
"epoch": 0.09857612267250822,
"grad_norm": 4.214968204498291,
"learning_rate": 2e-05,
"loss": 0.2411,
"step": 180
},
{
"epoch": 0.10405257393209201,
"grad_norm": 4.820573329925537,
"learning_rate": 2e-05,
"loss": 0.182,
"step": 190
},
{
"epoch": 0.10952902519167579,
"grad_norm": 2.3844950199127197,
"learning_rate": 2e-05,
"loss": 0.2221,
"step": 200
},
{
"epoch": 0.11500547645125958,
"grad_norm": 3.7508792877197266,
"learning_rate": 2e-05,
"loss": 0.2787,
"step": 210
},
{
"epoch": 0.12048192771084337,
"grad_norm": 3.6480345726013184,
"learning_rate": 2e-05,
"loss": 0.2308,
"step": 220
},
{
"epoch": 0.12595837897042717,
"grad_norm": 3.0960395336151123,
"learning_rate": 2e-05,
"loss": 0.2141,
"step": 230
},
{
"epoch": 0.13143483023001096,
"grad_norm": 2.7262496948242188,
"learning_rate": 2e-05,
"loss": 0.2295,
"step": 240
},
{
"epoch": 0.13691128148959475,
"grad_norm": 3.7271182537078857,
"learning_rate": 2e-05,
"loss": 0.1957,
"step": 250
},
{
"epoch": 0.14238773274917854,
"grad_norm": 3.30234432220459,
"learning_rate": 2e-05,
"loss": 0.233,
"step": 260
},
{
"epoch": 0.14786418400876233,
"grad_norm": 2.1024515628814697,
"learning_rate": 2e-05,
"loss": 0.205,
"step": 270
},
{
"epoch": 0.1533406352683461,
"grad_norm": 3.121746063232422,
"learning_rate": 2e-05,
"loss": 0.1806,
"step": 280
},
{
"epoch": 0.1588170865279299,
"grad_norm": 5.09675407409668,
"learning_rate": 2e-05,
"loss": 0.2143,
"step": 290
},
{
"epoch": 0.16429353778751368,
"grad_norm": 2.1228365898132324,
"learning_rate": 2e-05,
"loss": 0.1972,
"step": 300
},
{
"epoch": 0.16976998904709747,
"grad_norm": 5.942009925842285,
"learning_rate": 2e-05,
"loss": 0.2367,
"step": 310
},
{
"epoch": 0.17524644030668127,
"grad_norm": 3.828641653060913,
"learning_rate": 2e-05,
"loss": 0.2219,
"step": 320
},
{
"epoch": 0.18072289156626506,
"grad_norm": 3.5164341926574707,
"learning_rate": 2e-05,
"loss": 0.2209,
"step": 330
},
{
"epoch": 0.18619934282584885,
"grad_norm": 3.276562452316284,
"learning_rate": 2e-05,
"loss": 0.2066,
"step": 340
},
{
"epoch": 0.19167579408543264,
"grad_norm": 2.173800468444824,
"learning_rate": 2e-05,
"loss": 0.1824,
"step": 350
},
{
"epoch": 0.19715224534501644,
"grad_norm": 4.548515319824219,
"learning_rate": 2e-05,
"loss": 0.2085,
"step": 360
},
{
"epoch": 0.20262869660460023,
"grad_norm": 5.658101558685303,
"learning_rate": 2e-05,
"loss": 0.2021,
"step": 370
},
{
"epoch": 0.20810514786418402,
"grad_norm": 2.9604289531707764,
"learning_rate": 2e-05,
"loss": 0.2139,
"step": 380
},
{
"epoch": 0.21358159912376778,
"grad_norm": 1.595267653465271,
"learning_rate": 2e-05,
"loss": 0.2441,
"step": 390
},
{
"epoch": 0.21905805038335158,
"grad_norm": 2.9476821422576904,
"learning_rate": 2e-05,
"loss": 0.2221,
"step": 400
},
{
"epoch": 0.22453450164293537,
"grad_norm": 6.530129909515381,
"learning_rate": 2e-05,
"loss": 0.2275,
"step": 410
},
{
"epoch": 0.23001095290251916,
"grad_norm": 2.306079626083374,
"learning_rate": 2e-05,
"loss": 0.1799,
"step": 420
},
{
"epoch": 0.23548740416210295,
"grad_norm": 3.0433263778686523,
"learning_rate": 2e-05,
"loss": 0.185,
"step": 430
},
{
"epoch": 0.24096385542168675,
"grad_norm": 3.235048770904541,
"learning_rate": 2e-05,
"loss": 0.2363,
"step": 440
},
{
"epoch": 0.24644030668127054,
"grad_norm": 2.9125173091888428,
"learning_rate": 2e-05,
"loss": 0.2158,
"step": 450
},
{
"epoch": 0.25191675794085433,
"grad_norm": 1.9198905229568481,
"learning_rate": 2e-05,
"loss": 0.2514,
"step": 460
},
{
"epoch": 0.2573932092004381,
"grad_norm": 4.257998943328857,
"learning_rate": 2e-05,
"loss": 0.2283,
"step": 470
},
{
"epoch": 0.2628696604600219,
"grad_norm": 1.3411542177200317,
"learning_rate": 2e-05,
"loss": 0.1712,
"step": 480
},
{
"epoch": 0.2683461117196057,
"grad_norm": 2.114000082015991,
"learning_rate": 2e-05,
"loss": 0.2022,
"step": 490
},
{
"epoch": 0.2738225629791895,
"grad_norm": 2.4874045848846436,
"learning_rate": 2e-05,
"loss": 0.1744,
"step": 500
},
{
"epoch": 0.2792990142387733,
"grad_norm": 3.700068950653076,
"learning_rate": 2e-05,
"loss": 0.2078,
"step": 510
},
{
"epoch": 0.2847754654983571,
"grad_norm": 3.8135080337524414,
"learning_rate": 2e-05,
"loss": 0.1884,
"step": 520
},
{
"epoch": 0.2902519167579409,
"grad_norm": 2.2092432975769043,
"learning_rate": 2e-05,
"loss": 0.1652,
"step": 530
},
{
"epoch": 0.29572836801752467,
"grad_norm": 2.4670491218566895,
"learning_rate": 2e-05,
"loss": 0.2201,
"step": 540
},
{
"epoch": 0.30120481927710846,
"grad_norm": 2.9962000846862793,
"learning_rate": 2e-05,
"loss": 0.1776,
"step": 550
},
{
"epoch": 0.3066812705366922,
"grad_norm": 2.7531886100769043,
"learning_rate": 2e-05,
"loss": 0.1997,
"step": 560
},
{
"epoch": 0.312157721796276,
"grad_norm": 3.9697697162628174,
"learning_rate": 2e-05,
"loss": 0.1689,
"step": 570
},
{
"epoch": 0.3176341730558598,
"grad_norm": 2.352689743041992,
"learning_rate": 2e-05,
"loss": 0.2196,
"step": 580
},
{
"epoch": 0.3231106243154436,
"grad_norm": 2.1512277126312256,
"learning_rate": 2e-05,
"loss": 0.2266,
"step": 590
},
{
"epoch": 0.32858707557502737,
"grad_norm": 3.3206424713134766,
"learning_rate": 2e-05,
"loss": 0.1837,
"step": 600
},
{
"epoch": 0.33406352683461116,
"grad_norm": 2.5991427898406982,
"learning_rate": 2e-05,
"loss": 0.1642,
"step": 610
},
{
"epoch": 0.33953997809419495,
"grad_norm": 4.06941032409668,
"learning_rate": 2e-05,
"loss": 0.1954,
"step": 620
},
{
"epoch": 0.34501642935377874,
"grad_norm": 2.006395101547241,
"learning_rate": 2e-05,
"loss": 0.1644,
"step": 630
},
{
"epoch": 0.35049288061336253,
"grad_norm": 2.333529233932495,
"learning_rate": 2e-05,
"loss": 0.1766,
"step": 640
},
{
"epoch": 0.3559693318729463,
"grad_norm": 1.8663638830184937,
"learning_rate": 2e-05,
"loss": 0.1427,
"step": 650
},
{
"epoch": 0.3614457831325301,
"grad_norm": 1.4063774347305298,
"learning_rate": 2e-05,
"loss": 0.1886,
"step": 660
},
{
"epoch": 0.3669222343921139,
"grad_norm": 2.3204855918884277,
"learning_rate": 2e-05,
"loss": 0.1794,
"step": 670
},
{
"epoch": 0.3723986856516977,
"grad_norm": 2.0688014030456543,
"learning_rate": 2e-05,
"loss": 0.1788,
"step": 680
},
{
"epoch": 0.3778751369112815,
"grad_norm": 2.9626317024230957,
"learning_rate": 2e-05,
"loss": 0.1931,
"step": 690
},
{
"epoch": 0.3833515881708653,
"grad_norm": 2.1686134338378906,
"learning_rate": 2e-05,
"loss": 0.1797,
"step": 700
},
{
"epoch": 0.3888280394304491,
"grad_norm": 3.220993995666504,
"learning_rate": 2e-05,
"loss": 0.2001,
"step": 710
},
{
"epoch": 0.39430449069003287,
"grad_norm": 4.387556076049805,
"learning_rate": 2e-05,
"loss": 0.1679,
"step": 720
},
{
"epoch": 0.39978094194961666,
"grad_norm": 6.711359024047852,
"learning_rate": 2e-05,
"loss": 0.1906,
"step": 730
},
{
"epoch": 0.40525739320920046,
"grad_norm": 2.8974344730377197,
"learning_rate": 2e-05,
"loss": 0.1842,
"step": 740
},
{
"epoch": 0.41073384446878425,
"grad_norm": 2.0242817401885986,
"learning_rate": 2e-05,
"loss": 0.1874,
"step": 750
},
{
"epoch": 0.41621029572836804,
"grad_norm": 2.71280574798584,
"learning_rate": 2e-05,
"loss": 0.1226,
"step": 760
},
{
"epoch": 0.42168674698795183,
"grad_norm": 2.318493604660034,
"learning_rate": 2e-05,
"loss": 0.2111,
"step": 770
},
{
"epoch": 0.42716319824753557,
"grad_norm": 4.267580509185791,
"learning_rate": 2e-05,
"loss": 0.2008,
"step": 780
},
{
"epoch": 0.43263964950711936,
"grad_norm": 2.4939076900482178,
"learning_rate": 2e-05,
"loss": 0.2053,
"step": 790
},
{
"epoch": 0.43811610076670315,
"grad_norm": 1.9013803005218506,
"learning_rate": 2e-05,
"loss": 0.1724,
"step": 800
},
{
"epoch": 0.44359255202628695,
"grad_norm": 3.4086337089538574,
"learning_rate": 2e-05,
"loss": 0.1688,
"step": 810
},
{
"epoch": 0.44906900328587074,
"grad_norm": 4.253098011016846,
"learning_rate": 2e-05,
"loss": 0.2092,
"step": 820
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.7755553722381592,
"learning_rate": 2e-05,
"loss": 0.1812,
"step": 830
},
{
"epoch": 0.4600219058050383,
"grad_norm": 1.562054991722107,
"learning_rate": 2e-05,
"loss": 0.2296,
"step": 840
},
{
"epoch": 0.4654983570646221,
"grad_norm": 1.4678446054458618,
"learning_rate": 2e-05,
"loss": 0.2006,
"step": 850
},
{
"epoch": 0.4709748083242059,
"grad_norm": 2.852142810821533,
"learning_rate": 2e-05,
"loss": 0.1763,
"step": 860
},
{
"epoch": 0.4764512595837897,
"grad_norm": 3.451470136642456,
"learning_rate": 2e-05,
"loss": 0.198,
"step": 870
},
{
"epoch": 0.4819277108433735,
"grad_norm": 4.738248348236084,
"learning_rate": 2e-05,
"loss": 0.1803,
"step": 880
},
{
"epoch": 0.4874041621029573,
"grad_norm": 2.169006824493408,
"learning_rate": 2e-05,
"loss": 0.1711,
"step": 890
},
{
"epoch": 0.4928806133625411,
"grad_norm": 6.985939979553223,
"learning_rate": 2e-05,
"loss": 0.1828,
"step": 900
},
{
"epoch": 0.49835706462212487,
"grad_norm": 3.6901047229766846,
"learning_rate": 2e-05,
"loss": 0.227,
"step": 910
},
{
"epoch": 0.5038335158817087,
"grad_norm": 4.057900428771973,
"learning_rate": 2e-05,
"loss": 0.181,
"step": 920
},
{
"epoch": 0.5093099671412924,
"grad_norm": 2.3261914253234863,
"learning_rate": 2e-05,
"loss": 0.2152,
"step": 930
},
{
"epoch": 0.5147864184008762,
"grad_norm": 2.459638833999634,
"learning_rate": 2e-05,
"loss": 0.2261,
"step": 940
},
{
"epoch": 0.52026286966046,
"grad_norm": 1.9431893825531006,
"learning_rate": 2e-05,
"loss": 0.1555,
"step": 950
},
{
"epoch": 0.5257393209200438,
"grad_norm": 2.213655710220337,
"learning_rate": 2e-05,
"loss": 0.1966,
"step": 960
},
{
"epoch": 0.5312157721796276,
"grad_norm": 3.0963807106018066,
"learning_rate": 2e-05,
"loss": 0.1672,
"step": 970
},
{
"epoch": 0.5366922234392114,
"grad_norm": 1.3352348804473877,
"learning_rate": 2e-05,
"loss": 0.1688,
"step": 980
},
{
"epoch": 0.5421686746987951,
"grad_norm": 4.130510330200195,
"learning_rate": 2e-05,
"loss": 0.2033,
"step": 990
},
{
"epoch": 0.547645125958379,
"grad_norm": 1.72730553150177,
"learning_rate": 2e-05,
"loss": 0.1883,
"step": 1000
},
{
"epoch": 0.5531215772179627,
"grad_norm": 2.0997588634490967,
"learning_rate": 2e-05,
"loss": 0.2085,
"step": 1010
},
{
"epoch": 0.5585980284775466,
"grad_norm": 1.9637783765792847,
"learning_rate": 2e-05,
"loss": 0.1787,
"step": 1020
},
{
"epoch": 0.5640744797371303,
"grad_norm": 5.923566818237305,
"learning_rate": 2e-05,
"loss": 0.179,
"step": 1030
},
{
"epoch": 0.5695509309967142,
"grad_norm": 2.0748746395111084,
"learning_rate": 2e-05,
"loss": 0.1937,
"step": 1040
},
{
"epoch": 0.5750273822562979,
"grad_norm": 3.298703193664551,
"learning_rate": 2e-05,
"loss": 0.1661,
"step": 1050
},
{
"epoch": 0.5805038335158818,
"grad_norm": 2.482518196105957,
"learning_rate": 2e-05,
"loss": 0.1767,
"step": 1060
},
{
"epoch": 0.5859802847754655,
"grad_norm": 1.430920124053955,
"learning_rate": 2e-05,
"loss": 0.1592,
"step": 1070
},
{
"epoch": 0.5914567360350493,
"grad_norm": 2.223555326461792,
"learning_rate": 2e-05,
"loss": 0.1623,
"step": 1080
},
{
"epoch": 0.5969331872946331,
"grad_norm": 3.7139480113983154,
"learning_rate": 2e-05,
"loss": 0.2036,
"step": 1090
},
{
"epoch": 0.6024096385542169,
"grad_norm": 2.7747836112976074,
"learning_rate": 2e-05,
"loss": 0.159,
"step": 1100
},
{
"epoch": 0.6078860898138007,
"grad_norm": 1.7586026191711426,
"learning_rate": 2e-05,
"loss": 0.1994,
"step": 1110
},
{
"epoch": 0.6133625410733844,
"grad_norm": 1.7416315078735352,
"learning_rate": 2e-05,
"loss": 0.1974,
"step": 1120
},
{
"epoch": 0.6188389923329682,
"grad_norm": 2.214825391769409,
"learning_rate": 2e-05,
"loss": 0.1584,
"step": 1130
},
{
"epoch": 0.624315443592552,
"grad_norm": 4.937350273132324,
"learning_rate": 2e-05,
"loss": 0.1751,
"step": 1140
},
{
"epoch": 0.6297918948521358,
"grad_norm": 2.918086290359497,
"learning_rate": 2e-05,
"loss": 0.1872,
"step": 1150
},
{
"epoch": 0.6352683461117196,
"grad_norm": 2.486037254333496,
"learning_rate": 2e-05,
"loss": 0.1769,
"step": 1160
},
{
"epoch": 0.6407447973713034,
"grad_norm": 1.8280752897262573,
"learning_rate": 2e-05,
"loss": 0.1948,
"step": 1170
},
{
"epoch": 0.6462212486308871,
"grad_norm": 2.8489694595336914,
"learning_rate": 2e-05,
"loss": 0.1598,
"step": 1180
},
{
"epoch": 0.651697699890471,
"grad_norm": 5.198742389678955,
"learning_rate": 2e-05,
"loss": 0.1503,
"step": 1190
},
{
"epoch": 0.6571741511500547,
"grad_norm": 4.022942066192627,
"learning_rate": 2e-05,
"loss": 0.1242,
"step": 1200
},
{
"epoch": 0.6626506024096386,
"grad_norm": 1.6303025484085083,
"learning_rate": 2e-05,
"loss": 0.1753,
"step": 1210
},
{
"epoch": 0.6681270536692223,
"grad_norm": 2.142221450805664,
"learning_rate": 2e-05,
"loss": 0.1933,
"step": 1220
},
{
"epoch": 0.6736035049288062,
"grad_norm": 1.7236963510513306,
"learning_rate": 2e-05,
"loss": 0.1632,
"step": 1230
},
{
"epoch": 0.6790799561883899,
"grad_norm": 6.54170560836792,
"learning_rate": 2e-05,
"loss": 0.1745,
"step": 1240
},
{
"epoch": 0.6845564074479737,
"grad_norm": 3.051344394683838,
"learning_rate": 2e-05,
"loss": 0.161,
"step": 1250
},
{
"epoch": 0.6900328587075575,
"grad_norm": 4.368143558502197,
"learning_rate": 2e-05,
"loss": 0.1686,
"step": 1260
},
{
"epoch": 0.6955093099671413,
"grad_norm": 2.810403347015381,
"learning_rate": 2e-05,
"loss": 0.1341,
"step": 1270
},
{
"epoch": 0.7009857612267251,
"grad_norm": 3.573010206222534,
"learning_rate": 2e-05,
"loss": 0.1888,
"step": 1280
},
{
"epoch": 0.7064622124863089,
"grad_norm": 3.028301954269409,
"learning_rate": 2e-05,
"loss": 0.1753,
"step": 1290
},
{
"epoch": 0.7119386637458927,
"grad_norm": 3.8532004356384277,
"learning_rate": 2e-05,
"loss": 0.1733,
"step": 1300
},
{
"epoch": 0.7174151150054765,
"grad_norm": 2.062229633331299,
"learning_rate": 2e-05,
"loss": 0.1533,
"step": 1310
},
{
"epoch": 0.7228915662650602,
"grad_norm": 3.278475284576416,
"learning_rate": 2e-05,
"loss": 0.157,
"step": 1320
},
{
"epoch": 0.7283680175246441,
"grad_norm": 2.650078058242798,
"learning_rate": 2e-05,
"loss": 0.168,
"step": 1330
},
{
"epoch": 0.7338444687842278,
"grad_norm": 3.017892360687256,
"learning_rate": 2e-05,
"loss": 0.2381,
"step": 1340
},
{
"epoch": 0.7393209200438116,
"grad_norm": 1.8926328420639038,
"learning_rate": 2e-05,
"loss": 0.1815,
"step": 1350
},
{
"epoch": 0.7447973713033954,
"grad_norm": 1.819949746131897,
"learning_rate": 2e-05,
"loss": 0.1387,
"step": 1360
},
{
"epoch": 0.7502738225629791,
"grad_norm": 1.5167309045791626,
"learning_rate": 2e-05,
"loss": 0.138,
"step": 1370
},
{
"epoch": 0.755750273822563,
"grad_norm": 2.128439426422119,
"learning_rate": 2e-05,
"loss": 0.1566,
"step": 1380
},
{
"epoch": 0.7612267250821467,
"grad_norm": 1.9905339479446411,
"learning_rate": 2e-05,
"loss": 0.1711,
"step": 1390
},
{
"epoch": 0.7667031763417306,
"grad_norm": 1.624918818473816,
"learning_rate": 2e-05,
"loss": 0.1614,
"step": 1400
},
{
"epoch": 0.7721796276013143,
"grad_norm": 2.2797772884368896,
"learning_rate": 2e-05,
"loss": 0.1839,
"step": 1410
},
{
"epoch": 0.7776560788608982,
"grad_norm": 1.8958566188812256,
"learning_rate": 2e-05,
"loss": 0.1609,
"step": 1420
},
{
"epoch": 0.7831325301204819,
"grad_norm": 4.018071174621582,
"learning_rate": 2e-05,
"loss": 0.2153,
"step": 1430
},
{
"epoch": 0.7886089813800657,
"grad_norm": 2.038041353225708,
"learning_rate": 2e-05,
"loss": 0.1576,
"step": 1440
},
{
"epoch": 0.7940854326396495,
"grad_norm": 3.009593963623047,
"learning_rate": 2e-05,
"loss": 0.1448,
"step": 1450
},
{
"epoch": 0.7995618838992333,
"grad_norm": 2.0207109451293945,
"learning_rate": 2e-05,
"loss": 0.1642,
"step": 1460
},
{
"epoch": 0.8050383351588171,
"grad_norm": 1.9745655059814453,
"learning_rate": 2e-05,
"loss": 0.1499,
"step": 1470
},
{
"epoch": 0.8105147864184009,
"grad_norm": 2.5017263889312744,
"learning_rate": 2e-05,
"loss": 0.1865,
"step": 1480
},
{
"epoch": 0.8159912376779846,
"grad_norm": 3.3768310546875,
"learning_rate": 2e-05,
"loss": 0.1439,
"step": 1490
},
{
"epoch": 0.8214676889375685,
"grad_norm": 1.90123450756073,
"learning_rate": 2e-05,
"loss": 0.1712,
"step": 1500
},
{
"epoch": 0.8269441401971522,
"grad_norm": 1.7746949195861816,
"learning_rate": 2e-05,
"loss": 0.1712,
"step": 1510
},
{
"epoch": 0.8324205914567361,
"grad_norm": 2.588888645172119,
"learning_rate": 2e-05,
"loss": 0.189,
"step": 1520
},
{
"epoch": 0.8378970427163198,
"grad_norm": 3.115365743637085,
"learning_rate": 2e-05,
"loss": 0.1595,
"step": 1530
},
{
"epoch": 0.8433734939759037,
"grad_norm": 1.9716410636901855,
"learning_rate": 2e-05,
"loss": 0.1639,
"step": 1540
},
{
"epoch": 0.8488499452354874,
"grad_norm": 1.8997513055801392,
"learning_rate": 2e-05,
"loss": 0.1843,
"step": 1550
},
{
"epoch": 0.8543263964950711,
"grad_norm": 1.860077142715454,
"learning_rate": 2e-05,
"loss": 0.1748,
"step": 1560
},
{
"epoch": 0.859802847754655,
"grad_norm": 2.047376871109009,
"learning_rate": 2e-05,
"loss": 0.1548,
"step": 1570
},
{
"epoch": 0.8652792990142387,
"grad_norm": 2.242220401763916,
"learning_rate": 2e-05,
"loss": 0.1897,
"step": 1580
},
{
"epoch": 0.8707557502738226,
"grad_norm": 1.3936264514923096,
"learning_rate": 2e-05,
"loss": 0.1529,
"step": 1590
},
{
"epoch": 0.8762322015334063,
"grad_norm": 1.3506709337234497,
"learning_rate": 2e-05,
"loss": 0.1635,
"step": 1600
},
{
"epoch": 0.8817086527929902,
"grad_norm": 2.024489641189575,
"learning_rate": 2e-05,
"loss": 0.1751,
"step": 1610
},
{
"epoch": 0.8871851040525739,
"grad_norm": 3.0132129192352295,
"learning_rate": 2e-05,
"loss": 0.1447,
"step": 1620
},
{
"epoch": 0.8926615553121577,
"grad_norm": 1.4840929508209229,
"learning_rate": 2e-05,
"loss": 0.1668,
"step": 1630
},
{
"epoch": 0.8981380065717415,
"grad_norm": 5.782477378845215,
"learning_rate": 2e-05,
"loss": 0.186,
"step": 1640
},
{
"epoch": 0.9036144578313253,
"grad_norm": 3.7930588722229004,
"learning_rate": 2e-05,
"loss": 0.1477,
"step": 1650
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.4280755519866943,
"learning_rate": 2e-05,
"loss": 0.1733,
"step": 1660
},
{
"epoch": 0.9145673603504929,
"grad_norm": 3.5071022510528564,
"learning_rate": 2e-05,
"loss": 0.1771,
"step": 1670
},
{
"epoch": 0.9200438116100766,
"grad_norm": 1.890026330947876,
"learning_rate": 2e-05,
"loss": 0.1807,
"step": 1680
},
{
"epoch": 0.9255202628696605,
"grad_norm": 4.8093647956848145,
"learning_rate": 2e-05,
"loss": 0.1783,
"step": 1690
},
{
"epoch": 0.9309967141292442,
"grad_norm": 3.141622304916382,
"learning_rate": 2e-05,
"loss": 0.1425,
"step": 1700
},
{
"epoch": 0.9364731653888281,
"grad_norm": 1.4867947101593018,
"learning_rate": 2e-05,
"loss": 0.1549,
"step": 1710
},
{
"epoch": 0.9419496166484118,
"grad_norm": 2.396588087081909,
"learning_rate": 2e-05,
"loss": 0.1459,
"step": 1720
},
{
"epoch": 0.9474260679079957,
"grad_norm": 2.241640329360962,
"learning_rate": 2e-05,
"loss": 0.2159,
"step": 1730
},
{
"epoch": 0.9529025191675794,
"grad_norm": 2.0894391536712646,
"learning_rate": 2e-05,
"loss": 0.1824,
"step": 1740
},
{
"epoch": 0.9583789704271632,
"grad_norm": 2.195657253265381,
"learning_rate": 2e-05,
"loss": 0.1773,
"step": 1750
},
{
"epoch": 0.963855421686747,
"grad_norm": 1.9088704586029053,
"learning_rate": 2e-05,
"loss": 0.1671,
"step": 1760
},
{
"epoch": 0.9693318729463308,
"grad_norm": 2.40929913520813,
"learning_rate": 2e-05,
"loss": 0.1483,
"step": 1770
},
{
"epoch": 0.9748083242059146,
"grad_norm": 2.1379597187042236,
"learning_rate": 2e-05,
"loss": 0.1651,
"step": 1780
},
{
"epoch": 0.9802847754654983,
"grad_norm": 1.5753893852233887,
"learning_rate": 2e-05,
"loss": 0.1521,
"step": 1790
},
{
"epoch": 0.9857612267250822,
"grad_norm": 1.7690379619598389,
"learning_rate": 2e-05,
"loss": 0.1685,
"step": 1800
},
{
"epoch": 0.9912376779846659,
"grad_norm": 3.3368117809295654,
"learning_rate": 2e-05,
"loss": 0.1808,
"step": 1810
},
{
"epoch": 0.9967141292442497,
"grad_norm": 2.3334920406341553,
"learning_rate": 2e-05,
"loss": 0.1687,
"step": 1820
},
{
"epoch": 1.0,
"eval_accuracy": 0.9350717996050485,
"eval_f1": 0.8531654843973757,
"eval_loss": 0.16189317405223846,
"eval_precision": 0.7893026050251876,
"eval_recall": 0.9282724884500407,
"eval_runtime": 19.8282,
"eval_samples_per_second": 311.929,
"eval_steps_per_second": 19.518,
"step": 1826
},
{
"epoch": 1.0021905805038336,
"grad_norm": 3.1540722846984863,
"learning_rate": 2e-05,
"loss": 0.1989,
"step": 1830
},
{
"epoch": 1.0076670317634173,
"grad_norm": 2.7038586139678955,
"learning_rate": 2e-05,
"loss": 0.1663,
"step": 1840
},
{
"epoch": 1.013143483023001,
"grad_norm": 2.185299873352051,
"learning_rate": 2e-05,
"loss": 0.1469,
"step": 1850
},
{
"epoch": 1.0186199342825848,
"grad_norm": 4.436729431152344,
"learning_rate": 2e-05,
"loss": 0.1658,
"step": 1860
},
{
"epoch": 1.0240963855421688,
"grad_norm": 2.3837592601776123,
"learning_rate": 2e-05,
"loss": 0.1563,
"step": 1870
},
{
"epoch": 1.0295728368017525,
"grad_norm": 1.6888504028320312,
"learning_rate": 2e-05,
"loss": 0.1705,
"step": 1880
},
{
"epoch": 1.0350492880613362,
"grad_norm": 1.7870920896530151,
"learning_rate": 2e-05,
"loss": 0.1754,
"step": 1890
},
{
"epoch": 1.04052573932092,
"grad_norm": 2.037872314453125,
"learning_rate": 2e-05,
"loss": 0.134,
"step": 1900
},
{
"epoch": 1.046002190580504,
"grad_norm": 1.956781029701233,
"learning_rate": 2e-05,
"loss": 0.1525,
"step": 1910
},
{
"epoch": 1.0514786418400877,
"grad_norm": 3.9054486751556396,
"learning_rate": 2e-05,
"loss": 0.1405,
"step": 1920
},
{
"epoch": 1.0569550930996714,
"grad_norm": 3.2423737049102783,
"learning_rate": 2e-05,
"loss": 0.1304,
"step": 1930
},
{
"epoch": 1.0624315443592551,
"grad_norm": 1.7311038970947266,
"learning_rate": 2e-05,
"loss": 0.1288,
"step": 1940
},
{
"epoch": 1.067907995618839,
"grad_norm": 3.4807159900665283,
"learning_rate": 2e-05,
"loss": 0.1723,
"step": 1950
},
{
"epoch": 1.0733844468784228,
"grad_norm": 1.2659446001052856,
"learning_rate": 2e-05,
"loss": 0.1674,
"step": 1960
},
{
"epoch": 1.0788608981380066,
"grad_norm": 2.4274561405181885,
"learning_rate": 2e-05,
"loss": 0.1634,
"step": 1970
},
{
"epoch": 1.0843373493975903,
"grad_norm": 1.1292122602462769,
"learning_rate": 2e-05,
"loss": 0.1579,
"step": 1980
},
{
"epoch": 1.0898138006571743,
"grad_norm": 4.752586364746094,
"learning_rate": 2e-05,
"loss": 0.1267,
"step": 1990
},
{
"epoch": 1.095290251916758,
"grad_norm": 4.907914161682129,
"learning_rate": 2e-05,
"loss": 0.1444,
"step": 2000
},
{
"epoch": 1.1007667031763417,
"grad_norm": 1.4939802885055542,
"learning_rate": 2e-05,
"loss": 0.1763,
"step": 2010
},
{
"epoch": 1.1062431544359255,
"grad_norm": 4.959670066833496,
"learning_rate": 2e-05,
"loss": 0.1581,
"step": 2020
},
{
"epoch": 1.1117196056955092,
"grad_norm": 1.6166772842407227,
"learning_rate": 2e-05,
"loss": 0.1379,
"step": 2030
},
{
"epoch": 1.1171960569550932,
"grad_norm": 1.21837317943573,
"learning_rate": 2e-05,
"loss": 0.1259,
"step": 2040
},
{
"epoch": 1.122672508214677,
"grad_norm": 2.209987163543701,
"learning_rate": 2e-05,
"loss": 0.134,
"step": 2050
},
{
"epoch": 1.1281489594742606,
"grad_norm": 2.8201191425323486,
"learning_rate": 2e-05,
"loss": 0.1391,
"step": 2060
},
{
"epoch": 1.1336254107338444,
"grad_norm": 2.6724655628204346,
"learning_rate": 2e-05,
"loss": 0.1731,
"step": 2070
},
{
"epoch": 1.1391018619934283,
"grad_norm": 3.112408399581909,
"learning_rate": 2e-05,
"loss": 0.1419,
"step": 2080
},
{
"epoch": 1.144578313253012,
"grad_norm": 6.945749759674072,
"learning_rate": 2e-05,
"loss": 0.1673,
"step": 2090
},
{
"epoch": 1.1500547645125958,
"grad_norm": 4.609986782073975,
"learning_rate": 2e-05,
"loss": 0.1118,
"step": 2100
},
{
"epoch": 1.1555312157721795,
"grad_norm": 1.9280059337615967,
"learning_rate": 2e-05,
"loss": 0.1697,
"step": 2110
},
{
"epoch": 1.1610076670317635,
"grad_norm": 2.8931400775909424,
"learning_rate": 2e-05,
"loss": 0.1794,
"step": 2120
},
{
"epoch": 1.1664841182913472,
"grad_norm": 1.505615234375,
"learning_rate": 2e-05,
"loss": 0.1438,
"step": 2130
},
{
"epoch": 1.171960569550931,
"grad_norm": 1.2091026306152344,
"learning_rate": 2e-05,
"loss": 0.1409,
"step": 2140
},
{
"epoch": 1.1774370208105147,
"grad_norm": 5.640398025512695,
"learning_rate": 2e-05,
"loss": 0.1374,
"step": 2150
},
{
"epoch": 1.1829134720700987,
"grad_norm": 1.9069983959197998,
"learning_rate": 2e-05,
"loss": 0.1333,
"step": 2160
},
{
"epoch": 1.1883899233296824,
"grad_norm": 2.034888982772827,
"learning_rate": 2e-05,
"loss": 0.185,
"step": 2170
},
{
"epoch": 1.1938663745892661,
"grad_norm": 1.780856728553772,
"learning_rate": 2e-05,
"loss": 0.1463,
"step": 2180
},
{
"epoch": 1.1993428258488499,
"grad_norm": 3.035339593887329,
"learning_rate": 2e-05,
"loss": 0.1662,
"step": 2190
},
{
"epoch": 1.2048192771084336,
"grad_norm": 2.7439584732055664,
"learning_rate": 2e-05,
"loss": 0.1481,
"step": 2200
},
{
"epoch": 1.2102957283680176,
"grad_norm": 4.901017189025879,
"learning_rate": 2e-05,
"loss": 0.1641,
"step": 2210
},
{
"epoch": 1.2157721796276013,
"grad_norm": 2.227445125579834,
"learning_rate": 2e-05,
"loss": 0.1582,
"step": 2220
},
{
"epoch": 1.221248630887185,
"grad_norm": 2.1216564178466797,
"learning_rate": 2e-05,
"loss": 0.1554,
"step": 2230
},
{
"epoch": 1.226725082146769,
"grad_norm": 1.2567392587661743,
"learning_rate": 2e-05,
"loss": 0.1683,
"step": 2240
},
{
"epoch": 1.2322015334063527,
"grad_norm": 1.426159381866455,
"learning_rate": 2e-05,
"loss": 0.17,
"step": 2250
},
{
"epoch": 1.2376779846659365,
"grad_norm": 1.435729742050171,
"learning_rate": 2e-05,
"loss": 0.1404,
"step": 2260
},
{
"epoch": 1.2431544359255202,
"grad_norm": 5.743936538696289,
"learning_rate": 2e-05,
"loss": 0.1665,
"step": 2270
},
{
"epoch": 1.248630887185104,
"grad_norm": 1.7799255847930908,
"learning_rate": 2e-05,
"loss": 0.1538,
"step": 2280
},
{
"epoch": 1.254107338444688,
"grad_norm": 2.466597318649292,
"learning_rate": 2e-05,
"loss": 0.1038,
"step": 2290
},
{
"epoch": 1.2595837897042717,
"grad_norm": 2.635021686553955,
"learning_rate": 2e-05,
"loss": 0.1613,
"step": 2300
},
{
"epoch": 1.2650602409638554,
"grad_norm": 2.1053247451782227,
"learning_rate": 2e-05,
"loss": 0.1169,
"step": 2310
},
{
"epoch": 1.2705366922234393,
"grad_norm": 2.312171459197998,
"learning_rate": 2e-05,
"loss": 0.1574,
"step": 2320
},
{
"epoch": 1.276013143483023,
"grad_norm": 4.142621994018555,
"learning_rate": 2e-05,
"loss": 0.1568,
"step": 2330
},
{
"epoch": 1.2814895947426068,
"grad_norm": 3.278440237045288,
"learning_rate": 2e-05,
"loss": 0.1758,
"step": 2340
},
{
"epoch": 1.2869660460021906,
"grad_norm": 2.5266401767730713,
"learning_rate": 2e-05,
"loss": 0.177,
"step": 2350
},
{
"epoch": 1.2924424972617743,
"grad_norm": 2.4267191886901855,
"learning_rate": 2e-05,
"loss": 0.1485,
"step": 2360
},
{
"epoch": 1.297918948521358,
"grad_norm": 1.5584640502929688,
"learning_rate": 2e-05,
"loss": 0.1583,
"step": 2370
},
{
"epoch": 1.303395399780942,
"grad_norm": 2.881457805633545,
"learning_rate": 2e-05,
"loss": 0.1515,
"step": 2380
},
{
"epoch": 1.3088718510405257,
"grad_norm": 4.441954612731934,
"learning_rate": 2e-05,
"loss": 0.1414,
"step": 2390
},
{
"epoch": 1.3143483023001095,
"grad_norm": 5.456357479095459,
"learning_rate": 2e-05,
"loss": 0.1685,
"step": 2400
},
{
"epoch": 1.3198247535596934,
"grad_norm": 2.449070930480957,
"learning_rate": 2e-05,
"loss": 0.1445,
"step": 2410
},
{
"epoch": 1.3253012048192772,
"grad_norm": 2.4937679767608643,
"learning_rate": 2e-05,
"loss": 0.1528,
"step": 2420
},
{
"epoch": 1.330777656078861,
"grad_norm": 1.7794448137283325,
"learning_rate": 2e-05,
"loss": 0.1601,
"step": 2430
},
{
"epoch": 1.3362541073384446,
"grad_norm": 1.795912265777588,
"learning_rate": 2e-05,
"loss": 0.1542,
"step": 2440
},
{
"epoch": 1.3417305585980284,
"grad_norm": 1.772538661956787,
"learning_rate": 2e-05,
"loss": 0.1297,
"step": 2450
},
{
"epoch": 1.3472070098576123,
"grad_norm": 1.0752304792404175,
"learning_rate": 2e-05,
"loss": 0.13,
"step": 2460
},
{
"epoch": 1.352683461117196,
"grad_norm": 2.1968908309936523,
"learning_rate": 2e-05,
"loss": 0.1373,
"step": 2470
},
{
"epoch": 1.3581599123767798,
"grad_norm": 0.7487109303474426,
"learning_rate": 2e-05,
"loss": 0.1646,
"step": 2480
},
{
"epoch": 1.3636363636363638,
"grad_norm": 2.1781516075134277,
"learning_rate": 2e-05,
"loss": 0.1852,
"step": 2490
},
{
"epoch": 1.3691128148959475,
"grad_norm": 1.8818821907043457,
"learning_rate": 2e-05,
"loss": 0.1481,
"step": 2500
},
{
"epoch": 1.3745892661555312,
"grad_norm": 2.2098746299743652,
"learning_rate": 2e-05,
"loss": 0.1823,
"step": 2510
},
{
"epoch": 1.380065717415115,
"grad_norm": 1.5912271738052368,
"learning_rate": 2e-05,
"loss": 0.1968,
"step": 2520
},
{
"epoch": 1.3855421686746987,
"grad_norm": 1.1806056499481201,
"learning_rate": 2e-05,
"loss": 0.1677,
"step": 2530
},
{
"epoch": 1.3910186199342827,
"grad_norm": 1.9674164056777954,
"learning_rate": 2e-05,
"loss": 0.1273,
"step": 2540
},
{
"epoch": 1.3964950711938664,
"grad_norm": 4.151760578155518,
"learning_rate": 2e-05,
"loss": 0.1658,
"step": 2550
},
{
"epoch": 1.4019715224534501,
"grad_norm": 1.8803857564926147,
"learning_rate": 2e-05,
"loss": 0.1466,
"step": 2560
},
{
"epoch": 1.4074479737130339,
"grad_norm": 2.625727891921997,
"learning_rate": 2e-05,
"loss": 0.1298,
"step": 2570
},
{
"epoch": 1.4129244249726178,
"grad_norm": 2.6431047916412354,
"learning_rate": 2e-05,
"loss": 0.1503,
"step": 2580
},
{
"epoch": 1.4184008762322016,
"grad_norm": 4.68942928314209,
"learning_rate": 2e-05,
"loss": 0.1375,
"step": 2590
},
{
"epoch": 1.4238773274917853,
"grad_norm": 2.73363995552063,
"learning_rate": 2e-05,
"loss": 0.1712,
"step": 2600
},
{
"epoch": 1.429353778751369,
"grad_norm": 3.2278857231140137,
"learning_rate": 2e-05,
"loss": 0.1425,
"step": 2610
},
{
"epoch": 1.4348302300109528,
"grad_norm": 3.2857725620269775,
"learning_rate": 2e-05,
"loss": 0.1111,
"step": 2620
},
{
"epoch": 1.4403066812705367,
"grad_norm": 1.6636910438537598,
"learning_rate": 2e-05,
"loss": 0.1231,
"step": 2630
},
{
"epoch": 1.4457831325301205,
"grad_norm": 1.7655991315841675,
"learning_rate": 2e-05,
"loss": 0.1526,
"step": 2640
},
{
"epoch": 1.4512595837897042,
"grad_norm": 2.4831273555755615,
"learning_rate": 2e-05,
"loss": 0.1626,
"step": 2650
},
{
"epoch": 1.4567360350492882,
"grad_norm": 1.5845210552215576,
"learning_rate": 2e-05,
"loss": 0.1471,
"step": 2660
},
{
"epoch": 1.462212486308872,
"grad_norm": 2.035768985748291,
"learning_rate": 2e-05,
"loss": 0.1353,
"step": 2670
},
{
"epoch": 1.4676889375684556,
"grad_norm": 3.0364644527435303,
"learning_rate": 2e-05,
"loss": 0.1534,
"step": 2680
},
{
"epoch": 1.4731653888280394,
"grad_norm": 1.0436877012252808,
"learning_rate": 2e-05,
"loss": 0.1384,
"step": 2690
},
{
"epoch": 1.4786418400876231,
"grad_norm": 3.814385175704956,
"learning_rate": 2e-05,
"loss": 0.1571,
"step": 2700
},
{
"epoch": 1.484118291347207,
"grad_norm": 4.043318748474121,
"learning_rate": 2e-05,
"loss": 0.1408,
"step": 2710
},
{
"epoch": 1.4895947426067908,
"grad_norm": 2.101560354232788,
"learning_rate": 2e-05,
"loss": 0.1506,
"step": 2720
},
{
"epoch": 1.4950711938663745,
"grad_norm": 3.871284008026123,
"learning_rate": 2e-05,
"loss": 0.1598,
"step": 2730
},
{
"epoch": 1.5005476451259585,
"grad_norm": 1.0195356607437134,
"learning_rate": 2e-05,
"loss": 0.1625,
"step": 2740
},
{
"epoch": 1.5060240963855422,
"grad_norm": 2.7019972801208496,
"learning_rate": 2e-05,
"loss": 0.1396,
"step": 2750
},
{
"epoch": 1.511500547645126,
"grad_norm": 3.040086269378662,
"learning_rate": 2e-05,
"loss": 0.1503,
"step": 2760
},
{
"epoch": 1.5169769989047097,
"grad_norm": 1.6536140441894531,
"learning_rate": 2e-05,
"loss": 0.1424,
"step": 2770
},
{
"epoch": 1.5224534501642935,
"grad_norm": 2.9479269981384277,
"learning_rate": 2e-05,
"loss": 0.1537,
"step": 2780
},
{
"epoch": 1.5279299014238772,
"grad_norm": 2.638228416442871,
"learning_rate": 2e-05,
"loss": 0.1517,
"step": 2790
},
{
"epoch": 1.5334063526834611,
"grad_norm": 1.5154801607131958,
"learning_rate": 2e-05,
"loss": 0.1627,
"step": 2800
},
{
"epoch": 1.5388828039430449,
"grad_norm": 4.037379264831543,
"learning_rate": 2e-05,
"loss": 0.1788,
"step": 2810
},
{
"epoch": 1.5443592552026288,
"grad_norm": 3.5345592498779297,
"learning_rate": 2e-05,
"loss": 0.1768,
"step": 2820
},
{
"epoch": 1.5498357064622126,
"grad_norm": 3.8549864292144775,
"learning_rate": 2e-05,
"loss": 0.1721,
"step": 2830
},
{
"epoch": 1.5553121577217963,
"grad_norm": 3.5247507095336914,
"learning_rate": 2e-05,
"loss": 0.1305,
"step": 2840
},
{
"epoch": 1.56078860898138,
"grad_norm": 2.387272834777832,
"learning_rate": 2e-05,
"loss": 0.1234,
"step": 2850
},
{
"epoch": 1.5662650602409638,
"grad_norm": 3.007579803466797,
"learning_rate": 2e-05,
"loss": 0.152,
"step": 2860
},
{
"epoch": 1.5717415115005475,
"grad_norm": 1.0041784048080444,
"learning_rate": 2e-05,
"loss": 0.1489,
"step": 2870
},
{
"epoch": 1.5772179627601315,
"grad_norm": 3.3091013431549072,
"learning_rate": 2e-05,
"loss": 0.14,
"step": 2880
},
{
"epoch": 1.5826944140197152,
"grad_norm": 1.844616174697876,
"learning_rate": 2e-05,
"loss": 0.1721,
"step": 2890
},
{
"epoch": 1.588170865279299,
"grad_norm": 3.9923973083496094,
"learning_rate": 2e-05,
"loss": 0.1546,
"step": 2900
},
{
"epoch": 1.593647316538883,
"grad_norm": 2.3511135578155518,
"learning_rate": 2e-05,
"loss": 0.1477,
"step": 2910
},
{
"epoch": 1.5991237677984667,
"grad_norm": 2.524749994277954,
"learning_rate": 2e-05,
"loss": 0.1613,
"step": 2920
},
{
"epoch": 1.6046002190580504,
"grad_norm": 1.5530831813812256,
"learning_rate": 2e-05,
"loss": 0.1445,
"step": 2930
},
{
"epoch": 1.6100766703176341,
"grad_norm": 1.8088948726654053,
"learning_rate": 2e-05,
"loss": 0.1446,
"step": 2940
},
{
"epoch": 1.6155531215772179,
"grad_norm": 1.5274639129638672,
"learning_rate": 2e-05,
"loss": 0.1453,
"step": 2950
},
{
"epoch": 1.6210295728368016,
"grad_norm": 2.369565963745117,
"learning_rate": 2e-05,
"loss": 0.1487,
"step": 2960
},
{
"epoch": 1.6265060240963856,
"grad_norm": 2.4283454418182373,
"learning_rate": 2e-05,
"loss": 0.1522,
"step": 2970
},
{
"epoch": 1.6319824753559693,
"grad_norm": 4.117255687713623,
"learning_rate": 2e-05,
"loss": 0.1523,
"step": 2980
},
{
"epoch": 1.6374589266155533,
"grad_norm": 2.1403403282165527,
"learning_rate": 2e-05,
"loss": 0.1558,
"step": 2990
},
{
"epoch": 1.642935377875137,
"grad_norm": 3.7226603031158447,
"learning_rate": 2e-05,
"loss": 0.1635,
"step": 3000
},
{
"epoch": 1.6484118291347207,
"grad_norm": 3.3474371433258057,
"learning_rate": 2e-05,
"loss": 0.1543,
"step": 3010
},
{
"epoch": 1.6538882803943045,
"grad_norm": 2.174217700958252,
"learning_rate": 2e-05,
"loss": 0.1474,
"step": 3020
},
{
"epoch": 1.6593647316538882,
"grad_norm": 1.7523736953735352,
"learning_rate": 2e-05,
"loss": 0.1487,
"step": 3030
},
{
"epoch": 1.664841182913472,
"grad_norm": 2.573213577270508,
"learning_rate": 2e-05,
"loss": 0.1566,
"step": 3040
},
{
"epoch": 1.670317634173056,
"grad_norm": 1.8312263488769531,
"learning_rate": 2e-05,
"loss": 0.1524,
"step": 3050
},
{
"epoch": 1.6757940854326396,
"grad_norm": 1.8972638845443726,
"learning_rate": 2e-05,
"loss": 0.1129,
"step": 3060
},
{
"epoch": 1.6812705366922236,
"grad_norm": 2.2399697303771973,
"learning_rate": 2e-05,
"loss": 0.1569,
"step": 3070
},
{
"epoch": 1.6867469879518073,
"grad_norm": 2.9116086959838867,
"learning_rate": 2e-05,
"loss": 0.1531,
"step": 3080
},
{
"epoch": 1.692223439211391,
"grad_norm": 2.098607063293457,
"learning_rate": 2e-05,
"loss": 0.1378,
"step": 3090
},
{
"epoch": 1.6976998904709748,
"grad_norm": 1.720107913017273,
"learning_rate": 2e-05,
"loss": 0.1554,
"step": 3100
},
{
"epoch": 1.7031763417305585,
"grad_norm": 2.0600640773773193,
"learning_rate": 2e-05,
"loss": 0.1541,
"step": 3110
},
{
"epoch": 1.7086527929901423,
"grad_norm": 2.0780065059661865,
"learning_rate": 2e-05,
"loss": 0.1551,
"step": 3120
},
{
"epoch": 1.714129244249726,
"grad_norm": 1.9723634719848633,
"learning_rate": 2e-05,
"loss": 0.1168,
"step": 3130
},
{
"epoch": 1.71960569550931,
"grad_norm": 6.4908552169799805,
"learning_rate": 2e-05,
"loss": 0.1157,
"step": 3140
},
{
"epoch": 1.7250821467688937,
"grad_norm": 2.1401596069335938,
"learning_rate": 2e-05,
"loss": 0.1419,
"step": 3150
},
{
"epoch": 1.7305585980284777,
"grad_norm": 1.883585810661316,
"learning_rate": 2e-05,
"loss": 0.1428,
"step": 3160
},
{
"epoch": 1.7360350492880614,
"grad_norm": 2.2904489040374756,
"learning_rate": 2e-05,
"loss": 0.1382,
"step": 3170
},
{
"epoch": 1.7415115005476451,
"grad_norm": 2.3025336265563965,
"learning_rate": 2e-05,
"loss": 0.2024,
"step": 3180
},
{
"epoch": 1.7469879518072289,
"grad_norm": 1.5613994598388672,
"learning_rate": 2e-05,
"loss": 0.1696,
"step": 3190
},
{
"epoch": 1.7524644030668126,
"grad_norm": 1.7806004285812378,
"learning_rate": 2e-05,
"loss": 0.1474,
"step": 3200
},
{
"epoch": 1.7579408543263964,
"grad_norm": 2.04266095161438,
"learning_rate": 2e-05,
"loss": 0.1537,
"step": 3210
},
{
"epoch": 1.7634173055859803,
"grad_norm": 3.345473527908325,
"learning_rate": 2e-05,
"loss": 0.1411,
"step": 3220
},
{
"epoch": 1.768893756845564,
"grad_norm": 2.1662192344665527,
"learning_rate": 2e-05,
"loss": 0.1536,
"step": 3230
},
{
"epoch": 1.774370208105148,
"grad_norm": 1.1458584070205688,
"learning_rate": 2e-05,
"loss": 0.1642,
"step": 3240
},
{
"epoch": 1.7798466593647317,
"grad_norm": 4.288283824920654,
"learning_rate": 2e-05,
"loss": 0.1419,
"step": 3250
},
{
"epoch": 1.7853231106243155,
"grad_norm": 3.2075963020324707,
"learning_rate": 2e-05,
"loss": 0.1667,
"step": 3260
},
{
"epoch": 1.7907995618838992,
"grad_norm": 2.8897817134857178,
"learning_rate": 2e-05,
"loss": 0.1646,
"step": 3270
},
{
"epoch": 1.796276013143483,
"grad_norm": 2.2969679832458496,
"learning_rate": 2e-05,
"loss": 0.1573,
"step": 3280
},
{
"epoch": 1.8017524644030667,
"grad_norm": 3.1827869415283203,
"learning_rate": 2e-05,
"loss": 0.1366,
"step": 3290
},
{
"epoch": 1.8072289156626506,
"grad_norm": 3.3078675270080566,
"learning_rate": 2e-05,
"loss": 0.1342,
"step": 3300
},
{
"epoch": 1.8127053669222344,
"grad_norm": 0.969814658164978,
"learning_rate": 2e-05,
"loss": 0.1314,
"step": 3310
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.9750161170959473,
"learning_rate": 2e-05,
"loss": 0.1398,
"step": 3320
},
{
"epoch": 1.823658269441402,
"grad_norm": 2.6312105655670166,
"learning_rate": 2e-05,
"loss": 0.1271,
"step": 3330
},
{
"epoch": 1.8291347207009858,
"grad_norm": 5.169326305389404,
"learning_rate": 2e-05,
"loss": 0.136,
"step": 3340
},
{
"epoch": 1.8346111719605696,
"grad_norm": 4.923961639404297,
"learning_rate": 2e-05,
"loss": 0.1516,
"step": 3350
},
{
"epoch": 1.8400876232201533,
"grad_norm": 1.6556754112243652,
"learning_rate": 2e-05,
"loss": 0.1577,
"step": 3360
},
{
"epoch": 1.845564074479737,
"grad_norm": 3.2922916412353516,
"learning_rate": 2e-05,
"loss": 0.1508,
"step": 3370
},
{
"epoch": 1.8510405257393208,
"grad_norm": 1.2395728826522827,
"learning_rate": 2e-05,
"loss": 0.149,
"step": 3380
},
{
"epoch": 1.8565169769989047,
"grad_norm": 1.776043176651001,
"learning_rate": 2e-05,
"loss": 0.1762,
"step": 3390
},
{
"epoch": 1.8619934282584885,
"grad_norm": 3.395716667175293,
"learning_rate": 2e-05,
"loss": 0.1709,
"step": 3400
},
{
"epoch": 1.8674698795180724,
"grad_norm": 3.3589627742767334,
"learning_rate": 2e-05,
"loss": 0.1943,
"step": 3410
},
{
"epoch": 1.8729463307776562,
"grad_norm": 1.2186440229415894,
"learning_rate": 2e-05,
"loss": 0.1607,
"step": 3420
},
{
"epoch": 1.87842278203724,
"grad_norm": 1.260779857635498,
"learning_rate": 2e-05,
"loss": 0.1522,
"step": 3430
},
{
"epoch": 1.8838992332968236,
"grad_norm": 2.699249267578125,
"learning_rate": 2e-05,
"loss": 0.1493,
"step": 3440
},
{
"epoch": 1.8893756845564074,
"grad_norm": 1.9771623611450195,
"learning_rate": 2e-05,
"loss": 0.1485,
"step": 3450
},
{
"epoch": 1.894852135815991,
"grad_norm": 2.270580768585205,
"learning_rate": 2e-05,
"loss": 0.1534,
"step": 3460
},
{
"epoch": 1.900328587075575,
"grad_norm": 1.3207887411117554,
"learning_rate": 2e-05,
"loss": 0.1383,
"step": 3470
},
{
"epoch": 1.9058050383351588,
"grad_norm": 26.00341796875,
"learning_rate": 2e-05,
"loss": 0.1685,
"step": 3480
},
{
"epoch": 1.9112814895947428,
"grad_norm": 2.4248104095458984,
"learning_rate": 2e-05,
"loss": 0.1252,
"step": 3490
},
{
"epoch": 1.9167579408543265,
"grad_norm": 3.160520315170288,
"learning_rate": 2e-05,
"loss": 0.1452,
"step": 3500
},
{
"epoch": 1.9222343921139102,
"grad_norm": 2.528468608856201,
"learning_rate": 2e-05,
"loss": 0.1168,
"step": 3510
},
{
"epoch": 1.927710843373494,
"grad_norm": 1.9054774045944214,
"learning_rate": 2e-05,
"loss": 0.1375,
"step": 3520
},
{
"epoch": 1.9331872946330777,
"grad_norm": 3.4692299365997314,
"learning_rate": 2e-05,
"loss": 0.1652,
"step": 3530
},
{
"epoch": 1.9386637458926614,
"grad_norm": 1.626815915107727,
"learning_rate": 2e-05,
"loss": 0.1231,
"step": 3540
},
{
"epoch": 1.9441401971522454,
"grad_norm": 3.7855207920074463,
"learning_rate": 2e-05,
"loss": 0.1492,
"step": 3550
},
{
"epoch": 1.9496166484118291,
"grad_norm": 1.1849123239517212,
"learning_rate": 2e-05,
"loss": 0.1594,
"step": 3560
},
{
"epoch": 1.9550930996714129,
"grad_norm": 2.7899911403656006,
"learning_rate": 2e-05,
"loss": 0.1601,
"step": 3570
},
{
"epoch": 1.9605695509309968,
"grad_norm": 1.963122010231018,
"learning_rate": 2e-05,
"loss": 0.1301,
"step": 3580
},
{
"epoch": 1.9660460021905806,
"grad_norm": 1.8025850057601929,
"learning_rate": 2e-05,
"loss": 0.156,
"step": 3590
},
{
"epoch": 1.9715224534501643,
"grad_norm": 1.5995118618011475,
"learning_rate": 2e-05,
"loss": 0.1348,
"step": 3600
},
{
"epoch": 1.976998904709748,
"grad_norm": 1.159638524055481,
"learning_rate": 2e-05,
"loss": 0.1288,
"step": 3610
},
{
"epoch": 1.9824753559693318,
"grad_norm": 1.3912004232406616,
"learning_rate": 2e-05,
"loss": 0.1161,
"step": 3620
},
{
"epoch": 1.9879518072289155,
"grad_norm": 1.0395070314407349,
"learning_rate": 2e-05,
"loss": 0.1386,
"step": 3630
},
{
"epoch": 1.9934282584884995,
"grad_norm": 1.532216191291809,
"learning_rate": 2e-05,
"loss": 0.1213,
"step": 3640
},
{
"epoch": 1.9989047097480832,
"grad_norm": 1.4489120244979858,
"learning_rate": 2e-05,
"loss": 0.1123,
"step": 3650
},
{
"epoch": 2.0,
"eval_accuracy": 0.941111445007298,
"eval_f1": 0.8677761381181066,
"eval_loss": 0.15645764768123627,
"eval_precision": 0.819916825171669,
"eval_recall": 0.9215689826977082,
"eval_runtime": 19.7981,
"eval_samples_per_second": 312.404,
"eval_steps_per_second": 19.547,
"step": 3652
},
{
"epoch": 2.004381161007667,
"grad_norm": 1.69236159324646,
"learning_rate": 2e-05,
"loss": 0.1479,
"step": 3660
},
{
"epoch": 2.009857612267251,
"grad_norm": 3.8225982189178467,
"learning_rate": 2e-05,
"loss": 0.1477,
"step": 3670
},
{
"epoch": 2.0153340635268346,
"grad_norm": 4.383903980255127,
"learning_rate": 2e-05,
"loss": 0.1094,
"step": 3680
},
{
"epoch": 2.0208105147864184,
"grad_norm": 3.1119155883789062,
"learning_rate": 2e-05,
"loss": 0.1602,
"step": 3690
},
{
"epoch": 2.026286966046002,
"grad_norm": 3.3700242042541504,
"learning_rate": 2e-05,
"loss": 0.1303,
"step": 3700
},
{
"epoch": 2.031763417305586,
"grad_norm": 1.0470126867294312,
"learning_rate": 2e-05,
"loss": 0.1174,
"step": 3710
},
{
"epoch": 2.0372398685651696,
"grad_norm": 2.912874698638916,
"learning_rate": 2e-05,
"loss": 0.1336,
"step": 3720
},
{
"epoch": 2.0427163198247538,
"grad_norm": 0.8620438575744629,
"learning_rate": 2e-05,
"loss": 0.1112,
"step": 3730
},
{
"epoch": 2.0481927710843375,
"grad_norm": 2.3170716762542725,
"learning_rate": 2e-05,
"loss": 0.1208,
"step": 3740
},
{
"epoch": 2.0536692223439212,
"grad_norm": 1.4915480613708496,
"learning_rate": 2e-05,
"loss": 0.136,
"step": 3750
},
{
"epoch": 2.059145673603505,
"grad_norm": 1.7329208850860596,
"learning_rate": 2e-05,
"loss": 0.1437,
"step": 3760
},
{
"epoch": 2.0646221248630887,
"grad_norm": 1.5879555940628052,
"learning_rate": 2e-05,
"loss": 0.1338,
"step": 3770
},
{
"epoch": 2.0700985761226725,
"grad_norm": 0.7418123483657837,
"learning_rate": 2e-05,
"loss": 0.1013,
"step": 3780
},
{
"epoch": 2.075575027382256,
"grad_norm": 1.0119812488555908,
"learning_rate": 2e-05,
"loss": 0.0841,
"step": 3790
},
{
"epoch": 2.08105147864184,
"grad_norm": 1.383432149887085,
"learning_rate": 2e-05,
"loss": 0.1212,
"step": 3800
},
{
"epoch": 2.0865279299014237,
"grad_norm": 2.614387273788452,
"learning_rate": 2e-05,
"loss": 0.1228,
"step": 3810
},
{
"epoch": 2.092004381161008,
"grad_norm": 2.6762051582336426,
"learning_rate": 2e-05,
"loss": 0.1491,
"step": 3820
},
{
"epoch": 2.0974808324205916,
"grad_norm": 3.3792619705200195,
"learning_rate": 2e-05,
"loss": 0.1161,
"step": 3830
},
{
"epoch": 2.1029572836801753,
"grad_norm": 2.690113067626953,
"learning_rate": 2e-05,
"loss": 0.1093,
"step": 3840
},
{
"epoch": 2.108433734939759,
"grad_norm": 1.5759937763214111,
"learning_rate": 2e-05,
"loss": 0.1406,
"step": 3850
},
{
"epoch": 2.113910186199343,
"grad_norm": 1.4909275770187378,
"learning_rate": 2e-05,
"loss": 0.1108,
"step": 3860
},
{
"epoch": 2.1193866374589265,
"grad_norm": 2.6127500534057617,
"learning_rate": 2e-05,
"loss": 0.1269,
"step": 3870
},
{
"epoch": 2.1248630887185103,
"grad_norm": 2.5836493968963623,
"learning_rate": 2e-05,
"loss": 0.1396,
"step": 3880
},
{
"epoch": 2.130339539978094,
"grad_norm": 1.385608434677124,
"learning_rate": 2e-05,
"loss": 0.127,
"step": 3890
},
{
"epoch": 2.135815991237678,
"grad_norm": 3.3218297958374023,
"learning_rate": 2e-05,
"loss": 0.1056,
"step": 3900
},
{
"epoch": 2.141292442497262,
"grad_norm": 1.8507598638534546,
"learning_rate": 2e-05,
"loss": 0.1453,
"step": 3910
},
{
"epoch": 2.1467688937568457,
"grad_norm": 3.654327630996704,
"learning_rate": 2e-05,
"loss": 0.122,
"step": 3920
},
{
"epoch": 2.1522453450164294,
"grad_norm": 3.592478036880493,
"learning_rate": 2e-05,
"loss": 0.1245,
"step": 3930
},
{
"epoch": 2.157721796276013,
"grad_norm": 3.7161383628845215,
"learning_rate": 2e-05,
"loss": 0.1126,
"step": 3940
},
{
"epoch": 2.163198247535597,
"grad_norm": 2.2989351749420166,
"learning_rate": 2e-05,
"loss": 0.0944,
"step": 3950
},
{
"epoch": 2.1686746987951806,
"grad_norm": 2.9460718631744385,
"learning_rate": 2e-05,
"loss": 0.126,
"step": 3960
},
{
"epoch": 2.1741511500547643,
"grad_norm": 3.1067349910736084,
"learning_rate": 2e-05,
"loss": 0.1436,
"step": 3970
},
{
"epoch": 2.1796276013143485,
"grad_norm": 2.155015230178833,
"learning_rate": 2e-05,
"loss": 0.1033,
"step": 3980
},
{
"epoch": 2.1851040525739323,
"grad_norm": 2.9963104724884033,
"learning_rate": 2e-05,
"loss": 0.1443,
"step": 3990
},
{
"epoch": 2.190580503833516,
"grad_norm": 1.293370246887207,
"learning_rate": 2e-05,
"loss": 0.1093,
"step": 4000
},
{
"epoch": 2.1960569550930997,
"grad_norm": 1.3873592615127563,
"learning_rate": 2e-05,
"loss": 0.1139,
"step": 4010
},
{
"epoch": 2.2015334063526835,
"grad_norm": 1.8804830312728882,
"learning_rate": 2e-05,
"loss": 0.1554,
"step": 4020
},
{
"epoch": 2.207009857612267,
"grad_norm": 4.313164710998535,
"learning_rate": 2e-05,
"loss": 0.1129,
"step": 4030
},
{
"epoch": 2.212486308871851,
"grad_norm": 2.9426050186157227,
"learning_rate": 2e-05,
"loss": 0.1334,
"step": 4040
},
{
"epoch": 2.2179627601314347,
"grad_norm": 2.560018539428711,
"learning_rate": 2e-05,
"loss": 0.1492,
"step": 4050
},
{
"epoch": 2.2234392113910184,
"grad_norm": 1.6301517486572266,
"learning_rate": 2e-05,
"loss": 0.1308,
"step": 4060
},
{
"epoch": 2.2289156626506026,
"grad_norm": 1.1607255935668945,
"learning_rate": 2e-05,
"loss": 0.1374,
"step": 4070
},
{
"epoch": 2.2343921139101863,
"grad_norm": 4.422305107116699,
"learning_rate": 2e-05,
"loss": 0.1375,
"step": 4080
},
{
"epoch": 2.23986856516977,
"grad_norm": 3.9398353099823,
"learning_rate": 2e-05,
"loss": 0.1526,
"step": 4090
},
{
"epoch": 2.245345016429354,
"grad_norm": 4.186077117919922,
"learning_rate": 2e-05,
"loss": 0.1117,
"step": 4100
},
{
"epoch": 2.2508214676889375,
"grad_norm": 3.083814859390259,
"learning_rate": 2e-05,
"loss": 0.1273,
"step": 4110
},
{
"epoch": 2.2562979189485213,
"grad_norm": 1.9174625873565674,
"learning_rate": 2e-05,
"loss": 0.1036,
"step": 4120
},
{
"epoch": 2.261774370208105,
"grad_norm": 1.3200234174728394,
"learning_rate": 2e-05,
"loss": 0.1246,
"step": 4130
},
{
"epoch": 2.2672508214676887,
"grad_norm": 1.504086971282959,
"learning_rate": 2e-05,
"loss": 0.0944,
"step": 4140
},
{
"epoch": 2.2727272727272725,
"grad_norm": 2.5579471588134766,
"learning_rate": 2e-05,
"loss": 0.1242,
"step": 4150
},
{
"epoch": 2.2782037239868567,
"grad_norm": 2.304062843322754,
"learning_rate": 2e-05,
"loss": 0.1543,
"step": 4160
},
{
"epoch": 2.2836801752464404,
"grad_norm": 1.507938265800476,
"learning_rate": 2e-05,
"loss": 0.1277,
"step": 4170
},
{
"epoch": 2.289156626506024,
"grad_norm": 4.3036346435546875,
"learning_rate": 2e-05,
"loss": 0.1311,
"step": 4180
},
{
"epoch": 2.294633077765608,
"grad_norm": 1.80647873878479,
"learning_rate": 2e-05,
"loss": 0.1403,
"step": 4190
},
{
"epoch": 2.3001095290251916,
"grad_norm": 2.770962715148926,
"learning_rate": 2e-05,
"loss": 0.1521,
"step": 4200
},
{
"epoch": 2.3055859802847753,
"grad_norm": 2.768677234649658,
"learning_rate": 2e-05,
"loss": 0.1314,
"step": 4210
},
{
"epoch": 2.311062431544359,
"grad_norm": 1.7572500705718994,
"learning_rate": 2e-05,
"loss": 0.122,
"step": 4220
},
{
"epoch": 2.3165388828039433,
"grad_norm": 1.1709873676300049,
"learning_rate": 2e-05,
"loss": 0.1236,
"step": 4230
},
{
"epoch": 2.322015334063527,
"grad_norm": 17.13128089904785,
"learning_rate": 2e-05,
"loss": 0.1509,
"step": 4240
},
{
"epoch": 2.3274917853231107,
"grad_norm": 2.660583019256592,
"learning_rate": 2e-05,
"loss": 0.1326,
"step": 4250
},
{
"epoch": 2.3329682365826945,
"grad_norm": 1.643479347229004,
"learning_rate": 2e-05,
"loss": 0.1179,
"step": 4260
},
{
"epoch": 2.338444687842278,
"grad_norm": 6.019737720489502,
"learning_rate": 2e-05,
"loss": 0.0953,
"step": 4270
},
{
"epoch": 2.343921139101862,
"grad_norm": 1.986523985862732,
"learning_rate": 2e-05,
"loss": 0.1048,
"step": 4280
},
{
"epoch": 2.3493975903614457,
"grad_norm": 1.796851634979248,
"learning_rate": 2e-05,
"loss": 0.115,
"step": 4290
},
{
"epoch": 2.3548740416210294,
"grad_norm": 1.2359439134597778,
"learning_rate": 2e-05,
"loss": 0.1174,
"step": 4300
},
{
"epoch": 2.360350492880613,
"grad_norm": 3.057445526123047,
"learning_rate": 2e-05,
"loss": 0.1458,
"step": 4310
},
{
"epoch": 2.3658269441401973,
"grad_norm": 1.0161036252975464,
"learning_rate": 2e-05,
"loss": 0.1437,
"step": 4320
},
{
"epoch": 2.371303395399781,
"grad_norm": 1.2098288536071777,
"learning_rate": 2e-05,
"loss": 0.1379,
"step": 4330
},
{
"epoch": 2.376779846659365,
"grad_norm": 1.4055923223495483,
"learning_rate": 2e-05,
"loss": 0.1199,
"step": 4340
},
{
"epoch": 2.3822562979189486,
"grad_norm": 2.134941816329956,
"learning_rate": 2e-05,
"loss": 0.1303,
"step": 4350
},
{
"epoch": 2.3877327491785323,
"grad_norm": 2.351625680923462,
"learning_rate": 2e-05,
"loss": 0.1278,
"step": 4360
},
{
"epoch": 2.393209200438116,
"grad_norm": 3.273850679397583,
"learning_rate": 2e-05,
"loss": 0.11,
"step": 4370
},
{
"epoch": 2.3986856516976998,
"grad_norm": 2.0896518230438232,
"learning_rate": 2e-05,
"loss": 0.1193,
"step": 4380
},
{
"epoch": 2.4041621029572835,
"grad_norm": 3.240591287612915,
"learning_rate": 2e-05,
"loss": 0.139,
"step": 4390
},
{
"epoch": 2.4096385542168672,
"grad_norm": 4.579762935638428,
"learning_rate": 2e-05,
"loss": 0.1417,
"step": 4400
},
{
"epoch": 2.4151150054764514,
"grad_norm": 2.048832654953003,
"learning_rate": 2e-05,
"loss": 0.1176,
"step": 4410
},
{
"epoch": 2.420591456736035,
"grad_norm": 1.8488651514053345,
"learning_rate": 2e-05,
"loss": 0.1408,
"step": 4420
},
{
"epoch": 2.426067907995619,
"grad_norm": 1.4034713506698608,
"learning_rate": 2e-05,
"loss": 0.1246,
"step": 4430
},
{
"epoch": 2.4315443592552026,
"grad_norm": 1.0171767473220825,
"learning_rate": 2e-05,
"loss": 0.1118,
"step": 4440
},
{
"epoch": 2.4370208105147864,
"grad_norm": 4.190380573272705,
"learning_rate": 2e-05,
"loss": 0.1527,
"step": 4450
},
{
"epoch": 2.44249726177437,
"grad_norm": 3.9857051372528076,
"learning_rate": 2e-05,
"loss": 0.1153,
"step": 4460
},
{
"epoch": 2.447973713033954,
"grad_norm": 1.4358816146850586,
"learning_rate": 2e-05,
"loss": 0.0983,
"step": 4470
},
{
"epoch": 2.453450164293538,
"grad_norm": 1.8944737911224365,
"learning_rate": 2e-05,
"loss": 0.1382,
"step": 4480
},
{
"epoch": 2.4589266155531218,
"grad_norm": 1.8662302494049072,
"learning_rate": 2e-05,
"loss": 0.1564,
"step": 4490
},
{
"epoch": 2.4644030668127055,
"grad_norm": 1.050307035446167,
"learning_rate": 2e-05,
"loss": 0.1111,
"step": 4500
},
{
"epoch": 2.4698795180722892,
"grad_norm": 1.3058151006698608,
"learning_rate": 2e-05,
"loss": 0.1526,
"step": 4510
},
{
"epoch": 2.475355969331873,
"grad_norm": 2.779019355773926,
"learning_rate": 2e-05,
"loss": 0.1013,
"step": 4520
},
{
"epoch": 2.4808324205914567,
"grad_norm": 3.649847984313965,
"learning_rate": 2e-05,
"loss": 0.1039,
"step": 4530
},
{
"epoch": 2.4863088718510404,
"grad_norm": 1.4723719358444214,
"learning_rate": 2e-05,
"loss": 0.1304,
"step": 4540
},
{
"epoch": 2.491785323110624,
"grad_norm": 2.51281476020813,
"learning_rate": 2e-05,
"loss": 0.1067,
"step": 4550
},
{
"epoch": 2.497261774370208,
"grad_norm": 3.2945971488952637,
"learning_rate": 2e-05,
"loss": 0.1354,
"step": 4560
},
{
"epoch": 2.502738225629792,
"grad_norm": 3.16933536529541,
"learning_rate": 2e-05,
"loss": 0.127,
"step": 4570
},
{
"epoch": 2.508214676889376,
"grad_norm": 1.2082220315933228,
"learning_rate": 2e-05,
"loss": 0.1296,
"step": 4580
},
{
"epoch": 2.5136911281489596,
"grad_norm": 4.029638767242432,
"learning_rate": 2e-05,
"loss": 0.1363,
"step": 4590
},
{
"epoch": 2.5191675794085433,
"grad_norm": 1.5362796783447266,
"learning_rate": 2e-05,
"loss": 0.1275,
"step": 4600
},
{
"epoch": 2.524644030668127,
"grad_norm": 2.8110194206237793,
"learning_rate": 2e-05,
"loss": 0.1401,
"step": 4610
},
{
"epoch": 2.5301204819277108,
"grad_norm": 1.6804673671722412,
"learning_rate": 2e-05,
"loss": 0.1448,
"step": 4620
},
{
"epoch": 2.5355969331872945,
"grad_norm": 7.145838260650635,
"learning_rate": 2e-05,
"loss": 0.1305,
"step": 4630
},
{
"epoch": 2.5410733844468787,
"grad_norm": 2.6446447372436523,
"learning_rate": 2e-05,
"loss": 0.1242,
"step": 4640
},
{
"epoch": 2.546549835706462,
"grad_norm": 1.742530107498169,
"learning_rate": 2e-05,
"loss": 0.1151,
"step": 4650
},
{
"epoch": 2.552026286966046,
"grad_norm": 2.4224681854248047,
"learning_rate": 2e-05,
"loss": 0.1499,
"step": 4660
},
{
"epoch": 2.55750273822563,
"grad_norm": 2.849701404571533,
"learning_rate": 2e-05,
"loss": 0.1365,
"step": 4670
},
{
"epoch": 2.5629791894852136,
"grad_norm": 5.31744384765625,
"learning_rate": 2e-05,
"loss": 0.1308,
"step": 4680
},
{
"epoch": 2.5684556407447974,
"grad_norm": 1.5912376642227173,
"learning_rate": 2e-05,
"loss": 0.1551,
"step": 4690
},
{
"epoch": 2.573932092004381,
"grad_norm": 1.0725617408752441,
"learning_rate": 2e-05,
"loss": 0.1279,
"step": 4700
},
{
"epoch": 2.579408543263965,
"grad_norm": 3.9630486965179443,
"learning_rate": 2e-05,
"loss": 0.1158,
"step": 4710
},
{
"epoch": 2.5848849945235486,
"grad_norm": 1.6319750547409058,
"learning_rate": 2e-05,
"loss": 0.1576,
"step": 4720
},
{
"epoch": 2.5903614457831328,
"grad_norm": 4.792392253875732,
"learning_rate": 2e-05,
"loss": 0.1598,
"step": 4730
},
{
"epoch": 2.595837897042716,
"grad_norm": 2.365724563598633,
"learning_rate": 2e-05,
"loss": 0.1189,
"step": 4740
},
{
"epoch": 2.6013143483023002,
"grad_norm": 4.436146259307861,
"learning_rate": 2e-05,
"loss": 0.1286,
"step": 4750
},
{
"epoch": 2.606790799561884,
"grad_norm": 1.227371096611023,
"learning_rate": 2e-05,
"loss": 0.1465,
"step": 4760
},
{
"epoch": 2.6122672508214677,
"grad_norm": 1.3392515182495117,
"learning_rate": 2e-05,
"loss": 0.1305,
"step": 4770
},
{
"epoch": 2.6177437020810514,
"grad_norm": 2.5968856811523438,
"learning_rate": 2e-05,
"loss": 0.1089,
"step": 4780
},
{
"epoch": 2.623220153340635,
"grad_norm": 2.4093873500823975,
"learning_rate": 2e-05,
"loss": 0.139,
"step": 4790
},
{
"epoch": 2.628696604600219,
"grad_norm": 2.9262475967407227,
"learning_rate": 2e-05,
"loss": 0.1138,
"step": 4800
},
{
"epoch": 2.6341730558598027,
"grad_norm": 1.900516152381897,
"learning_rate": 2e-05,
"loss": 0.0951,
"step": 4810
},
{
"epoch": 2.639649507119387,
"grad_norm": 1.9894856214523315,
"learning_rate": 2e-05,
"loss": 0.184,
"step": 4820
},
{
"epoch": 2.6451259583789706,
"grad_norm": 0.8725138306617737,
"learning_rate": 2e-05,
"loss": 0.1043,
"step": 4830
},
{
"epoch": 2.6506024096385543,
"grad_norm": 2.6368794441223145,
"learning_rate": 2e-05,
"loss": 0.1496,
"step": 4840
},
{
"epoch": 2.656078860898138,
"grad_norm": 2.7605247497558594,
"learning_rate": 2e-05,
"loss": 0.1188,
"step": 4850
},
{
"epoch": 2.661555312157722,
"grad_norm": 1.7536276578903198,
"learning_rate": 2e-05,
"loss": 0.12,
"step": 4860
},
{
"epoch": 2.6670317634173055,
"grad_norm": 2.9888434410095215,
"learning_rate": 2e-05,
"loss": 0.1251,
"step": 4870
},
{
"epoch": 2.6725082146768893,
"grad_norm": 2.501553535461426,
"learning_rate": 2e-05,
"loss": 0.1317,
"step": 4880
},
{
"epoch": 2.6779846659364734,
"grad_norm": 3.2755653858184814,
"learning_rate": 2e-05,
"loss": 0.1431,
"step": 4890
},
{
"epoch": 2.6834611171960567,
"grad_norm": 1.1044738292694092,
"learning_rate": 2e-05,
"loss": 0.1177,
"step": 4900
},
{
"epoch": 2.688937568455641,
"grad_norm": 2.04195237159729,
"learning_rate": 2e-05,
"loss": 0.1385,
"step": 4910
},
{
"epoch": 2.6944140197152247,
"grad_norm": 1.8423049449920654,
"learning_rate": 2e-05,
"loss": 0.1524,
"step": 4920
},
{
"epoch": 2.6998904709748084,
"grad_norm": 3.017038345336914,
"learning_rate": 2e-05,
"loss": 0.1309,
"step": 4930
},
{
"epoch": 2.705366922234392,
"grad_norm": 2.5917625427246094,
"learning_rate": 2e-05,
"loss": 0.0968,
"step": 4940
},
{
"epoch": 2.710843373493976,
"grad_norm": 2.059396266937256,
"learning_rate": 2e-05,
"loss": 0.136,
"step": 4950
},
{
"epoch": 2.7163198247535596,
"grad_norm": 2.7622623443603516,
"learning_rate": 2e-05,
"loss": 0.1344,
"step": 4960
},
{
"epoch": 2.7217962760131433,
"grad_norm": 2.050316333770752,
"learning_rate": 2e-05,
"loss": 0.139,
"step": 4970
},
{
"epoch": 2.7272727272727275,
"grad_norm": 2.0974202156066895,
"learning_rate": 2e-05,
"loss": 0.133,
"step": 4980
},
{
"epoch": 2.732749178532311,
"grad_norm": 4.3153533935546875,
"learning_rate": 2e-05,
"loss": 0.1513,
"step": 4990
},
{
"epoch": 2.738225629791895,
"grad_norm": 1.2816858291625977,
"learning_rate": 2e-05,
"loss": 0.1581,
"step": 5000
},
{
"epoch": 2.7437020810514787,
"grad_norm": 3.44547176361084,
"learning_rate": 2e-05,
"loss": 0.1349,
"step": 5010
},
{
"epoch": 2.7491785323110625,
"grad_norm": 1.2258213758468628,
"learning_rate": 2e-05,
"loss": 0.1412,
"step": 5020
},
{
"epoch": 2.754654983570646,
"grad_norm": 2.301241159439087,
"learning_rate": 2e-05,
"loss": 0.1013,
"step": 5030
},
{
"epoch": 2.76013143483023,
"grad_norm": 2.467775821685791,
"learning_rate": 2e-05,
"loss": 0.1501,
"step": 5040
},
{
"epoch": 2.7656078860898137,
"grad_norm": 7.51368522644043,
"learning_rate": 2e-05,
"loss": 0.1378,
"step": 5050
},
{
"epoch": 2.7710843373493974,
"grad_norm": 4.798404693603516,
"learning_rate": 2e-05,
"loss": 0.1305,
"step": 5060
},
{
"epoch": 2.7765607886089816,
"grad_norm": 1.396974802017212,
"learning_rate": 2e-05,
"loss": 0.1043,
"step": 5070
},
{
"epoch": 2.7820372398685653,
"grad_norm": 1.14879310131073,
"learning_rate": 2e-05,
"loss": 0.12,
"step": 5080
},
{
"epoch": 2.787513691128149,
"grad_norm": 1.2579361200332642,
"learning_rate": 2e-05,
"loss": 0.1501,
"step": 5090
},
{
"epoch": 2.792990142387733,
"grad_norm": 2.2348382472991943,
"learning_rate": 2e-05,
"loss": 0.157,
"step": 5100
},
{
"epoch": 2.7984665936473165,
"grad_norm": 1.5609731674194336,
"learning_rate": 2e-05,
"loss": 0.135,
"step": 5110
},
{
"epoch": 2.8039430449069003,
"grad_norm": 2.0120913982391357,
"learning_rate": 2e-05,
"loss": 0.1123,
"step": 5120
},
{
"epoch": 2.809419496166484,
"grad_norm": 2.593989610671997,
"learning_rate": 2e-05,
"loss": 0.1142,
"step": 5130
},
{
"epoch": 2.8148959474260677,
"grad_norm": 1.574157953262329,
"learning_rate": 2e-05,
"loss": 0.1701,
"step": 5140
},
{
"epoch": 2.8203723986856515,
"grad_norm": 3.661393642425537,
"learning_rate": 2e-05,
"loss": 0.1311,
"step": 5150
},
{
"epoch": 2.8258488499452357,
"grad_norm": 0.9907870888710022,
"learning_rate": 2e-05,
"loss": 0.1421,
"step": 5160
},
{
"epoch": 2.8313253012048194,
"grad_norm": 2.06791615486145,
"learning_rate": 2e-05,
"loss": 0.1353,
"step": 5170
},
{
"epoch": 2.836801752464403,
"grad_norm": 3.9285051822662354,
"learning_rate": 2e-05,
"loss": 0.1373,
"step": 5180
},
{
"epoch": 2.842278203723987,
"grad_norm": 1.7791670560836792,
"learning_rate": 2e-05,
"loss": 0.1018,
"step": 5190
},
{
"epoch": 2.8477546549835706,
"grad_norm": 1.8296700716018677,
"learning_rate": 2e-05,
"loss": 0.1247,
"step": 5200
},
{
"epoch": 2.8532311062431543,
"grad_norm": 4.70639181137085,
"learning_rate": 2e-05,
"loss": 0.1314,
"step": 5210
},
{
"epoch": 2.858707557502738,
"grad_norm": 2.3807830810546875,
"learning_rate": 2e-05,
"loss": 0.1356,
"step": 5220
},
{
"epoch": 2.8641840087623223,
"grad_norm": 1.6014913320541382,
"learning_rate": 2e-05,
"loss": 0.1146,
"step": 5230
},
{
"epoch": 2.8696604600219056,
"grad_norm": 1.8272550106048584,
"learning_rate": 2e-05,
"loss": 0.1183,
"step": 5240
},
{
"epoch": 2.8751369112814897,
"grad_norm": 1.299852728843689,
"learning_rate": 2e-05,
"loss": 0.1405,
"step": 5250
},
{
"epoch": 2.8806133625410735,
"grad_norm": 1.1718955039978027,
"learning_rate": 2e-05,
"loss": 0.1461,
"step": 5260
},
{
"epoch": 2.886089813800657,
"grad_norm": 5.982026100158691,
"learning_rate": 2e-05,
"loss": 0.1539,
"step": 5270
},
{
"epoch": 2.891566265060241,
"grad_norm": 3.6748552322387695,
"learning_rate": 2e-05,
"loss": 0.1164,
"step": 5280
},
{
"epoch": 2.8970427163198247,
"grad_norm": 2.9227006435394287,
"learning_rate": 2e-05,
"loss": 0.1557,
"step": 5290
},
{
"epoch": 2.9025191675794084,
"grad_norm": 1.4735232591629028,
"learning_rate": 2e-05,
"loss": 0.1168,
"step": 5300
},
{
"epoch": 2.907995618838992,
"grad_norm": 3.212067127227783,
"learning_rate": 2e-05,
"loss": 0.1489,
"step": 5310
},
{
"epoch": 2.9134720700985763,
"grad_norm": 1.4650025367736816,
"learning_rate": 2e-05,
"loss": 0.1131,
"step": 5320
},
{
"epoch": 2.91894852135816,
"grad_norm": 4.302365779876709,
"learning_rate": 2e-05,
"loss": 0.1705,
"step": 5330
},
{
"epoch": 2.924424972617744,
"grad_norm": 2.2288858890533447,
"learning_rate": 2e-05,
"loss": 0.1485,
"step": 5340
},
{
"epoch": 2.9299014238773275,
"grad_norm": 1.019721508026123,
"learning_rate": 2e-05,
"loss": 0.1331,
"step": 5350
},
{
"epoch": 2.9353778751369113,
"grad_norm": 2.5705177783966064,
"learning_rate": 2e-05,
"loss": 0.122,
"step": 5360
},
{
"epoch": 2.940854326396495,
"grad_norm": 1.639599084854126,
"learning_rate": 2e-05,
"loss": 0.1218,
"step": 5370
},
{
"epoch": 2.9463307776560788,
"grad_norm": 3.7570650577545166,
"learning_rate": 2e-05,
"loss": 0.1491,
"step": 5380
},
{
"epoch": 2.9518072289156625,
"grad_norm": 3.499650716781616,
"learning_rate": 2e-05,
"loss": 0.1335,
"step": 5390
},
{
"epoch": 2.9572836801752462,
"grad_norm": 2.9174211025238037,
"learning_rate": 2e-05,
"loss": 0.1468,
"step": 5400
},
{
"epoch": 2.9627601314348304,
"grad_norm": 3.045962333679199,
"learning_rate": 2e-05,
"loss": 0.1283,
"step": 5410
},
{
"epoch": 2.968236582694414,
"grad_norm": 2.5407910346984863,
"learning_rate": 2e-05,
"loss": 0.1171,
"step": 5420
},
{
"epoch": 2.973713033953998,
"grad_norm": 2.07804274559021,
"learning_rate": 2e-05,
"loss": 0.1127,
"step": 5430
},
{
"epoch": 2.9791894852135816,
"grad_norm": 2.2112016677856445,
"learning_rate": 2e-05,
"loss": 0.1249,
"step": 5440
},
{
"epoch": 2.9846659364731654,
"grad_norm": 4.099008083343506,
"learning_rate": 2e-05,
"loss": 0.141,
"step": 5450
},
{
"epoch": 2.990142387732749,
"grad_norm": 1.716291069984436,
"learning_rate": 2e-05,
"loss": 0.1102,
"step": 5460
},
{
"epoch": 2.995618838992333,
"grad_norm": 3.4466896057128906,
"learning_rate": 2e-05,
"loss": 0.1283,
"step": 5470
}
],
"logging_steps": 10,
"max_steps": 5478,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7443682033477008.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}