nsfk-detection / trainer_state.json
yasserrmd's picture
Add fine-tuned ModernBERT model
5383e56 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006715916722632639,
"grad_norm": 5.630690574645996,
"learning_rate": 9e-07,
"loss": 0.7133,
"step": 10
},
{
"epoch": 0.013431833445265278,
"grad_norm": 4.237407207489014,
"learning_rate": 1.9e-06,
"loss": 0.6472,
"step": 20
},
{
"epoch": 0.020147750167897917,
"grad_norm": 6.263025760650635,
"learning_rate": 2.9e-06,
"loss": 0.5851,
"step": 30
},
{
"epoch": 0.026863666890530557,
"grad_norm": 10.21045207977295,
"learning_rate": 3.9e-06,
"loss": 0.5366,
"step": 40
},
{
"epoch": 0.0335795836131632,
"grad_norm": 3.691526174545288,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.4325,
"step": 50
},
{
"epoch": 0.040295500335795834,
"grad_norm": 5.201944351196289,
"learning_rate": 5.9e-06,
"loss": 0.354,
"step": 60
},
{
"epoch": 0.04701141705842848,
"grad_norm": 15.92503547668457,
"learning_rate": 6.900000000000001e-06,
"loss": 0.2003,
"step": 70
},
{
"epoch": 0.053727333781061114,
"grad_norm": 7.618583679199219,
"learning_rate": 7.9e-06,
"loss": 0.2309,
"step": 80
},
{
"epoch": 0.06044325050369376,
"grad_norm": 4.151483535766602,
"learning_rate": 8.9e-06,
"loss": 0.1384,
"step": 90
},
{
"epoch": 0.0671591672263264,
"grad_norm": 9.921517372131348,
"learning_rate": 9.900000000000002e-06,
"loss": 0.1763,
"step": 100
},
{
"epoch": 0.07387508394895903,
"grad_norm": 2.020857810974121,
"learning_rate": 1.09e-05,
"loss": 0.1451,
"step": 110
},
{
"epoch": 0.08059100067159167,
"grad_norm": 1.8496060371398926,
"learning_rate": 1.19e-05,
"loss": 0.1004,
"step": 120
},
{
"epoch": 0.08730691739422432,
"grad_norm": 9.236139297485352,
"learning_rate": 1.29e-05,
"loss": 0.1879,
"step": 130
},
{
"epoch": 0.09402283411685695,
"grad_norm": 10.885250091552734,
"learning_rate": 1.3900000000000002e-05,
"loss": 0.1709,
"step": 140
},
{
"epoch": 0.10073875083948959,
"grad_norm": 18.184890747070312,
"learning_rate": 1.49e-05,
"loss": 0.1324,
"step": 150
},
{
"epoch": 0.10745466756212223,
"grad_norm": 6.792922019958496,
"learning_rate": 1.59e-05,
"loss": 0.1485,
"step": 160
},
{
"epoch": 0.11417058428475486,
"grad_norm": 2.8304738998413086,
"learning_rate": 1.69e-05,
"loss": 0.0533,
"step": 170
},
{
"epoch": 0.12088650100738751,
"grad_norm": 0.03287811204791069,
"learning_rate": 1.79e-05,
"loss": 0.1616,
"step": 180
},
{
"epoch": 0.12760241773002015,
"grad_norm": 4.168348789215088,
"learning_rate": 1.8900000000000002e-05,
"loss": 0.0667,
"step": 190
},
{
"epoch": 0.1343183344526528,
"grad_norm": 0.9931334257125854,
"learning_rate": 1.9900000000000003e-05,
"loss": 0.1678,
"step": 200
},
{
"epoch": 0.14103425117528542,
"grad_norm": 14.898422241210938,
"learning_rate": 2.09e-05,
"loss": 0.1274,
"step": 210
},
{
"epoch": 0.14775016789791806,
"grad_norm": 2.7615175247192383,
"learning_rate": 2.19e-05,
"loss": 0.1324,
"step": 220
},
{
"epoch": 0.1544660846205507,
"grad_norm": 0.19374199211597443,
"learning_rate": 2.29e-05,
"loss": 0.2972,
"step": 230
},
{
"epoch": 0.16118200134318333,
"grad_norm": 6.9305419921875,
"learning_rate": 2.39e-05,
"loss": 0.0863,
"step": 240
},
{
"epoch": 0.16789791806581597,
"grad_norm": 8.841792106628418,
"learning_rate": 2.4900000000000002e-05,
"loss": 0.1672,
"step": 250
},
{
"epoch": 0.17461383478844864,
"grad_norm": 0.10736802220344543,
"learning_rate": 2.5900000000000003e-05,
"loss": 0.1535,
"step": 260
},
{
"epoch": 0.18132975151108127,
"grad_norm": 9.984793663024902,
"learning_rate": 2.6900000000000003e-05,
"loss": 0.0592,
"step": 270
},
{
"epoch": 0.1880456682337139,
"grad_norm": 17.420595169067383,
"learning_rate": 2.7900000000000004e-05,
"loss": 0.2605,
"step": 280
},
{
"epoch": 0.19476158495634655,
"grad_norm": 2.0211360454559326,
"learning_rate": 2.8899999999999998e-05,
"loss": 0.0581,
"step": 290
},
{
"epoch": 0.20147750167897918,
"grad_norm": 4.574538707733154,
"learning_rate": 2.9900000000000002e-05,
"loss": 0.1315,
"step": 300
},
{
"epoch": 0.20819341840161182,
"grad_norm": 0.37417200207710266,
"learning_rate": 3.09e-05,
"loss": 0.0863,
"step": 310
},
{
"epoch": 0.21490933512424445,
"grad_norm": 0.08471789956092834,
"learning_rate": 3.19e-05,
"loss": 0.0509,
"step": 320
},
{
"epoch": 0.2216252518468771,
"grad_norm": 2.391404151916504,
"learning_rate": 3.29e-05,
"loss": 0.0897,
"step": 330
},
{
"epoch": 0.22834116856950973,
"grad_norm": 0.2534968852996826,
"learning_rate": 3.3900000000000004e-05,
"loss": 0.1236,
"step": 340
},
{
"epoch": 0.23505708529214236,
"grad_norm": 0.9776629209518433,
"learning_rate": 3.49e-05,
"loss": 0.1281,
"step": 350
},
{
"epoch": 0.24177300201477503,
"grad_norm": 2.469677209854126,
"learning_rate": 3.59e-05,
"loss": 0.2372,
"step": 360
},
{
"epoch": 0.24848891873740767,
"grad_norm": 0.3327536880970001,
"learning_rate": 3.69e-05,
"loss": 0.083,
"step": 370
},
{
"epoch": 0.2552048354600403,
"grad_norm": 10.457483291625977,
"learning_rate": 3.79e-05,
"loss": 0.1516,
"step": 380
},
{
"epoch": 0.2619207521826729,
"grad_norm": 0.4218825697898865,
"learning_rate": 3.8900000000000004e-05,
"loss": 0.137,
"step": 390
},
{
"epoch": 0.2686366689053056,
"grad_norm": 4.134459495544434,
"learning_rate": 3.99e-05,
"loss": 0.1811,
"step": 400
},
{
"epoch": 0.27535258562793824,
"grad_norm": 0.6121009588241577,
"learning_rate": 4.09e-05,
"loss": 0.0851,
"step": 410
},
{
"epoch": 0.28206850235057085,
"grad_norm": 0.04361363872885704,
"learning_rate": 4.19e-05,
"loss": 0.0387,
"step": 420
},
{
"epoch": 0.2887844190732035,
"grad_norm": 109.2441635131836,
"learning_rate": 4.29e-05,
"loss": 0.1153,
"step": 430
},
{
"epoch": 0.2955003357958361,
"grad_norm": 0.012662663124501705,
"learning_rate": 4.39e-05,
"loss": 0.251,
"step": 440
},
{
"epoch": 0.3022162525184688,
"grad_norm": 0.8022059202194214,
"learning_rate": 4.49e-05,
"loss": 0.184,
"step": 450
},
{
"epoch": 0.3089321692411014,
"grad_norm": 18.51458740234375,
"learning_rate": 4.5900000000000004e-05,
"loss": 0.1241,
"step": 460
},
{
"epoch": 0.31564808596373406,
"grad_norm": 2.7725183963775635,
"learning_rate": 4.69e-05,
"loss": 0.1641,
"step": 470
},
{
"epoch": 0.32236400268636667,
"grad_norm": 0.11448180675506592,
"learning_rate": 4.79e-05,
"loss": 0.0613,
"step": 480
},
{
"epoch": 0.32907991940899933,
"grad_norm": 0.0008866311982274055,
"learning_rate": 4.89e-05,
"loss": 0.0064,
"step": 490
},
{
"epoch": 0.33579583613163194,
"grad_norm": 3.4407784938812256,
"learning_rate": 4.99e-05,
"loss": 0.1556,
"step": 500
},
{
"epoch": 0.3425117528542646,
"grad_norm": 12.447320938110352,
"learning_rate": 4.988656415427275e-05,
"loss": 0.2918,
"step": 510
},
{
"epoch": 0.34922766957689727,
"grad_norm": 3.2200286388397217,
"learning_rate": 4.976052432568692e-05,
"loss": 0.155,
"step": 520
},
{
"epoch": 0.3559435862995299,
"grad_norm": 0.31001582741737366,
"learning_rate": 4.963448449710109e-05,
"loss": 0.0628,
"step": 530
},
{
"epoch": 0.36265950302216254,
"grad_norm": 1.2337498664855957,
"learning_rate": 4.9508444668515254e-05,
"loss": 0.1069,
"step": 540
},
{
"epoch": 0.36937541974479515,
"grad_norm": 0.11717221140861511,
"learning_rate": 4.938240483992942e-05,
"loss": 0.0971,
"step": 550
},
{
"epoch": 0.3760913364674278,
"grad_norm": 1.2564656734466553,
"learning_rate": 4.925636501134358e-05,
"loss": 0.0889,
"step": 560
},
{
"epoch": 0.3828072531900604,
"grad_norm": 1.1871436834335327,
"learning_rate": 4.9130325182757754e-05,
"loss": 0.2583,
"step": 570
},
{
"epoch": 0.3895231699126931,
"grad_norm": 0.31010547280311584,
"learning_rate": 4.900428535417192e-05,
"loss": 0.0479,
"step": 580
},
{
"epoch": 0.3962390866353257,
"grad_norm": 0.8911293745040894,
"learning_rate": 4.8878245525586084e-05,
"loss": 0.1356,
"step": 590
},
{
"epoch": 0.40295500335795836,
"grad_norm": 2.5432729721069336,
"learning_rate": 4.8752205697000255e-05,
"loss": 0.0841,
"step": 600
},
{
"epoch": 0.40967092008059103,
"grad_norm": 0.2870176136493683,
"learning_rate": 4.862616586841442e-05,
"loss": 0.0695,
"step": 610
},
{
"epoch": 0.41638683680322364,
"grad_norm": 1.6922942399978638,
"learning_rate": 4.8500126039828585e-05,
"loss": 0.0721,
"step": 620
},
{
"epoch": 0.4231027535258563,
"grad_norm": 0.4180830717086792,
"learning_rate": 4.8374086211242756e-05,
"loss": 0.0194,
"step": 630
},
{
"epoch": 0.4298186702484889,
"grad_norm": 4.236703872680664,
"learning_rate": 4.824804638265692e-05,
"loss": 0.1223,
"step": 640
},
{
"epoch": 0.4365345869711216,
"grad_norm": 2.4450433254241943,
"learning_rate": 4.8122006554071086e-05,
"loss": 0.0755,
"step": 650
},
{
"epoch": 0.4432505036937542,
"grad_norm": 1.0442028045654297,
"learning_rate": 4.799596672548525e-05,
"loss": 0.0912,
"step": 660
},
{
"epoch": 0.44996642041638685,
"grad_norm": 5.720908164978027,
"learning_rate": 4.786992689689943e-05,
"loss": 0.0321,
"step": 670
},
{
"epoch": 0.45668233713901946,
"grad_norm": 0.7653585076332092,
"learning_rate": 4.774388706831359e-05,
"loss": 0.0998,
"step": 680
},
{
"epoch": 0.4633982538616521,
"grad_norm": 0.3201594352722168,
"learning_rate": 4.761784723972776e-05,
"loss": 0.0781,
"step": 690
},
{
"epoch": 0.47011417058428473,
"grad_norm": 0.001702402252703905,
"learning_rate": 4.749180741114192e-05,
"loss": 0.013,
"step": 700
},
{
"epoch": 0.4768300873069174,
"grad_norm": 14.604610443115234,
"learning_rate": 4.7365767582556094e-05,
"loss": 0.083,
"step": 710
},
{
"epoch": 0.48354600402955006,
"grad_norm": 5.529962539672852,
"learning_rate": 4.723972775397026e-05,
"loss": 0.182,
"step": 720
},
{
"epoch": 0.49026192075218267,
"grad_norm": 1.8030890226364136,
"learning_rate": 4.7113687925384424e-05,
"loss": 0.125,
"step": 730
},
{
"epoch": 0.49697783747481533,
"grad_norm": 6.454009532928467,
"learning_rate": 4.6987648096798595e-05,
"loss": 0.0709,
"step": 740
},
{
"epoch": 0.503693754197448,
"grad_norm": 5.522768020629883,
"learning_rate": 4.686160826821276e-05,
"loss": 0.1086,
"step": 750
},
{
"epoch": 0.5104096709200806,
"grad_norm": 0.5437310338020325,
"learning_rate": 4.6735568439626924e-05,
"loss": 0.0835,
"step": 760
},
{
"epoch": 0.5171255876427132,
"grad_norm": 1.4233680963516235,
"learning_rate": 4.660952861104109e-05,
"loss": 0.15,
"step": 770
},
{
"epoch": 0.5238415043653458,
"grad_norm": 2.103144884109497,
"learning_rate": 4.648348878245526e-05,
"loss": 0.0615,
"step": 780
},
{
"epoch": 0.5305574210879785,
"grad_norm": 4.456714630126953,
"learning_rate": 4.6357448953869425e-05,
"loss": 0.117,
"step": 790
},
{
"epoch": 0.5372733378106112,
"grad_norm": 2.5074684619903564,
"learning_rate": 4.623140912528359e-05,
"loss": 0.1606,
"step": 800
},
{
"epoch": 0.5439892545332438,
"grad_norm": 0.8483042120933533,
"learning_rate": 4.610536929669776e-05,
"loss": 0.0747,
"step": 810
},
{
"epoch": 0.5507051712558765,
"grad_norm": 0.05906325578689575,
"learning_rate": 4.5979329468111926e-05,
"loss": 0.1355,
"step": 820
},
{
"epoch": 0.5574210879785091,
"grad_norm": 0.8080152273178101,
"learning_rate": 4.585328963952609e-05,
"loss": 0.1001,
"step": 830
},
{
"epoch": 0.5641370047011417,
"grad_norm": 0.1343095898628235,
"learning_rate": 4.572724981094026e-05,
"loss": 0.0376,
"step": 840
},
{
"epoch": 0.5708529214237743,
"grad_norm": 1.397542119026184,
"learning_rate": 4.560120998235443e-05,
"loss": 0.1431,
"step": 850
},
{
"epoch": 0.577568838146407,
"grad_norm": 0.5649687051773071,
"learning_rate": 4.547517015376859e-05,
"loss": 0.0864,
"step": 860
},
{
"epoch": 0.5842847548690396,
"grad_norm": 1.3697500228881836,
"learning_rate": 4.5349130325182757e-05,
"loss": 0.0727,
"step": 870
},
{
"epoch": 0.5910006715916722,
"grad_norm": 0.849629819393158,
"learning_rate": 4.522309049659693e-05,
"loss": 0.073,
"step": 880
},
{
"epoch": 0.5977165883143049,
"grad_norm": 1.3777800798416138,
"learning_rate": 4.509705066801109e-05,
"loss": 0.1439,
"step": 890
},
{
"epoch": 0.6044325050369376,
"grad_norm": 3.064931869506836,
"learning_rate": 4.497101083942526e-05,
"loss": 0.0923,
"step": 900
},
{
"epoch": 0.6111484217595702,
"grad_norm": 0.47612619400024414,
"learning_rate": 4.484497101083943e-05,
"loss": 0.0279,
"step": 910
},
{
"epoch": 0.6178643384822028,
"grad_norm": 0.2581987679004669,
"learning_rate": 4.4718931182253594e-05,
"loss": 0.0275,
"step": 920
},
{
"epoch": 0.6245802552048355,
"grad_norm": 3.8201942443847656,
"learning_rate": 4.459289135366776e-05,
"loss": 0.0821,
"step": 930
},
{
"epoch": 0.6312961719274681,
"grad_norm": 5.899015426635742,
"learning_rate": 4.446685152508192e-05,
"loss": 0.1238,
"step": 940
},
{
"epoch": 0.6380120886501007,
"grad_norm": 0.38817426562309265,
"learning_rate": 4.4340811696496094e-05,
"loss": 0.1314,
"step": 950
},
{
"epoch": 0.6447280053727333,
"grad_norm": 0.4328228235244751,
"learning_rate": 4.421477186791026e-05,
"loss": 0.0884,
"step": 960
},
{
"epoch": 0.6514439220953661,
"grad_norm": 0.645894467830658,
"learning_rate": 4.4088732039324424e-05,
"loss": 0.0464,
"step": 970
},
{
"epoch": 0.6581598388179987,
"grad_norm": 0.25149455666542053,
"learning_rate": 4.3962692210738595e-05,
"loss": 0.1324,
"step": 980
},
{
"epoch": 0.6648757555406313,
"grad_norm": 3.0216193199157715,
"learning_rate": 4.383665238215276e-05,
"loss": 0.1317,
"step": 990
},
{
"epoch": 0.6715916722632639,
"grad_norm": 0.4481722116470337,
"learning_rate": 4.371061255356693e-05,
"loss": 0.0304,
"step": 1000
},
{
"epoch": 0.6783075889858966,
"grad_norm": 0.4348456859588623,
"learning_rate": 4.3584572724981096e-05,
"loss": 0.1263,
"step": 1010
},
{
"epoch": 0.6850235057085292,
"grad_norm": 0.7903152108192444,
"learning_rate": 4.345853289639527e-05,
"loss": 0.0465,
"step": 1020
},
{
"epoch": 0.6917394224311618,
"grad_norm": 0.6616799831390381,
"learning_rate": 4.333249306780943e-05,
"loss": 0.0438,
"step": 1030
},
{
"epoch": 0.6984553391537945,
"grad_norm": 0.45970794558525085,
"learning_rate": 4.32064532392236e-05,
"loss": 0.0662,
"step": 1040
},
{
"epoch": 0.7051712558764271,
"grad_norm": 1.471104621887207,
"learning_rate": 4.308041341063777e-05,
"loss": 0.0858,
"step": 1050
},
{
"epoch": 0.7118871725990598,
"grad_norm": 8.536913871765137,
"learning_rate": 4.295437358205193e-05,
"loss": 0.0661,
"step": 1060
},
{
"epoch": 0.7186030893216924,
"grad_norm": 0.17855380475521088,
"learning_rate": 4.28283337534661e-05,
"loss": 0.0604,
"step": 1070
},
{
"epoch": 0.7253190060443251,
"grad_norm": 1.2315560579299927,
"learning_rate": 4.270229392488026e-05,
"loss": 0.0738,
"step": 1080
},
{
"epoch": 0.7320349227669577,
"grad_norm": 2.4337143898010254,
"learning_rate": 4.2576254096294434e-05,
"loss": 0.111,
"step": 1090
},
{
"epoch": 0.7387508394895903,
"grad_norm": 0.5771811604499817,
"learning_rate": 4.24502142677086e-05,
"loss": 0.0873,
"step": 1100
},
{
"epoch": 0.7454667562122229,
"grad_norm": 0.29616767168045044,
"learning_rate": 4.2324174439122764e-05,
"loss": 0.1899,
"step": 1110
},
{
"epoch": 0.7521826729348556,
"grad_norm": 0.005016247741878033,
"learning_rate": 4.2198134610536935e-05,
"loss": 0.0512,
"step": 1120
},
{
"epoch": 0.7588985896574882,
"grad_norm": 0.6726338267326355,
"learning_rate": 4.20720947819511e-05,
"loss": 0.0749,
"step": 1130
},
{
"epoch": 0.7656145063801209,
"grad_norm": 15.482983589172363,
"learning_rate": 4.1946054953365264e-05,
"loss": 0.1208,
"step": 1140
},
{
"epoch": 0.7723304231027536,
"grad_norm": 0.13030461966991425,
"learning_rate": 4.182001512477943e-05,
"loss": 0.0716,
"step": 1150
},
{
"epoch": 0.7790463398253862,
"grad_norm": 0.00033535558031871915,
"learning_rate": 4.16939752961936e-05,
"loss": 0.0575,
"step": 1160
},
{
"epoch": 0.7857622565480188,
"grad_norm": 9.646067337598652e-05,
"learning_rate": 4.1567935467607765e-05,
"loss": 0.0491,
"step": 1170
},
{
"epoch": 0.7924781732706514,
"grad_norm": 0.6216703653335571,
"learning_rate": 4.144189563902193e-05,
"loss": 0.1496,
"step": 1180
},
{
"epoch": 0.7991940899932841,
"grad_norm": 0.1437370628118515,
"learning_rate": 4.13158558104361e-05,
"loss": 0.0467,
"step": 1190
},
{
"epoch": 0.8059100067159167,
"grad_norm": 0.5845384001731873,
"learning_rate": 4.1189815981850266e-05,
"loss": 0.0423,
"step": 1200
},
{
"epoch": 0.8126259234385493,
"grad_norm": 0.006311117671430111,
"learning_rate": 4.106377615326443e-05,
"loss": 0.0434,
"step": 1210
},
{
"epoch": 0.8193418401611821,
"grad_norm": 0.28811392188072205,
"learning_rate": 4.09377363246786e-05,
"loss": 0.0401,
"step": 1220
},
{
"epoch": 0.8260577568838147,
"grad_norm": 0.00534482067450881,
"learning_rate": 4.081169649609277e-05,
"loss": 0.0974,
"step": 1230
},
{
"epoch": 0.8327736736064473,
"grad_norm": 0.3741544187068939,
"learning_rate": 4.068565666750693e-05,
"loss": 0.0457,
"step": 1240
},
{
"epoch": 0.8394895903290799,
"grad_norm": 0.9101037383079529,
"learning_rate": 4.0559616838921097e-05,
"loss": 0.0547,
"step": 1250
},
{
"epoch": 0.8462055070517126,
"grad_norm": 0.2954216003417969,
"learning_rate": 4.043357701033527e-05,
"loss": 0.0656,
"step": 1260
},
{
"epoch": 0.8529214237743452,
"grad_norm": 1.8236368894577026,
"learning_rate": 4.030753718174943e-05,
"loss": 0.0607,
"step": 1270
},
{
"epoch": 0.8596373404969778,
"grad_norm": 0.17372089624404907,
"learning_rate": 4.01814973531636e-05,
"loss": 0.0894,
"step": 1280
},
{
"epoch": 0.8663532572196104,
"grad_norm": 0.0005210727686062455,
"learning_rate": 4.005545752457777e-05,
"loss": 0.0761,
"step": 1290
},
{
"epoch": 0.8730691739422431,
"grad_norm": 0.29330241680145264,
"learning_rate": 3.9929417695991934e-05,
"loss": 0.0626,
"step": 1300
},
{
"epoch": 0.8797850906648758,
"grad_norm": 0.7308351397514343,
"learning_rate": 3.98033778674061e-05,
"loss": 0.0754,
"step": 1310
},
{
"epoch": 0.8865010073875084,
"grad_norm": 0.0010205712169408798,
"learning_rate": 3.967733803882027e-05,
"loss": 0.0991,
"step": 1320
},
{
"epoch": 0.8932169241101411,
"grad_norm": 0.26492127776145935,
"learning_rate": 3.955129821023444e-05,
"loss": 0.0371,
"step": 1330
},
{
"epoch": 0.8999328408327737,
"grad_norm": 0.3958211839199066,
"learning_rate": 3.9425258381648606e-05,
"loss": 0.1793,
"step": 1340
},
{
"epoch": 0.9066487575554063,
"grad_norm": 0.006723370868712664,
"learning_rate": 3.929921855306277e-05,
"loss": 0.0385,
"step": 1350
},
{
"epoch": 0.9133646742780389,
"grad_norm": 16.94601821899414,
"learning_rate": 3.917317872447694e-05,
"loss": 0.0664,
"step": 1360
},
{
"epoch": 0.9200805910006716,
"grad_norm": 0.3626093864440918,
"learning_rate": 3.904713889589111e-05,
"loss": 0.0424,
"step": 1370
},
{
"epoch": 0.9267965077233042,
"grad_norm": 0.347401887178421,
"learning_rate": 3.892109906730527e-05,
"loss": 0.0558,
"step": 1380
},
{
"epoch": 0.9335124244459369,
"grad_norm": 0.5849874019622803,
"learning_rate": 3.8795059238719436e-05,
"loss": 0.0543,
"step": 1390
},
{
"epoch": 0.9402283411685695,
"grad_norm": 0.0007323077879846096,
"learning_rate": 3.866901941013361e-05,
"loss": 0.065,
"step": 1400
},
{
"epoch": 0.9469442578912022,
"grad_norm": 0.29866859316825867,
"learning_rate": 3.854297958154777e-05,
"loss": 0.083,
"step": 1410
},
{
"epoch": 0.9536601746138348,
"grad_norm": 1.046910047531128,
"learning_rate": 3.841693975296194e-05,
"loss": 0.0795,
"step": 1420
},
{
"epoch": 0.9603760913364674,
"grad_norm": 0.28778284788131714,
"learning_rate": 3.829089992437611e-05,
"loss": 0.121,
"step": 1430
},
{
"epoch": 0.9670920080591001,
"grad_norm": 0.6510307192802429,
"learning_rate": 3.816486009579027e-05,
"loss": 0.0976,
"step": 1440
},
{
"epoch": 0.9738079247817327,
"grad_norm": 0.7349569797515869,
"learning_rate": 3.803882026720444e-05,
"loss": 0.027,
"step": 1450
},
{
"epoch": 0.9805238415043653,
"grad_norm": 0.41969776153564453,
"learning_rate": 3.79127804386186e-05,
"loss": 0.0609,
"step": 1460
},
{
"epoch": 0.9872397582269979,
"grad_norm": 0.765001118183136,
"learning_rate": 3.7786740610032774e-05,
"loss": 0.0823,
"step": 1470
},
{
"epoch": 0.9939556749496307,
"grad_norm": 0.5332460999488831,
"learning_rate": 3.766070078144694e-05,
"loss": 0.063,
"step": 1480
},
{
"epoch": 1.0,
"eval_loss": 0.05031801387667656,
"eval_runtime": 16.4028,
"eval_samples_per_second": 362.988,
"eval_steps_per_second": 5.731,
"step": 1489
},
{
"epoch": 1.0006715916722633,
"grad_norm": 1.0223584175109863,
"learning_rate": 3.7534660952861104e-05,
"loss": 0.0612,
"step": 1490
},
{
"epoch": 1.007387508394896,
"grad_norm": 0.2597563862800598,
"learning_rate": 3.7408621124275275e-05,
"loss": 0.0986,
"step": 1500
},
{
"epoch": 1.0141034251175285,
"grad_norm": 0.4414527714252472,
"learning_rate": 3.728258129568944e-05,
"loss": 0.0544,
"step": 1510
},
{
"epoch": 1.0208193418401612,
"grad_norm": 0.5101744532585144,
"learning_rate": 3.7156541467103605e-05,
"loss": 0.0549,
"step": 1520
},
{
"epoch": 1.027535258562794,
"grad_norm": 0.695880651473999,
"learning_rate": 3.7030501638517776e-05,
"loss": 0.0429,
"step": 1530
},
{
"epoch": 1.0342511752854264,
"grad_norm": 0.0017918674275279045,
"learning_rate": 3.690446180993194e-05,
"loss": 0.057,
"step": 1540
},
{
"epoch": 1.0409670920080591,
"grad_norm": 5.918661117553711,
"learning_rate": 3.6778421981346105e-05,
"loss": 0.0428,
"step": 1550
},
{
"epoch": 1.0476830087306916,
"grad_norm": 0.017390791326761246,
"learning_rate": 3.665238215276027e-05,
"loss": 0.107,
"step": 1560
},
{
"epoch": 1.0543989254533244,
"grad_norm": 0.022303210571408272,
"learning_rate": 3.652634232417444e-05,
"loss": 0.0683,
"step": 1570
},
{
"epoch": 1.061114842175957,
"grad_norm": 0.003013026202097535,
"learning_rate": 3.6400302495588606e-05,
"loss": 0.0675,
"step": 1580
},
{
"epoch": 1.0678307588985896,
"grad_norm": 0.0006217322661541402,
"learning_rate": 3.627426266700277e-05,
"loss": 0.0552,
"step": 1590
},
{
"epoch": 1.0745466756212223,
"grad_norm": 0.2726440131664276,
"learning_rate": 3.614822283841694e-05,
"loss": 0.0427,
"step": 1600
},
{
"epoch": 1.081262592343855,
"grad_norm": 0.29946306347846985,
"learning_rate": 3.602218300983111e-05,
"loss": 0.0866,
"step": 1610
},
{
"epoch": 1.0879785090664875,
"grad_norm": 0.0426226370036602,
"learning_rate": 3.589614318124527e-05,
"loss": 0.0447,
"step": 1620
},
{
"epoch": 1.0946944257891202,
"grad_norm": 0.6733192801475525,
"learning_rate": 3.5770103352659437e-05,
"loss": 0.1095,
"step": 1630
},
{
"epoch": 1.1014103425117527,
"grad_norm": 0.2110043466091156,
"learning_rate": 3.564406352407361e-05,
"loss": 0.0213,
"step": 1640
},
{
"epoch": 1.1081262592343855,
"grad_norm": 0.0002463633718434721,
"learning_rate": 3.551802369548777e-05,
"loss": 0.0531,
"step": 1650
},
{
"epoch": 1.1148421759570182,
"grad_norm": 0.660849392414093,
"learning_rate": 3.5391983866901944e-05,
"loss": 0.0565,
"step": 1660
},
{
"epoch": 1.1215580926796507,
"grad_norm": 0.0015531358076259494,
"learning_rate": 3.526594403831611e-05,
"loss": 0.0355,
"step": 1670
},
{
"epoch": 1.1282740094022834,
"grad_norm": 0.5349295735359192,
"learning_rate": 3.513990420973028e-05,
"loss": 0.0461,
"step": 1680
},
{
"epoch": 1.1349899261249161,
"grad_norm": 0.0036038027610629797,
"learning_rate": 3.5013864381144445e-05,
"loss": 0.0672,
"step": 1690
},
{
"epoch": 1.1417058428475486,
"grad_norm": 0.0011060705874115229,
"learning_rate": 3.488782455255861e-05,
"loss": 0.0501,
"step": 1700
},
{
"epoch": 1.1484217595701813,
"grad_norm": 3.7792551517486572,
"learning_rate": 3.476178472397278e-05,
"loss": 0.0436,
"step": 1710
},
{
"epoch": 1.155137676292814,
"grad_norm": 8.058547019958496,
"learning_rate": 3.4635744895386946e-05,
"loss": 0.0752,
"step": 1720
},
{
"epoch": 1.1618535930154466,
"grad_norm": 0.0012638597982004285,
"learning_rate": 3.450970506680111e-05,
"loss": 0.0591,
"step": 1730
},
{
"epoch": 1.1685695097380793,
"grad_norm": 0.0024126614443957806,
"learning_rate": 3.438366523821528e-05,
"loss": 0.0399,
"step": 1740
},
{
"epoch": 1.1752854264607118,
"grad_norm": 3.4204914569854736,
"learning_rate": 3.425762540962945e-05,
"loss": 0.0456,
"step": 1750
},
{
"epoch": 1.1820013431833445,
"grad_norm": 0.6280553340911865,
"learning_rate": 3.413158558104361e-05,
"loss": 0.0623,
"step": 1760
},
{
"epoch": 1.1887172599059772,
"grad_norm": 0.0001932340528583154,
"learning_rate": 3.4005545752457776e-05,
"loss": 0.015,
"step": 1770
},
{
"epoch": 1.19543317662861,
"grad_norm": 3.976154088973999,
"learning_rate": 3.387950592387195e-05,
"loss": 0.0464,
"step": 1780
},
{
"epoch": 1.2021490933512424,
"grad_norm": 0.632788360118866,
"learning_rate": 3.375346609528611e-05,
"loss": 0.1215,
"step": 1790
},
{
"epoch": 1.2088650100738751,
"grad_norm": 0.24529287219047546,
"learning_rate": 3.362742626670028e-05,
"loss": 0.0582,
"step": 1800
},
{
"epoch": 1.2155809267965076,
"grad_norm": 0.42675378918647766,
"learning_rate": 3.350138643811445e-05,
"loss": 0.0812,
"step": 1810
},
{
"epoch": 1.2222968435191404,
"grad_norm": 0.0022362687159329653,
"learning_rate": 3.337534660952861e-05,
"loss": 0.047,
"step": 1820
},
{
"epoch": 1.229012760241773,
"grad_norm": 0.45256197452545166,
"learning_rate": 3.324930678094278e-05,
"loss": 0.0664,
"step": 1830
},
{
"epoch": 1.2357286769644056,
"grad_norm": 0.8793488144874573,
"learning_rate": 3.312326695235694e-05,
"loss": 0.0452,
"step": 1840
},
{
"epoch": 1.2424445936870383,
"grad_norm": 0.0005373602034524083,
"learning_rate": 3.2997227123771114e-05,
"loss": 0.0289,
"step": 1850
},
{
"epoch": 1.2491605104096708,
"grad_norm": 1.7183722257614136,
"learning_rate": 3.287118729518528e-05,
"loss": 0.0455,
"step": 1860
},
{
"epoch": 1.2558764271323035,
"grad_norm": 0.005168843548744917,
"learning_rate": 3.2745147466599444e-05,
"loss": 0.0554,
"step": 1870
},
{
"epoch": 1.2625923438549362,
"grad_norm": 0.0013626981526613235,
"learning_rate": 3.2619107638013615e-05,
"loss": 0.0344,
"step": 1880
},
{
"epoch": 1.269308260577569,
"grad_norm": 0.42641666531562805,
"learning_rate": 3.249306780942778e-05,
"loss": 0.0683,
"step": 1890
},
{
"epoch": 1.2760241773002015,
"grad_norm": 0.005407288204878569,
"learning_rate": 3.2367027980841945e-05,
"loss": 0.0167,
"step": 1900
},
{
"epoch": 1.2827400940228342,
"grad_norm": 0.00033558416180312634,
"learning_rate": 3.2240988152256116e-05,
"loss": 0.0603,
"step": 1910
},
{
"epoch": 1.2894560107454667,
"grad_norm": 0.10696588456630707,
"learning_rate": 3.211494832367028e-05,
"loss": 0.0787,
"step": 1920
},
{
"epoch": 1.2961719274680994,
"grad_norm": 0.7402423620223999,
"learning_rate": 3.1988908495084445e-05,
"loss": 0.1398,
"step": 1930
},
{
"epoch": 1.3028878441907321,
"grad_norm": 0.8248059749603271,
"learning_rate": 3.186286866649861e-05,
"loss": 0.0925,
"step": 1940
},
{
"epoch": 1.3096037609133646,
"grad_norm": 0.0007926832186058164,
"learning_rate": 3.173682883791278e-05,
"loss": 0.0942,
"step": 1950
},
{
"epoch": 1.3163196776359973,
"grad_norm": 0.47744426131248474,
"learning_rate": 3.1610789009326946e-05,
"loss": 0.0348,
"step": 1960
},
{
"epoch": 1.3230355943586298,
"grad_norm": 0.2431195080280304,
"learning_rate": 3.148474918074111e-05,
"loss": 0.0533,
"step": 1970
},
{
"epoch": 1.3297515110812625,
"grad_norm": 3.9710357189178467,
"learning_rate": 3.135870935215528e-05,
"loss": 0.0338,
"step": 1980
},
{
"epoch": 1.3364674278038953,
"grad_norm": 0.6269270181655884,
"learning_rate": 3.1232669523569454e-05,
"loss": 0.0741,
"step": 1990
},
{
"epoch": 1.343183344526528,
"grad_norm": 0.004900393076241016,
"learning_rate": 3.110662969498362e-05,
"loss": 0.0309,
"step": 2000
},
{
"epoch": 1.3498992612491605,
"grad_norm": 1.1476103067398071,
"learning_rate": 3.098058986639778e-05,
"loss": 0.047,
"step": 2010
},
{
"epoch": 1.3566151779717932,
"grad_norm": 0.30575576424598694,
"learning_rate": 3.0854550037811955e-05,
"loss": 0.0541,
"step": 2020
},
{
"epoch": 1.3633310946944257,
"grad_norm": 0.0030384438578039408,
"learning_rate": 3.072851020922612e-05,
"loss": 0.0721,
"step": 2030
},
{
"epoch": 1.3700470114170584,
"grad_norm": 0.3193557560443878,
"learning_rate": 3.0602470380640284e-05,
"loss": 0.0818,
"step": 2040
},
{
"epoch": 1.3767629281396911,
"grad_norm": 0.09504958987236023,
"learning_rate": 3.0476430552054452e-05,
"loss": 0.0238,
"step": 2050
},
{
"epoch": 1.3834788448623236,
"grad_norm": 0.40825656056404114,
"learning_rate": 3.035039072346862e-05,
"loss": 0.0558,
"step": 2060
},
{
"epoch": 1.3901947615849564,
"grad_norm": 0.08500930666923523,
"learning_rate": 3.0224350894882785e-05,
"loss": 0.0854,
"step": 2070
},
{
"epoch": 1.3969106783075889,
"grad_norm": 0.9645543098449707,
"learning_rate": 3.0098311066296953e-05,
"loss": 0.0704,
"step": 2080
},
{
"epoch": 1.4036265950302216,
"grad_norm": 0.5297892093658447,
"learning_rate": 2.9972271237711118e-05,
"loss": 0.0528,
"step": 2090
},
{
"epoch": 1.4103425117528543,
"grad_norm": 0.002040713559836149,
"learning_rate": 2.9846231409125286e-05,
"loss": 0.064,
"step": 2100
},
{
"epoch": 1.417058428475487,
"grad_norm": 0.0018752739997580647,
"learning_rate": 2.9720191580539454e-05,
"loss": 0.0442,
"step": 2110
},
{
"epoch": 1.4237743451981195,
"grad_norm": 0.012829877436161041,
"learning_rate": 2.959415175195362e-05,
"loss": 0.0457,
"step": 2120
},
{
"epoch": 1.4304902619207522,
"grad_norm": 0.7869836688041687,
"learning_rate": 2.9468111923367787e-05,
"loss": 0.0562,
"step": 2130
},
{
"epoch": 1.4372061786433847,
"grad_norm": 0.3505788743495941,
"learning_rate": 2.934207209478195e-05,
"loss": 0.0734,
"step": 2140
},
{
"epoch": 1.4439220953660175,
"grad_norm": 0.35704436898231506,
"learning_rate": 2.921603226619612e-05,
"loss": 0.055,
"step": 2150
},
{
"epoch": 1.4506380120886502,
"grad_norm": 0.45379963517189026,
"learning_rate": 2.9089992437610288e-05,
"loss": 0.0495,
"step": 2160
},
{
"epoch": 1.4573539288112827,
"grad_norm": 0.7454732656478882,
"learning_rate": 2.8963952609024453e-05,
"loss": 0.085,
"step": 2170
},
{
"epoch": 1.4640698455339154,
"grad_norm": 0.04314453899860382,
"learning_rate": 2.883791278043862e-05,
"loss": 0.0336,
"step": 2180
},
{
"epoch": 1.470785762256548,
"grad_norm": 0.306495726108551,
"learning_rate": 2.8711872951852785e-05,
"loss": 0.0458,
"step": 2190
},
{
"epoch": 1.4775016789791806,
"grad_norm": 0.3177057206630707,
"learning_rate": 2.8585833123266953e-05,
"loss": 0.09,
"step": 2200
},
{
"epoch": 1.4842175957018133,
"grad_norm": 0.4207700192928314,
"learning_rate": 2.8459793294681118e-05,
"loss": 0.0506,
"step": 2210
},
{
"epoch": 1.490933512424446,
"grad_norm": 0.43287593126296997,
"learning_rate": 2.8333753466095286e-05,
"loss": 0.042,
"step": 2220
},
{
"epoch": 1.4976494291470785,
"grad_norm": 0.6717241406440735,
"learning_rate": 2.8207713637509454e-05,
"loss": 0.0523,
"step": 2230
},
{
"epoch": 1.5043653458697113,
"grad_norm": 0.00016471787239424884,
"learning_rate": 2.808167380892362e-05,
"loss": 0.0308,
"step": 2240
},
{
"epoch": 1.5110812625923438,
"grad_norm": 0.32060420513153076,
"learning_rate": 2.7955633980337787e-05,
"loss": 0.0945,
"step": 2250
},
{
"epoch": 1.5177971793149765,
"grad_norm": 0.399581640958786,
"learning_rate": 2.7829594151751952e-05,
"loss": 0.0374,
"step": 2260
},
{
"epoch": 1.5245130960376092,
"grad_norm": 0.6255159378051758,
"learning_rate": 2.770355432316612e-05,
"loss": 0.049,
"step": 2270
},
{
"epoch": 1.531229012760242,
"grad_norm": 0.005870606750249863,
"learning_rate": 2.7577514494580288e-05,
"loss": 0.0183,
"step": 2280
},
{
"epoch": 1.5379449294828744,
"grad_norm": 9.138748282566667e-05,
"learning_rate": 2.7451474665994453e-05,
"loss": 0.0429,
"step": 2290
},
{
"epoch": 1.544660846205507,
"grad_norm": 0.780626654624939,
"learning_rate": 2.732543483740862e-05,
"loss": 0.0528,
"step": 2300
},
{
"epoch": 1.5513767629281396,
"grad_norm": 0.28625041246414185,
"learning_rate": 2.7199395008822785e-05,
"loss": 0.0555,
"step": 2310
},
{
"epoch": 1.5580926796507724,
"grad_norm": 0.6077707409858704,
"learning_rate": 2.707335518023696e-05,
"loss": 0.0651,
"step": 2320
},
{
"epoch": 1.564808596373405,
"grad_norm": 0.005245072301477194,
"learning_rate": 2.6947315351651125e-05,
"loss": 0.0269,
"step": 2330
},
{
"epoch": 1.5715245130960376,
"grad_norm": 0.3782106935977936,
"learning_rate": 2.6821275523065293e-05,
"loss": 0.0622,
"step": 2340
},
{
"epoch": 1.5782404298186703,
"grad_norm": 1.065748929977417,
"learning_rate": 2.6695235694479458e-05,
"loss": 0.0855,
"step": 2350
},
{
"epoch": 1.5849563465413028,
"grad_norm": 0.4613993763923645,
"learning_rate": 2.6569195865893626e-05,
"loss": 0.0365,
"step": 2360
},
{
"epoch": 1.5916722632639355,
"grad_norm": 0.3238326907157898,
"learning_rate": 2.6443156037307794e-05,
"loss": 0.0357,
"step": 2370
},
{
"epoch": 1.5983881799865682,
"grad_norm": 0.0309292059391737,
"learning_rate": 2.631711620872196e-05,
"loss": 0.228,
"step": 2380
},
{
"epoch": 1.605104096709201,
"grad_norm": 13.997108459472656,
"learning_rate": 2.6191076380136127e-05,
"loss": 0.0675,
"step": 2390
},
{
"epoch": 1.6118200134318335,
"grad_norm": 0.4026775658130646,
"learning_rate": 2.606503655155029e-05,
"loss": 0.1093,
"step": 2400
},
{
"epoch": 1.618535930154466,
"grad_norm": 0.29253387451171875,
"learning_rate": 2.593899672296446e-05,
"loss": 0.0617,
"step": 2410
},
{
"epoch": 1.6252518468770987,
"grad_norm": 0.017194174230098724,
"learning_rate": 2.5812956894378624e-05,
"loss": 0.0371,
"step": 2420
},
{
"epoch": 1.6319677635997314,
"grad_norm": 0.2723110020160675,
"learning_rate": 2.5686917065792792e-05,
"loss": 0.0377,
"step": 2430
},
{
"epoch": 1.6386836803223641,
"grad_norm": 1.9029347896575928,
"learning_rate": 2.556087723720696e-05,
"loss": 0.062,
"step": 2440
},
{
"epoch": 1.6453995970449966,
"grad_norm": 0.6120497584342957,
"learning_rate": 2.5434837408621125e-05,
"loss": 0.0396,
"step": 2450
},
{
"epoch": 1.6521155137676293,
"grad_norm": 0.012274952605366707,
"learning_rate": 2.5308797580035293e-05,
"loss": 0.1047,
"step": 2460
},
{
"epoch": 1.6588314304902618,
"grad_norm": 0.0005646342178806663,
"learning_rate": 2.5182757751449458e-05,
"loss": 0.0295,
"step": 2470
},
{
"epoch": 1.6655473472128945,
"grad_norm": 0.2949657738208771,
"learning_rate": 2.5056717922863626e-05,
"loss": 0.0687,
"step": 2480
},
{
"epoch": 1.6722632639355273,
"grad_norm": 0.0009046673658303916,
"learning_rate": 2.4930678094277794e-05,
"loss": 0.0363,
"step": 2490
},
{
"epoch": 1.67897918065816,
"grad_norm": 0.666671633720398,
"learning_rate": 2.480463826569196e-05,
"loss": 0.0456,
"step": 2500
},
{
"epoch": 1.6856950973807925,
"grad_norm": 0.40098175406455994,
"learning_rate": 2.4678598437106127e-05,
"loss": 0.046,
"step": 2510
},
{
"epoch": 1.692411014103425,
"grad_norm": 0.43292441964149475,
"learning_rate": 2.455255860852029e-05,
"loss": 0.0506,
"step": 2520
},
{
"epoch": 1.6991269308260577,
"grad_norm": 0.9216532707214355,
"learning_rate": 2.442651877993446e-05,
"loss": 0.0481,
"step": 2530
},
{
"epoch": 1.7058428475486904,
"grad_norm": 46.244651794433594,
"learning_rate": 2.4300478951348628e-05,
"loss": 0.0886,
"step": 2540
},
{
"epoch": 1.7125587642713231,
"grad_norm": 0.6050248742103577,
"learning_rate": 2.4174439122762793e-05,
"loss": 0.0557,
"step": 2550
},
{
"epoch": 1.7192746809939556,
"grad_norm": 1.492427110671997,
"learning_rate": 2.404839929417696e-05,
"loss": 0.0499,
"step": 2560
},
{
"epoch": 1.7259905977165884,
"grad_norm": 2.4528143405914307,
"learning_rate": 2.392235946559113e-05,
"loss": 0.0938,
"step": 2570
},
{
"epoch": 1.7327065144392209,
"grad_norm": 0.029096094891428947,
"learning_rate": 2.3796319637005297e-05,
"loss": 0.0313,
"step": 2580
},
{
"epoch": 1.7394224311618536,
"grad_norm": 0.3839016556739807,
"learning_rate": 2.367027980841946e-05,
"loss": 0.0503,
"step": 2590
},
{
"epoch": 1.7461383478844863,
"grad_norm": 1.5034056901931763,
"learning_rate": 2.354423997983363e-05,
"loss": 0.0269,
"step": 2600
},
{
"epoch": 1.752854264607119,
"grad_norm": 0.0012208319967612624,
"learning_rate": 2.3418200151247798e-05,
"loss": 0.0609,
"step": 2610
},
{
"epoch": 1.7595701813297515,
"grad_norm": 0.9314192533493042,
"learning_rate": 2.3292160322661962e-05,
"loss": 0.0475,
"step": 2620
},
{
"epoch": 1.766286098052384,
"grad_norm": 0.16898050904273987,
"learning_rate": 2.316612049407613e-05,
"loss": 0.078,
"step": 2630
},
{
"epoch": 1.7730020147750167,
"grad_norm": 0.6302085518836975,
"learning_rate": 2.3040080665490295e-05,
"loss": 0.173,
"step": 2640
},
{
"epoch": 1.7797179314976495,
"grad_norm": 0.01881493628025055,
"learning_rate": 2.2914040836904463e-05,
"loss": 0.0291,
"step": 2650
},
{
"epoch": 1.7864338482202822,
"grad_norm": 0.1634562611579895,
"learning_rate": 2.2788001008318628e-05,
"loss": 0.0672,
"step": 2660
},
{
"epoch": 1.7931497649429147,
"grad_norm": 0.2767401933670044,
"learning_rate": 2.2661961179732796e-05,
"loss": 0.052,
"step": 2670
},
{
"epoch": 1.7998656816655474,
"grad_norm": 0.5256266593933105,
"learning_rate": 2.2535921351146964e-05,
"loss": 0.0783,
"step": 2680
},
{
"epoch": 1.8065815983881799,
"grad_norm": 0.006557038053870201,
"learning_rate": 2.240988152256113e-05,
"loss": 0.0281,
"step": 2690
},
{
"epoch": 1.8132975151108126,
"grad_norm": 0.5430310368537903,
"learning_rate": 2.2283841693975297e-05,
"loss": 0.0274,
"step": 2700
},
{
"epoch": 1.8200134318334453,
"grad_norm": 0.0013151871971786022,
"learning_rate": 2.215780186538946e-05,
"loss": 0.0453,
"step": 2710
},
{
"epoch": 1.826729348556078,
"grad_norm": 3.120368480682373,
"learning_rate": 2.203176203680363e-05,
"loss": 0.06,
"step": 2720
},
{
"epoch": 1.8334452652787105,
"grad_norm": 0.46205762028694153,
"learning_rate": 2.1905722208217798e-05,
"loss": 0.1034,
"step": 2730
},
{
"epoch": 1.840161182001343,
"grad_norm": 0.6960582137107849,
"learning_rate": 2.1779682379631966e-05,
"loss": 0.0706,
"step": 2740
},
{
"epoch": 1.8468770987239758,
"grad_norm": 0.552689254283905,
"learning_rate": 2.1653642551046134e-05,
"loss": 0.0281,
"step": 2750
},
{
"epoch": 1.8535930154466085,
"grad_norm": 0.37117066979408264,
"learning_rate": 2.15276027224603e-05,
"loss": 0.0655,
"step": 2760
},
{
"epoch": 1.8603089321692412,
"grad_norm": 0.3859846293926239,
"learning_rate": 2.1401562893874467e-05,
"loss": 0.0769,
"step": 2770
},
{
"epoch": 1.8670248488918737,
"grad_norm": 0.47629514336586,
"learning_rate": 2.127552306528863e-05,
"loss": 0.0398,
"step": 2780
},
{
"epoch": 1.8737407656145064,
"grad_norm": 0.5712143182754517,
"learning_rate": 2.11494832367028e-05,
"loss": 0.0549,
"step": 2790
},
{
"epoch": 1.880456682337139,
"grad_norm": 1.7003332376480103,
"learning_rate": 2.1023443408116968e-05,
"loss": 0.0624,
"step": 2800
},
{
"epoch": 1.8871725990597716,
"grad_norm": 0.46252167224884033,
"learning_rate": 2.0897403579531132e-05,
"loss": 0.0584,
"step": 2810
},
{
"epoch": 1.8938885157824044,
"grad_norm": 4.596211910247803,
"learning_rate": 2.07713637509453e-05,
"loss": 0.1001,
"step": 2820
},
{
"epoch": 1.900604432505037,
"grad_norm": 0.0004219062684569508,
"learning_rate": 2.0645323922359465e-05,
"loss": 0.0698,
"step": 2830
},
{
"epoch": 1.9073203492276696,
"grad_norm": 0.5564634799957275,
"learning_rate": 2.0519284093773633e-05,
"loss": 0.0587,
"step": 2840
},
{
"epoch": 1.914036265950302,
"grad_norm": 0.3310578763484955,
"learning_rate": 2.03932442651878e-05,
"loss": 0.0921,
"step": 2850
},
{
"epoch": 1.9207521826729348,
"grad_norm": 0.4154551327228546,
"learning_rate": 2.0267204436601966e-05,
"loss": 0.0653,
"step": 2860
},
{
"epoch": 1.9274680993955675,
"grad_norm": 0.30634987354278564,
"learning_rate": 2.0141164608016134e-05,
"loss": 0.0439,
"step": 2870
},
{
"epoch": 1.9341840161182002,
"grad_norm": 0.353935569524765,
"learning_rate": 2.00151247794303e-05,
"loss": 0.0453,
"step": 2880
},
{
"epoch": 1.9408999328408327,
"grad_norm": 0.009207702241837978,
"learning_rate": 1.9889084950844467e-05,
"loss": 0.019,
"step": 2890
},
{
"epoch": 1.9476158495634655,
"grad_norm": 0.5521604418754578,
"learning_rate": 1.9763045122258635e-05,
"loss": 0.1271,
"step": 2900
},
{
"epoch": 1.954331766286098,
"grad_norm": 4.1649088859558105,
"learning_rate": 1.9637005293672803e-05,
"loss": 0.0635,
"step": 2910
},
{
"epoch": 1.9610476830087307,
"grad_norm": 0.36337295174598694,
"learning_rate": 1.9510965465086968e-05,
"loss": 0.0704,
"step": 2920
},
{
"epoch": 1.9677635997313634,
"grad_norm": 0.1291392743587494,
"learning_rate": 1.9384925636501136e-05,
"loss": 0.046,
"step": 2930
},
{
"epoch": 1.974479516453996,
"grad_norm": 0.011192429810762405,
"learning_rate": 1.9258885807915304e-05,
"loss": 0.0733,
"step": 2940
},
{
"epoch": 1.9811954331766286,
"grad_norm": 0.048028428107500076,
"learning_rate": 1.913284597932947e-05,
"loss": 0.0419,
"step": 2950
},
{
"epoch": 1.987911349899261,
"grad_norm": 0.3492966890335083,
"learning_rate": 1.9006806150743637e-05,
"loss": 0.0585,
"step": 2960
},
{
"epoch": 1.9946272666218938,
"grad_norm": 0.00014002685202285647,
"learning_rate": 1.88807663221578e-05,
"loss": 0.0612,
"step": 2970
},
{
"epoch": 2.0,
"eval_loss": 0.05646410211920738,
"eval_runtime": 13.5636,
"eval_samples_per_second": 438.969,
"eval_steps_per_second": 6.93,
"step": 2978
},
{
"epoch": 2.0013431833445265,
"grad_norm": 0.3761807680130005,
"learning_rate": 1.875472649357197e-05,
"loss": 0.0616,
"step": 2980
},
{
"epoch": 2.0080591000671593,
"grad_norm": 0.015631867572665215,
"learning_rate": 1.8628686664986138e-05,
"loss": 0.0211,
"step": 2990
},
{
"epoch": 2.014775016789792,
"grad_norm": 0.02187258005142212,
"learning_rate": 1.8502646836400302e-05,
"loss": 0.0486,
"step": 3000
},
{
"epoch": 2.0214909335124243,
"grad_norm": 0.28977006673812866,
"learning_rate": 1.837660700781447e-05,
"loss": 0.0663,
"step": 3010
},
{
"epoch": 2.028206850235057,
"grad_norm": 0.006137068383395672,
"learning_rate": 1.8250567179228635e-05,
"loss": 0.034,
"step": 3020
},
{
"epoch": 2.0349227669576897,
"grad_norm": 0.022373627871274948,
"learning_rate": 1.8124527350642803e-05,
"loss": 0.0503,
"step": 3030
},
{
"epoch": 2.0416386836803224,
"grad_norm": 0.017447108402848244,
"learning_rate": 1.799848752205697e-05,
"loss": 0.0307,
"step": 3040
},
{
"epoch": 2.048354600402955,
"grad_norm": 0.09448742121458054,
"learning_rate": 1.7872447693471136e-05,
"loss": 0.0401,
"step": 3050
},
{
"epoch": 2.055070517125588,
"grad_norm": 0.5452361702919006,
"learning_rate": 1.7746407864885304e-05,
"loss": 0.0496,
"step": 3060
},
{
"epoch": 2.06178643384822,
"grad_norm": 0.5851243138313293,
"learning_rate": 1.7620368036299472e-05,
"loss": 0.0307,
"step": 3070
},
{
"epoch": 2.068502350570853,
"grad_norm": 0.1558593511581421,
"learning_rate": 1.749432820771364e-05,
"loss": 0.0437,
"step": 3080
},
{
"epoch": 2.0752182672934856,
"grad_norm": 0.4523046612739563,
"learning_rate": 1.7368288379127805e-05,
"loss": 0.0318,
"step": 3090
},
{
"epoch": 2.0819341840161183,
"grad_norm": 0.0008133721421472728,
"learning_rate": 1.7242248550541973e-05,
"loss": 0.0968,
"step": 3100
},
{
"epoch": 2.088650100738751,
"grad_norm": 0.28596845269203186,
"learning_rate": 1.7116208721956138e-05,
"loss": 0.0308,
"step": 3110
},
{
"epoch": 2.0953660174613833,
"grad_norm": 0.0006714498158544302,
"learning_rate": 1.6990168893370306e-05,
"loss": 0.0451,
"step": 3120
},
{
"epoch": 2.102081934184016,
"grad_norm": 0.3977266550064087,
"learning_rate": 1.6864129064784474e-05,
"loss": 0.0308,
"step": 3130
},
{
"epoch": 2.1087978509066487,
"grad_norm": 1.4212539196014404,
"learning_rate": 1.673808923619864e-05,
"loss": 0.0518,
"step": 3140
},
{
"epoch": 2.1155137676292814,
"grad_norm": 11.565996170043945,
"learning_rate": 1.6612049407612807e-05,
"loss": 0.0819,
"step": 3150
},
{
"epoch": 2.122229684351914,
"grad_norm": 2.776099920272827,
"learning_rate": 1.648600957902697e-05,
"loss": 0.055,
"step": 3160
},
{
"epoch": 2.1289456010745464,
"grad_norm": 0.0052488865330815315,
"learning_rate": 1.635996975044114e-05,
"loss": 0.0485,
"step": 3170
},
{
"epoch": 2.135661517797179,
"grad_norm": 0.7042301893234253,
"learning_rate": 1.6233929921855308e-05,
"loss": 0.0454,
"step": 3180
},
{
"epoch": 2.142377434519812,
"grad_norm": 0.3006635308265686,
"learning_rate": 1.6107890093269472e-05,
"loss": 0.0377,
"step": 3190
},
{
"epoch": 2.1490933512424446,
"grad_norm": 1.699796199798584,
"learning_rate": 1.598185026468364e-05,
"loss": 0.0731,
"step": 3200
},
{
"epoch": 2.1558092679650773,
"grad_norm": 0.20682166516780853,
"learning_rate": 1.5855810436097805e-05,
"loss": 0.0413,
"step": 3210
},
{
"epoch": 2.16252518468771,
"grad_norm": 0.0004898414481431246,
"learning_rate": 1.5729770607511973e-05,
"loss": 0.1173,
"step": 3220
},
{
"epoch": 2.1692411014103423,
"grad_norm": 0.0038841627538204193,
"learning_rate": 1.560373077892614e-05,
"loss": 0.0435,
"step": 3230
},
{
"epoch": 2.175957018132975,
"grad_norm": 0.014045453630387783,
"learning_rate": 1.547769095034031e-05,
"loss": 0.0457,
"step": 3240
},
{
"epoch": 2.1826729348556078,
"grad_norm": 0.010443809442222118,
"learning_rate": 1.5351651121754478e-05,
"loss": 0.0456,
"step": 3250
},
{
"epoch": 2.1893888515782405,
"grad_norm": 0.0003762775450013578,
"learning_rate": 1.5225611293168642e-05,
"loss": 0.0493,
"step": 3260
},
{
"epoch": 2.196104768300873,
"grad_norm": 0.3689648509025574,
"learning_rate": 1.509957146458281e-05,
"loss": 0.0494,
"step": 3270
},
{
"epoch": 2.2028206850235055,
"grad_norm": 5.371860243030824e-05,
"learning_rate": 1.4973531635996977e-05,
"loss": 0.0415,
"step": 3280
},
{
"epoch": 2.209536601746138,
"grad_norm": 0.3309116065502167,
"learning_rate": 1.4847491807411143e-05,
"loss": 0.0428,
"step": 3290
},
{
"epoch": 2.216252518468771,
"grad_norm": 0.4156160056591034,
"learning_rate": 1.472145197882531e-05,
"loss": 0.0647,
"step": 3300
},
{
"epoch": 2.2229684351914036,
"grad_norm": 0.44069477915763855,
"learning_rate": 1.4595412150239476e-05,
"loss": 0.0266,
"step": 3310
},
{
"epoch": 2.2296843519140364,
"grad_norm": 4.7501349449157715,
"learning_rate": 1.4469372321653642e-05,
"loss": 0.0934,
"step": 3320
},
{
"epoch": 2.236400268636669,
"grad_norm": 4.7144985728664324e-05,
"learning_rate": 1.434333249306781e-05,
"loss": 0.0488,
"step": 3330
},
{
"epoch": 2.2431161853593014,
"grad_norm": 0.0005517873796634376,
"learning_rate": 1.4217292664481977e-05,
"loss": 0.1232,
"step": 3340
},
{
"epoch": 2.249832102081934,
"grad_norm": 0.046205148100852966,
"learning_rate": 1.4091252835896143e-05,
"loss": 0.1431,
"step": 3350
},
{
"epoch": 2.256548018804567,
"grad_norm": 0.06340274959802628,
"learning_rate": 1.396521300731031e-05,
"loss": 0.0771,
"step": 3360
},
{
"epoch": 2.2632639355271995,
"grad_norm": 0.38464516401290894,
"learning_rate": 1.3839173178724476e-05,
"loss": 0.0383,
"step": 3370
},
{
"epoch": 2.2699798522498322,
"grad_norm": 1.4843387603759766,
"learning_rate": 1.3713133350138642e-05,
"loss": 0.0477,
"step": 3380
},
{
"epoch": 2.2766957689724645,
"grad_norm": 0.006335641257464886,
"learning_rate": 1.358709352155281e-05,
"loss": 0.0583,
"step": 3390
},
{
"epoch": 2.2834116856950972,
"grad_norm": 0.009423257783055305,
"learning_rate": 1.3461053692966979e-05,
"loss": 0.0493,
"step": 3400
},
{
"epoch": 2.29012760241773,
"grad_norm": 0.7405962347984314,
"learning_rate": 1.3335013864381147e-05,
"loss": 0.0444,
"step": 3410
},
{
"epoch": 2.2968435191403627,
"grad_norm": 0.13406887650489807,
"learning_rate": 1.3208974035795313e-05,
"loss": 0.0185,
"step": 3420
},
{
"epoch": 2.3035594358629954,
"grad_norm": 1.9850287437438965,
"learning_rate": 1.308293420720948e-05,
"loss": 0.0305,
"step": 3430
},
{
"epoch": 2.310275352585628,
"grad_norm": 0.35775044560432434,
"learning_rate": 1.2956894378623646e-05,
"loss": 0.0604,
"step": 3440
},
{
"epoch": 2.316991269308261,
"grad_norm": 0.46501487493515015,
"learning_rate": 1.2830854550037812e-05,
"loss": 0.0569,
"step": 3450
},
{
"epoch": 2.323707186030893,
"grad_norm": 0.0007875201408751309,
"learning_rate": 1.270481472145198e-05,
"loss": 0.0305,
"step": 3460
},
{
"epoch": 2.330423102753526,
"grad_norm": 0.0034116168972104788,
"learning_rate": 1.2578774892866147e-05,
"loss": 0.0443,
"step": 3470
},
{
"epoch": 2.3371390194761585,
"grad_norm": 0.4020250141620636,
"learning_rate": 1.2452735064280313e-05,
"loss": 0.0364,
"step": 3480
},
{
"epoch": 2.3438549361987913,
"grad_norm": 0.6396084427833557,
"learning_rate": 1.232669523569448e-05,
"loss": 0.0335,
"step": 3490
},
{
"epoch": 2.3505708529214235,
"grad_norm": 0.002272074343636632,
"learning_rate": 1.2200655407108646e-05,
"loss": 0.0348,
"step": 3500
},
{
"epoch": 2.3572867696440563,
"grad_norm": 0.36031338572502136,
"learning_rate": 1.2074615578522812e-05,
"loss": 0.0211,
"step": 3510
},
{
"epoch": 2.364002686366689,
"grad_norm": 1.6808172464370728,
"learning_rate": 1.194857574993698e-05,
"loss": 0.0838,
"step": 3520
},
{
"epoch": 2.3707186030893217,
"grad_norm": 0.00011936118971789256,
"learning_rate": 1.1822535921351149e-05,
"loss": 0.0281,
"step": 3530
},
{
"epoch": 2.3774345198119544,
"grad_norm": 0.0034045290667563677,
"learning_rate": 1.1696496092765315e-05,
"loss": 0.0384,
"step": 3540
},
{
"epoch": 2.384150436534587,
"grad_norm": 3.1306815799325705e-05,
"learning_rate": 1.1570456264179481e-05,
"loss": 0.026,
"step": 3550
},
{
"epoch": 2.39086635325722,
"grad_norm": 0.03722585365176201,
"learning_rate": 1.1444416435593648e-05,
"loss": 0.055,
"step": 3560
},
{
"epoch": 2.397582269979852,
"grad_norm": 0.00011495844955788925,
"learning_rate": 1.1318376607007814e-05,
"loss": 0.0821,
"step": 3570
},
{
"epoch": 2.404298186702485,
"grad_norm": 0.21863265335559845,
"learning_rate": 1.1192336778421982e-05,
"loss": 0.0207,
"step": 3580
},
{
"epoch": 2.4110141034251176,
"grad_norm": 0.3169233798980713,
"learning_rate": 1.1066296949836149e-05,
"loss": 0.0398,
"step": 3590
},
{
"epoch": 2.4177300201477503,
"grad_norm": 0.39215508103370667,
"learning_rate": 1.0940257121250315e-05,
"loss": 0.0436,
"step": 3600
},
{
"epoch": 2.4244459368703826,
"grad_norm": 0.9112158417701721,
"learning_rate": 1.0814217292664483e-05,
"loss": 0.059,
"step": 3610
},
{
"epoch": 2.4311618535930153,
"grad_norm": 0.2396714836359024,
"learning_rate": 1.068817746407865e-05,
"loss": 0.1052,
"step": 3620
},
{
"epoch": 2.437877770315648,
"grad_norm": 0.48746079206466675,
"learning_rate": 1.0562137635492816e-05,
"loss": 0.1301,
"step": 3630
},
{
"epoch": 2.4445936870382807,
"grad_norm": 0.2512687146663666,
"learning_rate": 1.0436097806906982e-05,
"loss": 0.0395,
"step": 3640
},
{
"epoch": 2.4513096037609134,
"grad_norm": 0.0035686984192579985,
"learning_rate": 1.031005797832115e-05,
"loss": 0.0353,
"step": 3650
},
{
"epoch": 2.458025520483546,
"grad_norm": 0.5211601257324219,
"learning_rate": 1.0184018149735317e-05,
"loss": 0.0351,
"step": 3660
},
{
"epoch": 2.464741437206179,
"grad_norm": 0.0005140244611538947,
"learning_rate": 1.0057978321149483e-05,
"loss": 0.024,
"step": 3670
},
{
"epoch": 2.471457353928811,
"grad_norm": 0.6637148261070251,
"learning_rate": 9.93193849256365e-06,
"loss": 0.0451,
"step": 3680
},
{
"epoch": 2.478173270651444,
"grad_norm": 1.191238522529602,
"learning_rate": 9.805898663977818e-06,
"loss": 0.0392,
"step": 3690
},
{
"epoch": 2.4848891873740766,
"grad_norm": 0.48035892844200134,
"learning_rate": 9.679858835391984e-06,
"loss": 0.071,
"step": 3700
},
{
"epoch": 2.4916051040967093,
"grad_norm": 0.0007876435411162674,
"learning_rate": 9.553819006806152e-06,
"loss": 0.0841,
"step": 3710
},
{
"epoch": 2.4983210208193416,
"grad_norm": 7.469072341918945,
"learning_rate": 9.427779178220319e-06,
"loss": 0.0629,
"step": 3720
},
{
"epoch": 2.5050369375419743,
"grad_norm": 0.004849706310778856,
"learning_rate": 9.301739349634485e-06,
"loss": 0.0444,
"step": 3730
},
{
"epoch": 2.511752854264607,
"grad_norm": 0.26340028643608093,
"learning_rate": 9.175699521048651e-06,
"loss": 0.0256,
"step": 3740
},
{
"epoch": 2.5184687709872398,
"grad_norm": 0.5283430218696594,
"learning_rate": 9.049659692462818e-06,
"loss": 0.0177,
"step": 3750
},
{
"epoch": 2.5251846877098725,
"grad_norm": 2.114301605615765e-05,
"learning_rate": 8.923619863876984e-06,
"loss": 0.0215,
"step": 3760
},
{
"epoch": 2.531900604432505,
"grad_norm": 0.432597279548645,
"learning_rate": 8.797580035291152e-06,
"loss": 0.0784,
"step": 3770
},
{
"epoch": 2.538616521155138,
"grad_norm": 0.00014320577611215413,
"learning_rate": 8.67154020670532e-06,
"loss": 0.0582,
"step": 3780
},
{
"epoch": 2.54533243787777,
"grad_norm": 0.00046885263873264194,
"learning_rate": 8.545500378119487e-06,
"loss": 0.0253,
"step": 3790
},
{
"epoch": 2.552048354600403,
"grad_norm": 0.0021923587191849947,
"learning_rate": 8.419460549533653e-06,
"loss": 0.0598,
"step": 3800
},
{
"epoch": 2.5587642713230356,
"grad_norm": 0.3422527015209198,
"learning_rate": 8.29342072094782e-06,
"loss": 0.0442,
"step": 3810
},
{
"epoch": 2.5654801880456684,
"grad_norm": 0.006465827114880085,
"learning_rate": 8.167380892361986e-06,
"loss": 0.0194,
"step": 3820
},
{
"epoch": 2.5721961047683006,
"grad_norm": 9.287328430218622e-05,
"learning_rate": 8.041341063776154e-06,
"loss": 0.0813,
"step": 3830
},
{
"epoch": 2.5789120214909333,
"grad_norm": 1.295121669769287,
"learning_rate": 7.91530123519032e-06,
"loss": 0.0261,
"step": 3840
},
{
"epoch": 2.585627938213566,
"grad_norm": 1.8356350660324097,
"learning_rate": 7.789261406604489e-06,
"loss": 0.0862,
"step": 3850
},
{
"epoch": 2.592343854936199,
"grad_norm": 0.40983912348747253,
"learning_rate": 7.663221578018655e-06,
"loss": 0.0302,
"step": 3860
},
{
"epoch": 2.5990597716588315,
"grad_norm": 0.001640897593460977,
"learning_rate": 7.537181749432821e-06,
"loss": 0.0264,
"step": 3870
},
{
"epoch": 2.6057756883814642,
"grad_norm": 2.8107948310207576e-05,
"learning_rate": 7.4111419208469886e-06,
"loss": 0.0615,
"step": 3880
},
{
"epoch": 2.612491605104097,
"grad_norm": 3.114527862635441e-05,
"learning_rate": 7.285102092261155e-06,
"loss": 0.0594,
"step": 3890
},
{
"epoch": 2.6192075218267292,
"grad_norm": 0.10608917474746704,
"learning_rate": 7.159062263675321e-06,
"loss": 0.0329,
"step": 3900
},
{
"epoch": 2.625923438549362,
"grad_norm": 0.0014250510139390826,
"learning_rate": 7.033022435089489e-06,
"loss": 0.0758,
"step": 3910
},
{
"epoch": 2.6326393552719947,
"grad_norm": 0.3084910213947296,
"learning_rate": 6.906982606503655e-06,
"loss": 0.0241,
"step": 3920
},
{
"epoch": 2.6393552719946274,
"grad_norm": 2.416365168755874e-05,
"learning_rate": 6.780942777917821e-06,
"loss": 0.0247,
"step": 3930
},
{
"epoch": 2.6460711887172597,
"grad_norm": 0.45002567768096924,
"learning_rate": 6.6549029493319895e-06,
"loss": 0.0558,
"step": 3940
},
{
"epoch": 2.6527871054398924,
"grad_norm": 2.132117986679077,
"learning_rate": 6.528863120746157e-06,
"loss": 0.0902,
"step": 3950
},
{
"epoch": 2.659503022162525,
"grad_norm": 0.6513732075691223,
"learning_rate": 6.402823292160323e-06,
"loss": 0.0566,
"step": 3960
},
{
"epoch": 2.666218938885158,
"grad_norm": 0.0017292428528890014,
"learning_rate": 6.2767834635744895e-06,
"loss": 0.0324,
"step": 3970
},
{
"epoch": 2.6729348556077905,
"grad_norm": 0.3677961826324463,
"learning_rate": 6.150743634988657e-06,
"loss": 0.0529,
"step": 3980
},
{
"epoch": 2.6796507723304233,
"grad_norm": 0.43927228450775146,
"learning_rate": 6.024703806402823e-06,
"loss": 0.0385,
"step": 3990
},
{
"epoch": 2.686366689053056,
"grad_norm": 0.47322943806648254,
"learning_rate": 5.89866397781699e-06,
"loss": 0.0735,
"step": 4000
},
{
"epoch": 2.6930826057756883,
"grad_norm": 0.00010313421807950363,
"learning_rate": 5.772624149231158e-06,
"loss": 0.062,
"step": 4010
},
{
"epoch": 2.699798522498321,
"grad_norm": 2.2094976902008057,
"learning_rate": 5.646584320645324e-06,
"loss": 0.0495,
"step": 4020
},
{
"epoch": 2.7065144392209537,
"grad_norm": 5.995333776809275e-05,
"learning_rate": 5.5205444920594905e-06,
"loss": 0.0336,
"step": 4030
},
{
"epoch": 2.7132303559435864,
"grad_norm": 0.1308954805135727,
"learning_rate": 5.3945046634736586e-06,
"loss": 0.0457,
"step": 4040
},
{
"epoch": 2.7199462726662187,
"grad_norm": 0.10739518702030182,
"learning_rate": 5.268464834887825e-06,
"loss": 0.0205,
"step": 4050
},
{
"epoch": 2.7266621893888514,
"grad_norm": 0.45175987482070923,
"learning_rate": 5.142425006301991e-06,
"loss": 0.0293,
"step": 4060
},
{
"epoch": 2.733378106111484,
"grad_norm": 0.4527464210987091,
"learning_rate": 5.016385177716159e-06,
"loss": 0.049,
"step": 4070
},
{
"epoch": 2.740094022834117,
"grad_norm": 0.9216799736022949,
"learning_rate": 4.890345349130326e-06,
"loss": 0.0472,
"step": 4080
},
{
"epoch": 2.7468099395567496,
"grad_norm": 0.4577857553958893,
"learning_rate": 4.764305520544492e-06,
"loss": 0.0317,
"step": 4090
},
{
"epoch": 2.7535258562793823,
"grad_norm": 0.2905367910861969,
"learning_rate": 4.6382656919586595e-06,
"loss": 0.0304,
"step": 4100
},
{
"epoch": 2.760241773002015,
"grad_norm": 2.5396127700805664,
"learning_rate": 4.512225863372826e-06,
"loss": 0.0583,
"step": 4110
},
{
"epoch": 2.7669576897246473,
"grad_norm": 0.0009265750413760543,
"learning_rate": 4.386186034786993e-06,
"loss": 0.0323,
"step": 4120
},
{
"epoch": 2.77367360644728,
"grad_norm": 0.8636244535446167,
"learning_rate": 4.2601462062011595e-06,
"loss": 0.074,
"step": 4130
},
{
"epoch": 2.7803895231699127,
"grad_norm": 0.0012430851347744465,
"learning_rate": 4.134106377615327e-06,
"loss": 0.03,
"step": 4140
},
{
"epoch": 2.7871054398925454,
"grad_norm": 0.567550778388977,
"learning_rate": 4.008066549029493e-06,
"loss": 0.0888,
"step": 4150
},
{
"epoch": 2.7938213566151777,
"grad_norm": 0.0012551085092127323,
"learning_rate": 3.88202672044366e-06,
"loss": 0.078,
"step": 4160
},
{
"epoch": 2.8005372733378104,
"grad_norm": 0.6136677265167236,
"learning_rate": 3.7559868918578272e-06,
"loss": 0.0619,
"step": 4170
},
{
"epoch": 2.807253190060443,
"grad_norm": 0.34767717123031616,
"learning_rate": 3.629947063271994e-06,
"loss": 0.0572,
"step": 4180
},
{
"epoch": 2.813969106783076,
"grad_norm": 0.26787152886390686,
"learning_rate": 3.503907234686161e-06,
"loss": 0.0449,
"step": 4190
},
{
"epoch": 2.8206850235057086,
"grad_norm": 0.8297312259674072,
"learning_rate": 3.377867406100328e-06,
"loss": 0.0545,
"step": 4200
},
{
"epoch": 2.8274009402283413,
"grad_norm": 8.094152144622058e-05,
"learning_rate": 3.251827577514495e-06,
"loss": 0.0491,
"step": 4210
},
{
"epoch": 2.834116856950974,
"grad_norm": 2.410463571548462,
"learning_rate": 3.1257877489286617e-06,
"loss": 0.0564,
"step": 4220
},
{
"epoch": 2.8408327736736063,
"grad_norm": 0.9952247142791748,
"learning_rate": 2.999747920342828e-06,
"loss": 0.0714,
"step": 4230
},
{
"epoch": 2.847548690396239,
"grad_norm": 0.4516734480857849,
"learning_rate": 2.8737080917569954e-06,
"loss": 0.047,
"step": 4240
},
{
"epoch": 2.8542646071188718,
"grad_norm": 0.3798590898513794,
"learning_rate": 2.747668263171162e-06,
"loss": 0.0654,
"step": 4250
},
{
"epoch": 2.8609805238415045,
"grad_norm": 1.9686492681503296,
"learning_rate": 2.621628434585329e-06,
"loss": 0.0626,
"step": 4260
},
{
"epoch": 2.8676964405641368,
"grad_norm": 0.4292641580104828,
"learning_rate": 2.495588605999496e-06,
"loss": 0.0485,
"step": 4270
},
{
"epoch": 2.8744123572867695,
"grad_norm": 5.976020111120306e-05,
"learning_rate": 2.3695487774136627e-06,
"loss": 0.0338,
"step": 4280
},
{
"epoch": 2.881128274009402,
"grad_norm": 0.027319636195898056,
"learning_rate": 2.2435089488278295e-06,
"loss": 0.0465,
"step": 4290
},
{
"epoch": 2.887844190732035,
"grad_norm": 0.37028828263282776,
"learning_rate": 2.1174691202419967e-06,
"loss": 0.0311,
"step": 4300
},
{
"epoch": 2.8945601074546676,
"grad_norm": 0.44889381527900696,
"learning_rate": 1.991429291656163e-06,
"loss": 0.0414,
"step": 4310
},
{
"epoch": 2.9012760241773004,
"grad_norm": 0.5002055168151855,
"learning_rate": 1.8653894630703304e-06,
"loss": 0.07,
"step": 4320
},
{
"epoch": 2.907991940899933,
"grad_norm": 0.003884148085489869,
"learning_rate": 1.739349634484497e-06,
"loss": 0.0356,
"step": 4330
},
{
"epoch": 2.9147078576225653,
"grad_norm": 0.35928064584732056,
"learning_rate": 1.613309805898664e-06,
"loss": 0.0508,
"step": 4340
},
{
"epoch": 2.921423774345198,
"grad_norm": 1.2552955150604248,
"learning_rate": 1.487269977312831e-06,
"loss": 0.0427,
"step": 4350
},
{
"epoch": 2.928139691067831,
"grad_norm": 0.007359993644058704,
"learning_rate": 1.3612301487269979e-06,
"loss": 0.0253,
"step": 4360
},
{
"epoch": 2.9348556077904635,
"grad_norm": 1.1389840841293335,
"learning_rate": 1.2351903201411647e-06,
"loss": 0.0632,
"step": 4370
},
{
"epoch": 2.941571524513096,
"grad_norm": 0.8267861008644104,
"learning_rate": 1.1091504915553315e-06,
"loss": 0.0396,
"step": 4380
},
{
"epoch": 2.9482874412357285,
"grad_norm": 3.601860284805298,
"learning_rate": 9.831106629694983e-07,
"loss": 0.0484,
"step": 4390
},
{
"epoch": 2.955003357958361,
"grad_norm": 0.8058955073356628,
"learning_rate": 8.570708343836652e-07,
"loss": 0.0346,
"step": 4400
},
{
"epoch": 2.961719274680994,
"grad_norm": 0.0007326535996980965,
"learning_rate": 7.310310057978322e-07,
"loss": 0.0317,
"step": 4410
},
{
"epoch": 2.9684351914036267,
"grad_norm": 0.9190682768821716,
"learning_rate": 6.04991177211999e-07,
"loss": 0.0232,
"step": 4420
},
{
"epoch": 2.9751511081262594,
"grad_norm": 0.0039444975554943085,
"learning_rate": 4.789513486261659e-07,
"loss": 0.0685,
"step": 4430
},
{
"epoch": 2.981867024848892,
"grad_norm": 0.9105575680732727,
"learning_rate": 3.5291152004033275e-07,
"loss": 0.0481,
"step": 4440
},
{
"epoch": 2.9885829415715244,
"grad_norm": 0.0033778748475015163,
"learning_rate": 2.2687169145449963e-07,
"loss": 0.0524,
"step": 4450
},
{
"epoch": 2.995298858294157,
"grad_norm": 0.5608463287353516,
"learning_rate": 1.008318628686665e-07,
"loss": 0.0465,
"step": 4460
}
],
"logging_steps": 10,
"max_steps": 4467,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1521654161384448.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}