{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.6109660574412534,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013054830287206266,
"grad_norm": 43.415225982666016,
"learning_rate": 4.978241949521324e-05,
"loss": 2.9884,
"step": 10
},
{
"epoch": 0.02610966057441253,
"grad_norm": 49.437705993652344,
"learning_rate": 4.956483899042646e-05,
"loss": 2.9654,
"step": 20
},
{
"epoch": 0.0391644908616188,
"grad_norm": 47.24225997924805,
"learning_rate": 4.934725848563969e-05,
"loss": 3.5102,
"step": 30
},
{
"epoch": 0.05221932114882506,
"grad_norm": 32.93499755859375,
"learning_rate": 4.912967798085292e-05,
"loss": 2.4143,
"step": 40
},
{
"epoch": 0.06527415143603134,
"grad_norm": 31.01405143737793,
"learning_rate": 4.891209747606615e-05,
"loss": 2.6864,
"step": 50
},
{
"epoch": 0.0783289817232376,
"grad_norm": 77.32862854003906,
"learning_rate": 4.8694516971279375e-05,
"loss": 2.7403,
"step": 60
},
{
"epoch": 0.09138381201044386,
"grad_norm": 28.313295364379883,
"learning_rate": 4.84769364664926e-05,
"loss": 2.5107,
"step": 70
},
{
"epoch": 0.10443864229765012,
"grad_norm": 38.45579528808594,
"learning_rate": 4.825935596170583e-05,
"loss": 2.6546,
"step": 80
},
{
"epoch": 0.1174934725848564,
"grad_norm": 41.91643142700195,
"learning_rate": 4.8041775456919065e-05,
"loss": 2.82,
"step": 90
},
{
"epoch": 0.13054830287206268,
"grad_norm": 36.32301712036133,
"learning_rate": 4.782419495213229e-05,
"loss": 2.9063,
"step": 100
},
{
"epoch": 0.14360313315926893,
"grad_norm": 30.05735969543457,
"learning_rate": 4.760661444734552e-05,
"loss": 2.5942,
"step": 110
},
{
"epoch": 0.1566579634464752,
"grad_norm": 38.422706604003906,
"learning_rate": 4.738903394255875e-05,
"loss": 2.5929,
"step": 120
},
{
"epoch": 0.16971279373368145,
"grad_norm": 49.74126052856445,
"learning_rate": 4.7171453437771976e-05,
"loss": 2.5271,
"step": 130
},
{
"epoch": 0.18276762402088773,
"grad_norm": 31.077625274658203,
"learning_rate": 4.6953872932985203e-05,
"loss": 2.6357,
"step": 140
},
{
"epoch": 0.195822454308094,
"grad_norm": 24.155317306518555,
"learning_rate": 4.673629242819844e-05,
"loss": 2.5678,
"step": 150
},
{
"epoch": 0.20887728459530025,
"grad_norm": 30.788408279418945,
"learning_rate": 4.651871192341166e-05,
"loss": 2.8182,
"step": 160
},
{
"epoch": 0.22193211488250653,
"grad_norm": 25.434738159179688,
"learning_rate": 4.630113141862489e-05,
"loss": 3.303,
"step": 170
},
{
"epoch": 0.2349869451697128,
"grad_norm": 37.103668212890625,
"learning_rate": 4.608355091383813e-05,
"loss": 2.5012,
"step": 180
},
{
"epoch": 0.24804177545691905,
"grad_norm": 70.19502258300781,
"learning_rate": 4.586597040905135e-05,
"loss": 2.812,
"step": 190
},
{
"epoch": 0.26109660574412535,
"grad_norm": 37.93436050415039,
"learning_rate": 4.564838990426458e-05,
"loss": 2.6683,
"step": 200
},
{
"epoch": 0.2741514360313316,
"grad_norm": 149.83016967773438,
"learning_rate": 4.543080939947781e-05,
"loss": 2.9776,
"step": 210
},
{
"epoch": 0.28720626631853785,
"grad_norm": 31.74551010131836,
"learning_rate": 4.521322889469104e-05,
"loss": 2.9663,
"step": 220
},
{
"epoch": 0.3002610966057441,
"grad_norm": 34.0869255065918,
"learning_rate": 4.4995648389904266e-05,
"loss": 2.712,
"step": 230
},
{
"epoch": 0.3133159268929504,
"grad_norm": 28.85022735595703,
"learning_rate": 4.47780678851175e-05,
"loss": 2.3012,
"step": 240
},
{
"epoch": 0.3263707571801567,
"grad_norm": 36.18962860107422,
"learning_rate": 4.456048738033072e-05,
"loss": 2.4339,
"step": 250
},
{
"epoch": 0.3394255874673629,
"grad_norm": 27.252077102661133,
"learning_rate": 4.4342906875543956e-05,
"loss": 2.851,
"step": 260
},
{
"epoch": 0.3524804177545692,
"grad_norm": 38.39606857299805,
"learning_rate": 4.4125326370757184e-05,
"loss": 2.6867,
"step": 270
},
{
"epoch": 0.36553524804177545,
"grad_norm": 25.8907527923584,
"learning_rate": 4.390774586597041e-05,
"loss": 2.4467,
"step": 280
},
{
"epoch": 0.3785900783289817,
"grad_norm": 24.98986053466797,
"learning_rate": 4.369016536118364e-05,
"loss": 3.0278,
"step": 290
},
{
"epoch": 0.391644908616188,
"grad_norm": 24.536916732788086,
"learning_rate": 4.347258485639687e-05,
"loss": 2.3857,
"step": 300
},
{
"epoch": 0.4046997389033943,
"grad_norm": 22.012798309326172,
"learning_rate": 4.3255004351610094e-05,
"loss": 2.1414,
"step": 310
},
{
"epoch": 0.4177545691906005,
"grad_norm": 25.466167449951172,
"learning_rate": 4.303742384682333e-05,
"loss": 2.5714,
"step": 320
},
{
"epoch": 0.4308093994778068,
"grad_norm": 31.785062789916992,
"learning_rate": 4.281984334203655e-05,
"loss": 2.4608,
"step": 330
},
{
"epoch": 0.44386422976501305,
"grad_norm": 36.67721176147461,
"learning_rate": 4.2602262837249784e-05,
"loss": 2.787,
"step": 340
},
{
"epoch": 0.45691906005221933,
"grad_norm": 49.02054214477539,
"learning_rate": 4.238468233246302e-05,
"loss": 2.7207,
"step": 350
},
{
"epoch": 0.4699738903394256,
"grad_norm": 26.876636505126953,
"learning_rate": 4.216710182767624e-05,
"loss": 2.6041,
"step": 360
},
{
"epoch": 0.4830287206266319,
"grad_norm": 34.956336975097656,
"learning_rate": 4.1949521322889474e-05,
"loss": 3.1905,
"step": 370
},
{
"epoch": 0.4960835509138381,
"grad_norm": 35.72273254394531,
"learning_rate": 4.17319408181027e-05,
"loss": 2.121,
"step": 380
},
{
"epoch": 0.5091383812010444,
"grad_norm": 28.895980834960938,
"learning_rate": 4.151436031331593e-05,
"loss": 2.6956,
"step": 390
},
{
"epoch": 0.5221932114882507,
"grad_norm": 28.925390243530273,
"learning_rate": 4.129677980852916e-05,
"loss": 3.0316,
"step": 400
},
{
"epoch": 0.5352480417754569,
"grad_norm": 34.79185485839844,
"learning_rate": 4.107919930374239e-05,
"loss": 3.0389,
"step": 410
},
{
"epoch": 0.5483028720626631,
"grad_norm": 30.246923446655273,
"learning_rate": 4.086161879895561e-05,
"loss": 2.8086,
"step": 420
},
{
"epoch": 0.5613577023498695,
"grad_norm": 32.78372573852539,
"learning_rate": 4.064403829416885e-05,
"loss": 2.814,
"step": 430
},
{
"epoch": 0.5744125326370757,
"grad_norm": 24.346147537231445,
"learning_rate": 4.0426457789382075e-05,
"loss": 2.8039,
"step": 440
},
{
"epoch": 0.587467362924282,
"grad_norm": 31.166654586791992,
"learning_rate": 4.02088772845953e-05,
"loss": 2.9612,
"step": 450
},
{
"epoch": 0.6005221932114883,
"grad_norm": 23.0938777923584,
"learning_rate": 3.999129677980853e-05,
"loss": 2.5463,
"step": 460
},
{
"epoch": 0.6135770234986945,
"grad_norm": 26.590911865234375,
"learning_rate": 3.977371627502176e-05,
"loss": 2.3543,
"step": 470
},
{
"epoch": 0.6266318537859008,
"grad_norm": 29.803422927856445,
"learning_rate": 3.9556135770234985e-05,
"loss": 2.7445,
"step": 480
},
{
"epoch": 0.639686684073107,
"grad_norm": 46.66853713989258,
"learning_rate": 3.933855526544822e-05,
"loss": 2.4905,
"step": 490
},
{
"epoch": 0.6527415143603134,
"grad_norm": 39.04319381713867,
"learning_rate": 3.912097476066145e-05,
"loss": 1.8311,
"step": 500
},
{
"epoch": 0.6657963446475196,
"grad_norm": 30.50276756286621,
"learning_rate": 3.8903394255874675e-05,
"loss": 2.8152,
"step": 510
},
{
"epoch": 0.6788511749347258,
"grad_norm": 37.25984191894531,
"learning_rate": 3.868581375108791e-05,
"loss": 2.4759,
"step": 520
},
{
"epoch": 0.6919060052219321,
"grad_norm": 25.89512062072754,
"learning_rate": 3.846823324630113e-05,
"loss": 2.486,
"step": 530
},
{
"epoch": 0.7049608355091384,
"grad_norm": 37.318450927734375,
"learning_rate": 3.8250652741514365e-05,
"loss": 2.77,
"step": 540
},
{
"epoch": 0.7180156657963447,
"grad_norm": 34.45144271850586,
"learning_rate": 3.803307223672759e-05,
"loss": 2.2646,
"step": 550
},
{
"epoch": 0.7310704960835509,
"grad_norm": 50.494144439697266,
"learning_rate": 3.781549173194082e-05,
"loss": 2.7686,
"step": 560
},
{
"epoch": 0.7441253263707572,
"grad_norm": 29.753643035888672,
"learning_rate": 3.759791122715405e-05,
"loss": 2.6239,
"step": 570
},
{
"epoch": 0.7571801566579635,
"grad_norm": 39.54145431518555,
"learning_rate": 3.738033072236728e-05,
"loss": 2.4995,
"step": 580
},
{
"epoch": 0.7702349869451697,
"grad_norm": 36.82713317871094,
"learning_rate": 3.71627502175805e-05,
"loss": 3.0274,
"step": 590
},
{
"epoch": 0.783289817232376,
"grad_norm": 36.62627410888672,
"learning_rate": 3.694516971279374e-05,
"loss": 2.2364,
"step": 600
},
{
"epoch": 0.7963446475195822,
"grad_norm": 18.279882431030273,
"learning_rate": 3.6727589208006965e-05,
"loss": 2.3391,
"step": 610
},
{
"epoch": 0.8093994778067886,
"grad_norm": 23.61455535888672,
"learning_rate": 3.651000870322019e-05,
"loss": 2.5222,
"step": 620
},
{
"epoch": 0.8224543080939948,
"grad_norm": 32.03522872924805,
"learning_rate": 3.629242819843342e-05,
"loss": 2.2871,
"step": 630
},
{
"epoch": 0.835509138381201,
"grad_norm": 45.24649429321289,
"learning_rate": 3.607484769364665e-05,
"loss": 2.6863,
"step": 640
},
{
"epoch": 0.8485639686684073,
"grad_norm": 24.39188575744629,
"learning_rate": 3.5857267188859876e-05,
"loss": 2.6426,
"step": 650
},
{
"epoch": 0.8616187989556136,
"grad_norm": 21.67547607421875,
"learning_rate": 3.563968668407311e-05,
"loss": 2.1157,
"step": 660
},
{
"epoch": 0.8746736292428199,
"grad_norm": 24.245168685913086,
"learning_rate": 3.542210617928634e-05,
"loss": 2.3781,
"step": 670
},
{
"epoch": 0.8877284595300261,
"grad_norm": 27.57684326171875,
"learning_rate": 3.5204525674499566e-05,
"loss": 2.7673,
"step": 680
},
{
"epoch": 0.9007832898172323,
"grad_norm": 28.42872428894043,
"learning_rate": 3.49869451697128e-05,
"loss": 2.4503,
"step": 690
},
{
"epoch": 0.9138381201044387,
"grad_norm": 39.387813568115234,
"learning_rate": 3.476936466492602e-05,
"loss": 2.1765,
"step": 700
},
{
"epoch": 0.9268929503916449,
"grad_norm": 20.197811126708984,
"learning_rate": 3.4551784160139256e-05,
"loss": 2.555,
"step": 710
},
{
"epoch": 0.9399477806788512,
"grad_norm": 22.066137313842773,
"learning_rate": 3.4334203655352484e-05,
"loss": 2.4827,
"step": 720
},
{
"epoch": 0.9530026109660574,
"grad_norm": 32.67851638793945,
"learning_rate": 3.411662315056571e-05,
"loss": 2.762,
"step": 730
},
{
"epoch": 0.9660574412532638,
"grad_norm": 28.471988677978516,
"learning_rate": 3.389904264577894e-05,
"loss": 2.5872,
"step": 740
},
{
"epoch": 0.97911227154047,
"grad_norm": 22.934885025024414,
"learning_rate": 3.368146214099217e-05,
"loss": 2.8826,
"step": 750
},
{
"epoch": 0.9921671018276762,
"grad_norm": 24.063716888427734,
"learning_rate": 3.3463881636205394e-05,
"loss": 2.8909,
"step": 760
},
{
"epoch": 1.0,
"eval_loss": 2.3283705711364746,
"eval_runtime": 12.1219,
"eval_samples_per_second": 112.276,
"eval_steps_per_second": 14.107,
"step": 766
},
{
"epoch": 1.0052219321148825,
"grad_norm": 36.15023422241211,
"learning_rate": 3.324630113141863e-05,
"loss": 2.5282,
"step": 770
},
{
"epoch": 1.0182767624020888,
"grad_norm": 35.99642562866211,
"learning_rate": 3.3028720626631856e-05,
"loss": 2.176,
"step": 780
},
{
"epoch": 1.031331592689295,
"grad_norm": 30.217031478881836,
"learning_rate": 3.2811140121845084e-05,
"loss": 2.2727,
"step": 790
},
{
"epoch": 1.0443864229765012,
"grad_norm": 29.16168212890625,
"learning_rate": 3.259355961705831e-05,
"loss": 2.0302,
"step": 800
},
{
"epoch": 1.0574412532637076,
"grad_norm": 25.400541305541992,
"learning_rate": 3.237597911227154e-05,
"loss": 2.3489,
"step": 810
},
{
"epoch": 1.0704960835509139,
"grad_norm": 21.281591415405273,
"learning_rate": 3.215839860748477e-05,
"loss": 2.3976,
"step": 820
},
{
"epoch": 1.08355091383812,
"grad_norm": 23.941238403320312,
"learning_rate": 3.1940818102698e-05,
"loss": 2.6169,
"step": 830
},
{
"epoch": 1.0966057441253263,
"grad_norm": 26.626665115356445,
"learning_rate": 3.172323759791123e-05,
"loss": 2.6102,
"step": 840
},
{
"epoch": 1.1096605744125327,
"grad_norm": 28.539621353149414,
"learning_rate": 3.150565709312446e-05,
"loss": 2.019,
"step": 850
},
{
"epoch": 1.122715404699739,
"grad_norm": 36.77280044555664,
"learning_rate": 3.128807658833769e-05,
"loss": 2.2338,
"step": 860
},
{
"epoch": 1.1357702349869452,
"grad_norm": 38.7175407409668,
"learning_rate": 3.107049608355091e-05,
"loss": 2.7259,
"step": 870
},
{
"epoch": 1.1488250652741514,
"grad_norm": 32.9740104675293,
"learning_rate": 3.085291557876415e-05,
"loss": 2.294,
"step": 880
},
{
"epoch": 1.1618798955613576,
"grad_norm": 35.01115036010742,
"learning_rate": 3.0635335073977374e-05,
"loss": 2.5392,
"step": 890
},
{
"epoch": 1.174934725848564,
"grad_norm": 40.960968017578125,
"learning_rate": 3.0417754569190606e-05,
"loss": 1.8062,
"step": 900
},
{
"epoch": 1.1879895561357703,
"grad_norm": 24.627063751220703,
"learning_rate": 3.020017406440383e-05,
"loss": 2.746,
"step": 910
},
{
"epoch": 1.2010443864229765,
"grad_norm": 28.861692428588867,
"learning_rate": 2.998259355961706e-05,
"loss": 2.1434,
"step": 920
},
{
"epoch": 1.2140992167101827,
"grad_norm": 30.32466697692871,
"learning_rate": 2.976501305483029e-05,
"loss": 2.8282,
"step": 930
},
{
"epoch": 1.227154046997389,
"grad_norm": 45.79476547241211,
"learning_rate": 2.954743255004352e-05,
"loss": 2.1972,
"step": 940
},
{
"epoch": 1.2402088772845954,
"grad_norm": 34.27708435058594,
"learning_rate": 2.9329852045256744e-05,
"loss": 3.0359,
"step": 950
},
{
"epoch": 1.2532637075718016,
"grad_norm": 33.62773513793945,
"learning_rate": 2.9112271540469975e-05,
"loss": 2.4254,
"step": 960
},
{
"epoch": 1.2663185378590078,
"grad_norm": 26.693920135498047,
"learning_rate": 2.8894691035683203e-05,
"loss": 1.883,
"step": 970
},
{
"epoch": 1.279373368146214,
"grad_norm": 36.45111083984375,
"learning_rate": 2.8677110530896434e-05,
"loss": 2.232,
"step": 980
},
{
"epoch": 1.2924281984334205,
"grad_norm": 34.38032913208008,
"learning_rate": 2.845953002610966e-05,
"loss": 2.4289,
"step": 990
},
{
"epoch": 1.3054830287206267,
"grad_norm": 19.068925857543945,
"learning_rate": 2.8241949521322892e-05,
"loss": 2.473,
"step": 1000
},
{
"epoch": 1.318537859007833,
"grad_norm": 30.37474250793457,
"learning_rate": 2.8024369016536117e-05,
"loss": 2.6123,
"step": 1010
},
{
"epoch": 1.3315926892950392,
"grad_norm": 29.176149368286133,
"learning_rate": 2.7806788511749348e-05,
"loss": 2.3158,
"step": 1020
},
{
"epoch": 1.3446475195822454,
"grad_norm": 27.721553802490234,
"learning_rate": 2.758920800696258e-05,
"loss": 2.3646,
"step": 1030
},
{
"epoch": 1.3577023498694518,
"grad_norm": 12.712676048278809,
"learning_rate": 2.7371627502175807e-05,
"loss": 2.1114,
"step": 1040
},
{
"epoch": 1.370757180156658,
"grad_norm": 23.869230270385742,
"learning_rate": 2.7154046997389038e-05,
"loss": 2.1335,
"step": 1050
},
{
"epoch": 1.3838120104438643,
"grad_norm": 23.66261100769043,
"learning_rate": 2.6936466492602262e-05,
"loss": 2.0774,
"step": 1060
},
{
"epoch": 1.3968668407310705,
"grad_norm": 44.03451156616211,
"learning_rate": 2.6718885987815496e-05,
"loss": 2.6329,
"step": 1070
},
{
"epoch": 1.4099216710182767,
"grad_norm": 20.849573135375977,
"learning_rate": 2.650130548302872e-05,
"loss": 2.8549,
"step": 1080
},
{
"epoch": 1.4229765013054831,
"grad_norm": 27.392250061035156,
"learning_rate": 2.6283724978241952e-05,
"loss": 2.4537,
"step": 1090
},
{
"epoch": 1.4360313315926894,
"grad_norm": 25.04733657836914,
"learning_rate": 2.606614447345518e-05,
"loss": 1.9425,
"step": 1100
},
{
"epoch": 1.4490861618798956,
"grad_norm": 32.247894287109375,
"learning_rate": 2.584856396866841e-05,
"loss": 2.0056,
"step": 1110
},
{
"epoch": 1.4621409921671018,
"grad_norm": 32.498191833496094,
"learning_rate": 2.5630983463881635e-05,
"loss": 2.6457,
"step": 1120
},
{
"epoch": 1.475195822454308,
"grad_norm": 26.728214263916016,
"learning_rate": 2.5413402959094866e-05,
"loss": 1.6954,
"step": 1130
},
{
"epoch": 1.4882506527415145,
"grad_norm": 63.32724380493164,
"learning_rate": 2.5195822454308094e-05,
"loss": 2.5008,
"step": 1140
},
{
"epoch": 1.5013054830287205,
"grad_norm": 24.65690803527832,
"learning_rate": 2.4978241949521325e-05,
"loss": 1.5901,
"step": 1150
},
{
"epoch": 1.514360313315927,
"grad_norm": 19.37055206298828,
"learning_rate": 2.4760661444734552e-05,
"loss": 1.9548,
"step": 1160
},
{
"epoch": 1.5274151436031331,
"grad_norm": 23.001806259155273,
"learning_rate": 2.4543080939947783e-05,
"loss": 2.382,
"step": 1170
},
{
"epoch": 1.5404699738903394,
"grad_norm": 26.997100830078125,
"learning_rate": 2.432550043516101e-05,
"loss": 2.2834,
"step": 1180
},
{
"epoch": 1.5535248041775458,
"grad_norm": 22.0489559173584,
"learning_rate": 2.410791993037424e-05,
"loss": 2.1768,
"step": 1190
},
{
"epoch": 1.566579634464752,
"grad_norm": 29.986967086791992,
"learning_rate": 2.389033942558747e-05,
"loss": 2.0854,
"step": 1200
},
{
"epoch": 1.5796344647519582,
"grad_norm": 34.0990104675293,
"learning_rate": 2.3672758920800698e-05,
"loss": 2.4429,
"step": 1210
},
{
"epoch": 1.5926892950391645,
"grad_norm": 25.31661033630371,
"learning_rate": 2.3455178416013925e-05,
"loss": 2.3145,
"step": 1220
},
{
"epoch": 1.6057441253263707,
"grad_norm": 30.300716400146484,
"learning_rate": 2.3237597911227156e-05,
"loss": 1.9357,
"step": 1230
},
{
"epoch": 1.6187989556135771,
"grad_norm": 25.158327102661133,
"learning_rate": 2.3020017406440384e-05,
"loss": 2.3195,
"step": 1240
},
{
"epoch": 1.6318537859007833,
"grad_norm": 33.35712432861328,
"learning_rate": 2.280243690165361e-05,
"loss": 2.4724,
"step": 1250
},
{
"epoch": 1.6449086161879896,
"grad_norm": 22.938852310180664,
"learning_rate": 2.258485639686684e-05,
"loss": 2.3876,
"step": 1260
},
{
"epoch": 1.6579634464751958,
"grad_norm": 26.44889259338379,
"learning_rate": 2.236727589208007e-05,
"loss": 2.1164,
"step": 1270
},
{
"epoch": 1.671018276762402,
"grad_norm": 28.251296997070312,
"learning_rate": 2.2149695387293298e-05,
"loss": 2.3045,
"step": 1280
},
{
"epoch": 1.6840731070496084,
"grad_norm": 28.00015640258789,
"learning_rate": 2.193211488250653e-05,
"loss": 1.9546,
"step": 1290
},
{
"epoch": 1.6971279373368147,
"grad_norm": 20.0263729095459,
"learning_rate": 2.171453437771976e-05,
"loss": 2.2491,
"step": 1300
},
{
"epoch": 1.7101827676240209,
"grad_norm": 23.335580825805664,
"learning_rate": 2.1496953872932988e-05,
"loss": 2.5543,
"step": 1310
},
{
"epoch": 1.723237597911227,
"grad_norm": 25.97711181640625,
"learning_rate": 2.1279373368146216e-05,
"loss": 2.2486,
"step": 1320
},
{
"epoch": 1.7362924281984333,
"grad_norm": 24.391855239868164,
"learning_rate": 2.1061792863359443e-05,
"loss": 2.3483,
"step": 1330
},
{
"epoch": 1.7493472584856398,
"grad_norm": 29.249792098999023,
"learning_rate": 2.0844212358572674e-05,
"loss": 2.3703,
"step": 1340
},
{
"epoch": 1.762402088772846,
"grad_norm": 20.213987350463867,
"learning_rate": 2.0626631853785902e-05,
"loss": 2.3178,
"step": 1350
},
{
"epoch": 1.7754569190600522,
"grad_norm": 33.050018310546875,
"learning_rate": 2.040905134899913e-05,
"loss": 2.2622,
"step": 1360
},
{
"epoch": 1.7885117493472587,
"grad_norm": 25.058115005493164,
"learning_rate": 2.019147084421236e-05,
"loss": 2.2577,
"step": 1370
},
{
"epoch": 1.8015665796344646,
"grad_norm": 34.79226303100586,
"learning_rate": 1.997389033942559e-05,
"loss": 2.1586,
"step": 1380
},
{
"epoch": 1.814621409921671,
"grad_norm": 38.56571578979492,
"learning_rate": 1.9756309834638816e-05,
"loss": 2.0911,
"step": 1390
},
{
"epoch": 1.8276762402088773,
"grad_norm": 27.778825759887695,
"learning_rate": 1.9538729329852047e-05,
"loss": 2.2379,
"step": 1400
},
{
"epoch": 1.8407310704960835,
"grad_norm": 27.160274505615234,
"learning_rate": 1.9321148825065275e-05,
"loss": 2.0847,
"step": 1410
},
{
"epoch": 1.85378590078329,
"grad_norm": 26.11197853088379,
"learning_rate": 1.9103568320278503e-05,
"loss": 2.7021,
"step": 1420
},
{
"epoch": 1.866840731070496,
"grad_norm": 28.448244094848633,
"learning_rate": 1.8885987815491734e-05,
"loss": 2.1596,
"step": 1430
},
{
"epoch": 1.8798955613577024,
"grad_norm": 49.074729919433594,
"learning_rate": 1.866840731070496e-05,
"loss": 2.5753,
"step": 1440
},
{
"epoch": 1.8929503916449086,
"grad_norm": 21.96980094909668,
"learning_rate": 1.845082680591819e-05,
"loss": 2.5118,
"step": 1450
},
{
"epoch": 1.9060052219321149,
"grad_norm": 20.993181228637695,
"learning_rate": 1.823324630113142e-05,
"loss": 2.2332,
"step": 1460
},
{
"epoch": 1.9190600522193213,
"grad_norm": 20.049209594726562,
"learning_rate": 1.801566579634465e-05,
"loss": 2.1748,
"step": 1470
},
{
"epoch": 1.9321148825065273,
"grad_norm": 35.51521682739258,
"learning_rate": 1.779808529155788e-05,
"loss": 2.781,
"step": 1480
},
{
"epoch": 1.9451697127937337,
"grad_norm": 25.36643409729004,
"learning_rate": 1.7580504786771106e-05,
"loss": 2.7489,
"step": 1490
},
{
"epoch": 1.95822454308094,
"grad_norm": 38.09309387207031,
"learning_rate": 1.7362924281984334e-05,
"loss": 2.1326,
"step": 1500
},
{
"epoch": 1.9712793733681462,
"grad_norm": 30.636632919311523,
"learning_rate": 1.7145343777197565e-05,
"loss": 2.2151,
"step": 1510
},
{
"epoch": 1.9843342036553526,
"grad_norm": 27.038352966308594,
"learning_rate": 1.6927763272410793e-05,
"loss": 2.4523,
"step": 1520
},
{
"epoch": 1.9973890339425586,
"grad_norm": 19.101573944091797,
"learning_rate": 1.671018276762402e-05,
"loss": 2.518,
"step": 1530
},
{
"epoch": 2.0,
"eval_loss": NaN,
"eval_runtime": 11.9494,
"eval_samples_per_second": 113.897,
"eval_steps_per_second": 14.31,
"step": 1532
},
{
"epoch": 2.010443864229765,
"grad_norm": 28.463035583496094,
"learning_rate": 1.649260226283725e-05,
"loss": 1.8945,
"step": 1540
},
{
"epoch": 2.023498694516971,
"grad_norm": 30.520097732543945,
"learning_rate": 1.627502175805048e-05,
"loss": 2.3685,
"step": 1550
},
{
"epoch": 2.0365535248041775,
"grad_norm": 19.876482009887695,
"learning_rate": 1.6057441253263707e-05,
"loss": 2.1494,
"step": 1560
},
{
"epoch": 2.049608355091384,
"grad_norm": 23.423219680786133,
"learning_rate": 1.5839860748476938e-05,
"loss": 1.9791,
"step": 1570
},
{
"epoch": 2.06266318537859,
"grad_norm": 20.257450103759766,
"learning_rate": 1.5622280243690166e-05,
"loss": 2.2526,
"step": 1580
},
{
"epoch": 2.0757180156657964,
"grad_norm": 47.68708038330078,
"learning_rate": 1.5404699738903393e-05,
"loss": 1.9962,
"step": 1590
},
{
"epoch": 2.0887728459530024,
"grad_norm": 27.561660766601562,
"learning_rate": 1.5187119234116623e-05,
"loss": 1.6106,
"step": 1600
},
{
"epoch": 2.101827676240209,
"grad_norm": 26.832944869995117,
"learning_rate": 1.4969538729329852e-05,
"loss": 1.5015,
"step": 1610
},
{
"epoch": 2.1148825065274153,
"grad_norm": 45.05983352661133,
"learning_rate": 1.475195822454308e-05,
"loss": 1.8897,
"step": 1620
},
{
"epoch": 2.1279373368146213,
"grad_norm": 19.24533462524414,
"learning_rate": 1.4534377719756313e-05,
"loss": 2.4187,
"step": 1630
},
{
"epoch": 2.1409921671018277,
"grad_norm": 29.476770401000977,
"learning_rate": 1.431679721496954e-05,
"loss": 2.1733,
"step": 1640
},
{
"epoch": 2.1540469973890337,
"grad_norm": 26.505355834960938,
"learning_rate": 1.409921671018277e-05,
"loss": 1.8432,
"step": 1650
},
{
"epoch": 2.16710182767624,
"grad_norm": 28.1693058013916,
"learning_rate": 1.3881636205395997e-05,
"loss": 2.1419,
"step": 1660
},
{
"epoch": 2.1801566579634466,
"grad_norm": 37.704498291015625,
"learning_rate": 1.3664055700609227e-05,
"loss": 2.7488,
"step": 1670
},
{
"epoch": 2.1932114882506526,
"grad_norm": 22.38772964477539,
"learning_rate": 1.3446475195822456e-05,
"loss": 2.3531,
"step": 1680
},
{
"epoch": 2.206266318537859,
"grad_norm": 22.58838653564453,
"learning_rate": 1.3228894691035684e-05,
"loss": 1.8415,
"step": 1690
},
{
"epoch": 2.2193211488250655,
"grad_norm": 30.01149559020996,
"learning_rate": 1.3011314186248913e-05,
"loss": 2.0757,
"step": 1700
},
{
"epoch": 2.2323759791122715,
"grad_norm": 23.964759826660156,
"learning_rate": 1.2793733681462141e-05,
"loss": 2.4956,
"step": 1710
},
{
"epoch": 2.245430809399478,
"grad_norm": 33.133541107177734,
"learning_rate": 1.257615317667537e-05,
"loss": 1.971,
"step": 1720
},
{
"epoch": 2.258485639686684,
"grad_norm": 27.34188461303711,
"learning_rate": 1.23585726718886e-05,
"loss": 2.2084,
"step": 1730
},
{
"epoch": 2.2715404699738904,
"grad_norm": 25.62513542175293,
"learning_rate": 1.2140992167101827e-05,
"loss": 1.9445,
"step": 1740
},
{
"epoch": 2.2845953002610964,
"grad_norm": 33.618385314941406,
"learning_rate": 1.1923411662315057e-05,
"loss": 2.4505,
"step": 1750
},
{
"epoch": 2.297650130548303,
"grad_norm": 25.787757873535156,
"learning_rate": 1.1705831157528286e-05,
"loss": 2.748,
"step": 1760
},
{
"epoch": 2.3107049608355092,
"grad_norm": 24.533018112182617,
"learning_rate": 1.1488250652741515e-05,
"loss": 1.496,
"step": 1770
},
{
"epoch": 2.3237597911227152,
"grad_norm": 41.41205596923828,
"learning_rate": 1.1270670147954745e-05,
"loss": 2.5213,
"step": 1780
},
{
"epoch": 2.3368146214099217,
"grad_norm": 20.163238525390625,
"learning_rate": 1.1053089643167972e-05,
"loss": 1.8429,
"step": 1790
},
{
"epoch": 2.349869451697128,
"grad_norm": 36.58127975463867,
"learning_rate": 1.0835509138381202e-05,
"loss": 2.617,
"step": 1800
},
{
"epoch": 2.362924281984334,
"grad_norm": 29.932636260986328,
"learning_rate": 1.061792863359443e-05,
"loss": 2.2462,
"step": 1810
},
{
"epoch": 2.3759791122715406,
"grad_norm": 20.780025482177734,
"learning_rate": 1.0400348128807659e-05,
"loss": 1.7878,
"step": 1820
},
{
"epoch": 2.3890339425587466,
"grad_norm": 26.663557052612305,
"learning_rate": 1.0182767624020888e-05,
"loss": 2.2757,
"step": 1830
},
{
"epoch": 2.402088772845953,
"grad_norm": 34.194435119628906,
"learning_rate": 9.965187119234116e-06,
"loss": 1.8959,
"step": 1840
},
{
"epoch": 2.4151436031331595,
"grad_norm": 13.468405723571777,
"learning_rate": 9.747606614447347e-06,
"loss": 2.2176,
"step": 1850
},
{
"epoch": 2.4281984334203655,
"grad_norm": 36.56803512573242,
"learning_rate": 9.530026109660575e-06,
"loss": 2.2373,
"step": 1860
},
{
"epoch": 2.441253263707572,
"grad_norm": 25.580917358398438,
"learning_rate": 9.312445604873804e-06,
"loss": 2.004,
"step": 1870
},
{
"epoch": 2.454308093994778,
"grad_norm": 47.98051071166992,
"learning_rate": 9.094865100087033e-06,
"loss": 2.09,
"step": 1880
},
{
"epoch": 2.4673629242819843,
"grad_norm": 44.3414421081543,
"learning_rate": 8.877284595300261e-06,
"loss": 2.4412,
"step": 1890
},
{
"epoch": 2.480417754569191,
"grad_norm": 22.451644897460938,
"learning_rate": 8.65970409051349e-06,
"loss": 1.5646,
"step": 1900
},
{
"epoch": 2.493472584856397,
"grad_norm": 21.15165138244629,
"learning_rate": 8.442123585726718e-06,
"loss": 1.5899,
"step": 1910
},
{
"epoch": 2.506527415143603,
"grad_norm": 49.44068145751953,
"learning_rate": 8.224543080939948e-06,
"loss": 2.0553,
"step": 1920
},
{
"epoch": 2.5195822454308097,
"grad_norm": 30.540063858032227,
"learning_rate": 8.006962576153177e-06,
"loss": 1.7215,
"step": 1930
},
{
"epoch": 2.5326370757180157,
"grad_norm": 29.058853149414062,
"learning_rate": 7.789382071366406e-06,
"loss": 1.7025,
"step": 1940
},
{
"epoch": 2.5456919060052217,
"grad_norm": 36.38139343261719,
"learning_rate": 7.571801566579635e-06,
"loss": 1.9976,
"step": 1950
},
{
"epoch": 2.558746736292428,
"grad_norm": 21.742773056030273,
"learning_rate": 7.354221061792864e-06,
"loss": 1.8528,
"step": 1960
},
{
"epoch": 2.5718015665796345,
"grad_norm": 22.30496597290039,
"learning_rate": 7.136640557006093e-06,
"loss": 1.726,
"step": 1970
},
{
"epoch": 2.584856396866841,
"grad_norm": 28.57794761657715,
"learning_rate": 6.919060052219321e-06,
"loss": 1.9179,
"step": 1980
},
{
"epoch": 2.597911227154047,
"grad_norm": 35.680999755859375,
"learning_rate": 6.70147954743255e-06,
"loss": 1.9789,
"step": 1990
},
{
"epoch": 2.6109660574412534,
"grad_norm": 41.53910827636719,
"learning_rate": 6.483899042645779e-06,
"loss": 2.3311,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2298,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2103832360255488.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}