{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996222709073053,
"eval_steps": 500,
"global_step": 3308,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030218327415577548,
"grad_norm": 166.1145477294922,
"learning_rate": 1.0000000000000002e-06,
"loss": 13.8334,
"step": 10
},
{
"epoch": 0.0060436654831155096,
"grad_norm": 81.33020782470703,
"learning_rate": 2.0000000000000003e-06,
"loss": 13.1022,
"step": 20
},
{
"epoch": 0.009065498224673264,
"grad_norm": 51.09122085571289,
"learning_rate": 3e-06,
"loss": 12.0899,
"step": 30
},
{
"epoch": 0.012087330966231019,
"grad_norm": 49.01457214355469,
"learning_rate": 4.000000000000001e-06,
"loss": 10.2942,
"step": 40
},
{
"epoch": 0.015109163707788774,
"grad_norm": 46.00205612182617,
"learning_rate": 5e-06,
"loss": 9.538,
"step": 50
},
{
"epoch": 0.01813099644934653,
"grad_norm": 41.61635971069336,
"learning_rate": 6e-06,
"loss": 8.3357,
"step": 60
},
{
"epoch": 0.021152829190904283,
"grad_norm": 31.584325790405273,
"learning_rate": 7e-06,
"loss": 6.9115,
"step": 70
},
{
"epoch": 0.024174661932462038,
"grad_norm": 51.664695739746094,
"learning_rate": 8.000000000000001e-06,
"loss": 6.7596,
"step": 80
},
{
"epoch": 0.027196494674019793,
"grad_norm": 31.014076232910156,
"learning_rate": 9e-06,
"loss": 7.0298,
"step": 90
},
{
"epoch": 0.030218327415577548,
"grad_norm": 37.792179107666016,
"learning_rate": 1e-05,
"loss": 6.7021,
"step": 100
},
{
"epoch": 0.0332401601571353,
"grad_norm": 37.68498611450195,
"learning_rate": 9.999760394462267e-06,
"loss": 7.3545,
"step": 110
},
{
"epoch": 0.03626199289869306,
"grad_norm": 38.4805793762207,
"learning_rate": 9.999041600813393e-06,
"loss": 7.0073,
"step": 120
},
{
"epoch": 0.03928382564025081,
"grad_norm": 32.300174713134766,
"learning_rate": 9.997843687944153e-06,
"loss": 6.2416,
"step": 130
},
{
"epoch": 0.04230565838180857,
"grad_norm": 29.263317108154297,
"learning_rate": 9.996166770665168e-06,
"loss": 5.5583,
"step": 140
},
{
"epoch": 0.04532749112336632,
"grad_norm": 33.3656005859375,
"learning_rate": 9.994011009695908e-06,
"loss": 5.6737,
"step": 150
},
{
"epoch": 0.048349323864924076,
"grad_norm": 32.699825286865234,
"learning_rate": 9.991376611649278e-06,
"loss": 6.0879,
"step": 160
},
{
"epoch": 0.05137115660648183,
"grad_norm": 27.45968246459961,
"learning_rate": 9.988263829011821e-06,
"loss": 5.4056,
"step": 170
},
{
"epoch": 0.054392989348039586,
"grad_norm": 25.30878448486328,
"learning_rate": 9.984672960119523e-06,
"loss": 5.3618,
"step": 180
},
{
"epoch": 0.05741482208959734,
"grad_norm": 40.055721282958984,
"learning_rate": 9.980604349129212e-06,
"loss": 5.7602,
"step": 190
},
{
"epoch": 0.060436654831155096,
"grad_norm": 26.245195388793945,
"learning_rate": 9.976058385985575e-06,
"loss": 5.186,
"step": 200
},
{
"epoch": 0.06345848757271286,
"grad_norm": 34.81965637207031,
"learning_rate": 9.971035506383791e-06,
"loss": 5.5341,
"step": 210
},
{
"epoch": 0.0664803203142706,
"grad_norm": 29.513893127441406,
"learning_rate": 9.96553619172777e-06,
"loss": 5.0542,
"step": 220
},
{
"epoch": 0.06950215305582837,
"grad_norm": 32.30284118652344,
"learning_rate": 9.959560969084004e-06,
"loss": 5.3365,
"step": 230
},
{
"epoch": 0.07252398579738611,
"grad_norm": 27.652576446533203,
"learning_rate": 9.953110411131073e-06,
"loss": 4.7513,
"step": 240
},
{
"epoch": 0.07554581853894388,
"grad_norm": 28.387413024902344,
"learning_rate": 9.946185136104736e-06,
"loss": 5.4127,
"step": 250
},
{
"epoch": 0.07856765128050162,
"grad_norm": 29.694316864013672,
"learning_rate": 9.938785807738692e-06,
"loss": 4.813,
"step": 260
},
{
"epoch": 0.08158948402205939,
"grad_norm": 31.964120864868164,
"learning_rate": 9.930913135200964e-06,
"loss": 5.4212,
"step": 270
},
{
"epoch": 0.08461131676361713,
"grad_norm": 23.594715118408203,
"learning_rate": 9.922567873025924e-06,
"loss": 5.2445,
"step": 280
},
{
"epoch": 0.0876331495051749,
"grad_norm": 23.0896053314209,
"learning_rate": 9.913750821041988e-06,
"loss": 4.5194,
"step": 290
},
{
"epoch": 0.09065498224673264,
"grad_norm": 25.44329833984375,
"learning_rate": 9.904462824294945e-06,
"loss": 4.6093,
"step": 300
},
{
"epoch": 0.0936768149882904,
"grad_norm": 27.408288955688477,
"learning_rate": 9.894704772966978e-06,
"loss": 4.512,
"step": 310
},
{
"epoch": 0.09669864772984815,
"grad_norm": 24.26542091369629,
"learning_rate": 9.884477602291343e-06,
"loss": 4.5071,
"step": 320
},
{
"epoch": 0.09972048047140591,
"grad_norm": 35.8819694519043,
"learning_rate": 9.873782292462727e-06,
"loss": 4.3557,
"step": 330
},
{
"epoch": 0.10274231321296366,
"grad_norm": 29.487594604492188,
"learning_rate": 9.862619868543323e-06,
"loss": 8.2236,
"step": 340
},
{
"epoch": 0.10576414595452142,
"grad_norm": 38.13749694824219,
"learning_rate": 9.850991400364557e-06,
"loss": 5.1538,
"step": 350
},
{
"epoch": 0.10878597869607917,
"grad_norm": 25.492799758911133,
"learning_rate": 9.838898002424586e-06,
"loss": 6.0666,
"step": 360
},
{
"epoch": 0.11180781143763693,
"grad_norm": 31.119089126586914,
"learning_rate": 9.826340833781448e-06,
"loss": 5.8633,
"step": 370
},
{
"epoch": 0.11482964417919468,
"grad_norm": 21.065149307250977,
"learning_rate": 9.813321097942005e-06,
"loss": 5.1017,
"step": 380
},
{
"epoch": 0.11785147692075244,
"grad_norm": 29.40814971923828,
"learning_rate": 9.79984004274658e-06,
"loss": 4.9132,
"step": 390
},
{
"epoch": 0.12087330966231019,
"grad_norm": 22.45477294921875,
"learning_rate": 9.785898960249365e-06,
"loss": 4.2496,
"step": 400
},
{
"epoch": 0.12389514240386795,
"grad_norm": 19.05487060546875,
"learning_rate": 9.771499186594586e-06,
"loss": 5.0767,
"step": 410
},
{
"epoch": 0.12691697514542571,
"grad_norm": 31.310686111450195,
"learning_rate": 9.756642101888449e-06,
"loss": 5.192,
"step": 420
},
{
"epoch": 0.12993880788698345,
"grad_norm": 25.689640045166016,
"learning_rate": 9.74132913006686e-06,
"loss": 3.445,
"step": 430
},
{
"epoch": 0.1329606406285412,
"grad_norm": 21.052574157714844,
"learning_rate": 9.725561738758956e-06,
"loss": 3.3354,
"step": 440
},
{
"epoch": 0.13598247337009897,
"grad_norm": 24.987884521484375,
"learning_rate": 9.709341439146452e-06,
"loss": 5.0777,
"step": 450
},
{
"epoch": 0.13900430611165673,
"grad_norm": 26.842397689819336,
"learning_rate": 9.692669785818787e-06,
"loss": 6.4292,
"step": 460
},
{
"epoch": 0.14202613885321447,
"grad_norm": 35.66836166381836,
"learning_rate": 9.675548376624149e-06,
"loss": 5.7348,
"step": 470
},
{
"epoch": 0.14504797159477223,
"grad_norm": 29.318471908569336,
"learning_rate": 9.657978852516318e-06,
"loss": 5.6924,
"step": 480
},
{
"epoch": 0.14806980433633,
"grad_norm": 23.544092178344727,
"learning_rate": 9.639962897397405e-06,
"loss": 4.183,
"step": 490
},
{
"epoch": 0.15109163707788775,
"grad_norm": 22.90180206298828,
"learning_rate": 9.621502237956452e-06,
"loss": 5.085,
"step": 500
},
{
"epoch": 0.1541134698194455,
"grad_norm": 23.748275756835938,
"learning_rate": 9.602598643503957e-06,
"loss": 3.2694,
"step": 510
},
{
"epoch": 0.15713530256100325,
"grad_norm": 29.096708297729492,
"learning_rate": 9.583253925802283e-06,
"loss": 4.2373,
"step": 520
},
{
"epoch": 0.160157135302561,
"grad_norm": 24.87314796447754,
"learning_rate": 9.563469938892023e-06,
"loss": 4.8482,
"step": 530
},
{
"epoch": 0.16317896804411877,
"grad_norm": 24.310091018676758,
"learning_rate": 9.543248578914309e-06,
"loss": 3.2299,
"step": 540
},
{
"epoch": 0.1662008007856765,
"grad_norm": 24.80878257751465,
"learning_rate": 9.522591783929069e-06,
"loss": 4.8424,
"step": 550
},
{
"epoch": 0.16922263352723427,
"grad_norm": 22.142215728759766,
"learning_rate": 9.501501533729297e-06,
"loss": 4.1786,
"step": 560
},
{
"epoch": 0.17224446626879203,
"grad_norm": 33.77587890625,
"learning_rate": 9.479979849651287e-06,
"loss": 5.7505,
"step": 570
},
{
"epoch": 0.1752662990103498,
"grad_norm": 25.414831161499023,
"learning_rate": 9.45802879438091e-06,
"loss": 6.3392,
"step": 580
},
{
"epoch": 0.17828813175190752,
"grad_norm": 25.716073989868164,
"learning_rate": 9.43565047175593e-06,
"loss": 4.1603,
"step": 590
},
{
"epoch": 0.1813099644934653,
"grad_norm": 25.389522552490234,
"learning_rate": 9.412847026564359e-06,
"loss": 3.9676,
"step": 600
},
{
"epoch": 0.18433179723502305,
"grad_norm": 22.911640167236328,
"learning_rate": 9.389620644338893e-06,
"loss": 4.1508,
"step": 610
},
{
"epoch": 0.1873536299765808,
"grad_norm": 36.27210998535156,
"learning_rate": 9.365973551147453e-06,
"loss": 4.691,
"step": 620
},
{
"epoch": 0.19037546271813854,
"grad_norm": 23.555246353149414,
"learning_rate": 9.341908013379832e-06,
"loss": 4.7148,
"step": 630
},
{
"epoch": 0.1933972954596963,
"grad_norm": 25.42097282409668,
"learning_rate": 9.317426337530477e-06,
"loss": 4.0105,
"step": 640
},
{
"epoch": 0.19641912820125407,
"grad_norm": 24.92901611328125,
"learning_rate": 9.292530869977432e-06,
"loss": 5.5589,
"step": 650
},
{
"epoch": 0.19944096094281183,
"grad_norm": 26.411352157592773,
"learning_rate": 9.26722399675745e-06,
"loss": 3.1881,
"step": 660
},
{
"epoch": 0.20246279368436956,
"grad_norm": 22.39121437072754,
"learning_rate": 9.24150814333732e-06,
"loss": 3.9177,
"step": 670
},
{
"epoch": 0.20548462642592732,
"grad_norm": 21.436046600341797,
"learning_rate": 9.215385774381395e-06,
"loss": 6.2124,
"step": 680
},
{
"epoch": 0.2085064591674851,
"grad_norm": 42.19996643066406,
"learning_rate": 9.188859393515382e-06,
"loss": 4.863,
"step": 690
},
{
"epoch": 0.21152829190904285,
"grad_norm": 24.43948745727539,
"learning_rate": 9.16193154308638e-06,
"loss": 6.0562,
"step": 700
},
{
"epoch": 0.21455012465060058,
"grad_norm": 36.5896110534668,
"learning_rate": 9.13460480391922e-06,
"loss": 6.1878,
"step": 710
},
{
"epoch": 0.21757195739215834,
"grad_norm": 39.19657897949219,
"learning_rate": 9.106881795069116e-06,
"loss": 6.4964,
"step": 720
},
{
"epoch": 0.2205937901337161,
"grad_norm": 19.438859939575195,
"learning_rate": 9.078765173570649e-06,
"loss": 3.1914,
"step": 730
},
{
"epoch": 0.22361562287527387,
"grad_norm": 26.316898345947266,
"learning_rate": 9.0502576341831e-06,
"loss": 4.0543,
"step": 740
},
{
"epoch": 0.2266374556168316,
"grad_norm": 21.5406436920166,
"learning_rate": 9.02136190913219e-06,
"loss": 5.4649,
"step": 750
},
{
"epoch": 0.22965928835838936,
"grad_norm": 38.014617919921875,
"learning_rate": 8.99208076784822e-06,
"loss": 4.6499,
"step": 760
},
{
"epoch": 0.23268112109994712,
"grad_norm": 16.046876907348633,
"learning_rate": 8.962417016700624e-06,
"loss": 3.0368,
"step": 770
},
{
"epoch": 0.2357029538415049,
"grad_norm": 25.170169830322266,
"learning_rate": 8.932373498729026e-06,
"loss": 4.6374,
"step": 780
},
{
"epoch": 0.23872478658306262,
"grad_norm": 28.294591903686523,
"learning_rate": 8.901953093370734e-06,
"loss": 4.0344,
"step": 790
},
{
"epoch": 0.24174661932462038,
"grad_norm": 25.618423461914062,
"learning_rate": 8.871158716184784e-06,
"loss": 3.9153,
"step": 800
},
{
"epoch": 0.24476845206617814,
"grad_norm": 33.044132232666016,
"learning_rate": 8.839993318572497e-06,
"loss": 4.852,
"step": 810
},
{
"epoch": 0.2477902848077359,
"grad_norm": 19.522127151489258,
"learning_rate": 8.808459887494617e-06,
"loss": 3.0679,
"step": 820
},
{
"epoch": 0.25081211754929367,
"grad_norm": 17.915157318115234,
"learning_rate": 8.77656144518502e-06,
"loss": 3.832,
"step": 830
},
{
"epoch": 0.25383395029085143,
"grad_norm": 18.468053817749023,
"learning_rate": 8.744301048861083e-06,
"loss": 2.9134,
"step": 840
},
{
"epoch": 0.25685578303240914,
"grad_norm": 25.19109535217285,
"learning_rate": 8.711681790430646e-06,
"loss": 2.9987,
"step": 850
},
{
"epoch": 0.2598776157739669,
"grad_norm": 27.227184295654297,
"learning_rate": 8.678706796195694e-06,
"loss": 4.7592,
"step": 860
},
{
"epoch": 0.26289944851552466,
"grad_norm": 28.04375457763672,
"learning_rate": 8.645379226552712e-06,
"loss": 3.7402,
"step": 870
},
{
"epoch": 0.2659212812570824,
"grad_norm": 21.457616806030273,
"learning_rate": 8.611702275689805e-06,
"loss": 4.6756,
"step": 880
},
{
"epoch": 0.2689431139986402,
"grad_norm": 35.01508331298828,
"learning_rate": 8.577679171280538e-06,
"loss": 4.5315,
"step": 890
},
{
"epoch": 0.27196494674019794,
"grad_norm": 20.160045623779297,
"learning_rate": 8.543313174174601e-06,
"loss": 5.2698,
"step": 900
},
{
"epoch": 0.2749867794817557,
"grad_norm": 22.52850341796875,
"learning_rate": 8.508607578085281e-06,
"loss": 3.849,
"step": 910
},
{
"epoch": 0.27800861222331347,
"grad_norm": 21.895462036132812,
"learning_rate": 8.473565709273786e-06,
"loss": 3.8616,
"step": 920
},
{
"epoch": 0.2810304449648712,
"grad_norm": 16.077316284179688,
"learning_rate": 8.438190926230439e-06,
"loss": 3.8386,
"step": 930
},
{
"epoch": 0.28405227770642894,
"grad_norm": 33.1984977722168,
"learning_rate": 8.40248661935281e-06,
"loss": 4.3994,
"step": 940
},
{
"epoch": 0.2870741104479867,
"grad_norm": 27.1571102142334,
"learning_rate": 8.366456210620756e-06,
"loss": 3.1029,
"step": 950
},
{
"epoch": 0.29009594318954446,
"grad_norm": 31.706750869750977,
"learning_rate": 8.330103153268464e-06,
"loss": 3.7567,
"step": 960
},
{
"epoch": 0.2931177759311022,
"grad_norm": 24.30504608154297,
"learning_rate": 8.29343093145347e-06,
"loss": 3.6988,
"step": 970
},
{
"epoch": 0.29613960867266,
"grad_norm": 24.231523513793945,
"learning_rate": 8.25644305992275e-06,
"loss": 3.6097,
"step": 980
},
{
"epoch": 0.29916144141421774,
"grad_norm": 19.621383666992188,
"learning_rate": 8.21914308367584e-06,
"loss": 4.5566,
"step": 990
},
{
"epoch": 0.3021832741557755,
"grad_norm": 21.627859115600586,
"learning_rate": 8.181534577625088e-06,
"loss": 3.7714,
"step": 1000
},
{
"epoch": 0.3052051068973332,
"grad_norm": 14.206421852111816,
"learning_rate": 8.143621146253022e-06,
"loss": 4.6373,
"step": 1010
},
{
"epoch": 0.308226939638891,
"grad_norm": 27.084983825683594,
"learning_rate": 8.105406423266884e-06,
"loss": 4.6538,
"step": 1020
},
{
"epoch": 0.31124877238044873,
"grad_norm": 20.950910568237305,
"learning_rate": 8.066894071250374e-06,
"loss": 4.4614,
"step": 1030
},
{
"epoch": 0.3142706051220065,
"grad_norm": 20.357742309570312,
"learning_rate": 8.02808778131262e-06,
"loss": 3.7694,
"step": 1040
},
{
"epoch": 0.31729243786356426,
"grad_norm": 18.685476303100586,
"learning_rate": 7.988991272734407e-06,
"loss": 4.4575,
"step": 1050
},
{
"epoch": 0.320314270605122,
"grad_norm": 24.249338150024414,
"learning_rate": 7.94960829261172e-06,
"loss": 4.4394,
"step": 1060
},
{
"epoch": 0.3233361033466798,
"grad_norm": 22.846027374267578,
"learning_rate": 7.909942615496613e-06,
"loss": 4.7241,
"step": 1070
},
{
"epoch": 0.32635793608823754,
"grad_norm": 30.40308952331543,
"learning_rate": 7.869998043035442e-06,
"loss": 5.3999,
"step": 1080
},
{
"epoch": 0.32937976882979525,
"grad_norm": 17.647789001464844,
"learning_rate": 7.829778403604512e-06,
"loss": 5.0469,
"step": 1090
},
{
"epoch": 0.332401601571353,
"grad_norm": 33.98617935180664,
"learning_rate": 7.789287551943158e-06,
"loss": 6.0896,
"step": 1100
},
{
"epoch": 0.3354234343129108,
"grad_norm": 21.646024703979492,
"learning_rate": 7.748529368784293e-06,
"loss": 4.5196,
"step": 1110
},
{
"epoch": 0.33844526705446853,
"grad_norm": 18.94881820678711,
"learning_rate": 7.707507760482473e-06,
"loss": 6.1607,
"step": 1120
},
{
"epoch": 0.3414670997960263,
"grad_norm": 18.058412551879883,
"learning_rate": 7.666226658639507e-06,
"loss": 3.7909,
"step": 1130
},
{
"epoch": 0.34448893253758406,
"grad_norm": 22.541349411010742,
"learning_rate": 7.624690019727636e-06,
"loss": 3.638,
"step": 1140
},
{
"epoch": 0.3475107652791418,
"grad_norm": 23.882991790771484,
"learning_rate": 7.58290182471034e-06,
"loss": 4.53,
"step": 1150
},
{
"epoch": 0.3505325980206996,
"grad_norm": 19.6879940032959,
"learning_rate": 7.5408660786607976e-06,
"loss": 3.6987,
"step": 1160
},
{
"epoch": 0.3535544307622573,
"grad_norm": 20.6401309967041,
"learning_rate": 7.498586810378019e-06,
"loss": 2.9513,
"step": 1170
},
{
"epoch": 0.35657626350381505,
"grad_norm": 22.658132553100586,
"learning_rate": 7.456068072000731e-06,
"loss": 2.8103,
"step": 1180
},
{
"epoch": 0.3595980962453728,
"grad_norm": 23.935726165771484,
"learning_rate": 7.4133139386190026e-06,
"loss": 4.5498,
"step": 1190
},
{
"epoch": 0.3626199289869306,
"grad_norm": 18.697385787963867,
"learning_rate": 7.3703285078836796e-06,
"loss": 5.2042,
"step": 1200
},
{
"epoch": 0.36564176172848833,
"grad_norm": 17.5216064453125,
"learning_rate": 7.3271158996136625e-06,
"loss": 3.7229,
"step": 1210
},
{
"epoch": 0.3686635944700461,
"grad_norm": 18.313034057617188,
"learning_rate": 7.283680255401049e-06,
"loss": 4.403,
"step": 1220
},
{
"epoch": 0.37168542721160386,
"grad_norm": 19.784748077392578,
"learning_rate": 7.240025738214193e-06,
"loss": 6.1978,
"step": 1230
},
{
"epoch": 0.3747072599531616,
"grad_norm": 33.28024673461914,
"learning_rate": 7.196156531998718e-06,
"loss": 4.4892,
"step": 1240
},
{
"epoch": 0.3777290926947193,
"grad_norm": 20.449913024902344,
"learning_rate": 7.152076841276527e-06,
"loss": 3.6566,
"step": 1250
},
{
"epoch": 0.3807509254362771,
"grad_norm": 19.441957473754883,
"learning_rate": 7.1077908907428154e-06,
"loss": 3.7812,
"step": 1260
},
{
"epoch": 0.38377275817783485,
"grad_norm": 32.515724182128906,
"learning_rate": 7.063302924861182e-06,
"loss": 3.8969,
"step": 1270
},
{
"epoch": 0.3867945909193926,
"grad_norm": 22.129140853881836,
"learning_rate": 7.018617207456821e-06,
"loss": 3.5997,
"step": 1280
},
{
"epoch": 0.3898164236609504,
"grad_norm": 19.576011657714844,
"learning_rate": 6.973738021307872e-06,
"loss": 3.6646,
"step": 1290
},
{
"epoch": 0.39283825640250813,
"grad_norm": 17.848796844482422,
"learning_rate": 6.9286696677349455e-06,
"loss": 5.9623,
"step": 1300
},
{
"epoch": 0.3958600891440659,
"grad_norm": 15.815289497375488,
"learning_rate": 6.883416466188881e-06,
"loss": 3.6821,
"step": 1310
},
{
"epoch": 0.39888192188562366,
"grad_norm": 17.62392807006836,
"learning_rate": 6.837982753836755e-06,
"loss": 2.8778,
"step": 1320
},
{
"epoch": 0.40190375462718136,
"grad_norm": 34.39213180541992,
"learning_rate": 6.7923728851461955e-06,
"loss": 6.0046,
"step": 1330
},
{
"epoch": 0.4049255873687391,
"grad_norm": 22.834793090820312,
"learning_rate": 6.74659123146805e-06,
"loss": 3.6498,
"step": 1340
},
{
"epoch": 0.4079474201102969,
"grad_norm": 18.146869659423828,
"learning_rate": 6.70064218061742e-06,
"loss": 2.8181,
"step": 1350
},
{
"epoch": 0.41096925285185465,
"grad_norm": 18.262357711791992,
"learning_rate": 6.654530136453119e-06,
"loss": 4.3635,
"step": 1360
},
{
"epoch": 0.4139910855934124,
"grad_norm": 18.1636905670166,
"learning_rate": 6.608259518455599e-06,
"loss": 5.2127,
"step": 1370
},
{
"epoch": 0.4170129183349702,
"grad_norm": 17.246234893798828,
"learning_rate": 6.5618347613033875e-06,
"loss": 5.1173,
"step": 1380
},
{
"epoch": 0.42003475107652793,
"grad_norm": 19.54306983947754,
"learning_rate": 6.5152603144480406e-06,
"loss": 5.9817,
"step": 1390
},
{
"epoch": 0.4230565838180857,
"grad_norm": 31.445457458496094,
"learning_rate": 6.468540641687716e-06,
"loss": 4.5568,
"step": 1400
},
{
"epoch": 0.4260784165596434,
"grad_norm": 19.258493423461914,
"learning_rate": 6.421680220739337e-06,
"loss": 3.9311,
"step": 1410
},
{
"epoch": 0.42910024930120116,
"grad_norm": 33.21185302734375,
"learning_rate": 6.374683542809447e-06,
"loss": 7.8417,
"step": 1420
},
{
"epoch": 0.4321220820427589,
"grad_norm": 19.956239700317383,
"learning_rate": 6.327555112163761e-06,
"loss": 4.3582,
"step": 1430
},
{
"epoch": 0.4351439147843167,
"grad_norm": 19.256486892700195,
"learning_rate": 6.280299445695469e-06,
"loss": 5.2,
"step": 1440
},
{
"epoch": 0.43816574752587445,
"grad_norm": 20.045286178588867,
"learning_rate": 6.232921072492319e-06,
"loss": 4.3409,
"step": 1450
},
{
"epoch": 0.4411875802674322,
"grad_norm": 24.16641616821289,
"learning_rate": 6.185424533402543e-06,
"loss": 4.3162,
"step": 1460
},
{
"epoch": 0.44420941300899,
"grad_norm": 23.316164016723633,
"learning_rate": 6.13781438059966e-06,
"loss": 3.5112,
"step": 1470
},
{
"epoch": 0.44723124575054773,
"grad_norm": 34.204627990722656,
"learning_rate": 6.090095177146178e-06,
"loss": 5.1696,
"step": 1480
},
{
"epoch": 0.45025307849210544,
"grad_norm": 17.53434181213379,
"learning_rate": 6.042271496556255e-06,
"loss": 2.7874,
"step": 1490
},
{
"epoch": 0.4532749112336632,
"grad_norm": 21.362934112548828,
"learning_rate": 5.994347922357372e-06,
"loss": 3.8133,
"step": 1500
},
{
"epoch": 0.45629674397522096,
"grad_norm": 19.935638427734375,
"learning_rate": 5.946329047651037e-06,
"loss": 3.592,
"step": 1510
},
{
"epoch": 0.4593185767167787,
"grad_norm": 17.95412826538086,
"learning_rate": 5.8982194746725686e-06,
"loss": 2.7345,
"step": 1520
},
{
"epoch": 0.4623404094583365,
"grad_norm": 24.026193618774414,
"learning_rate": 5.850023814350007e-06,
"loss": 4.2519,
"step": 1530
},
{
"epoch": 0.46536224219989425,
"grad_norm": 12.00658893585205,
"learning_rate": 5.801746685862197e-06,
"loss": 6.0717,
"step": 1540
},
{
"epoch": 0.468384074941452,
"grad_norm": 14.519695281982422,
"learning_rate": 5.753392716196069e-06,
"loss": 2.8474,
"step": 1550
},
{
"epoch": 0.4714059076830098,
"grad_norm": 15.277630805969238,
"learning_rate": 5.704966539703185e-06,
"loss": 3.6301,
"step": 1560
},
{
"epoch": 0.4744277404245675,
"grad_norm": 17.934938430786133,
"learning_rate": 5.656472797655571e-06,
"loss": 4.4189,
"step": 1570
},
{
"epoch": 0.47744957316612524,
"grad_norm": 17.185529708862305,
"learning_rate": 5.60791613780088e-06,
"loss": 2.7758,
"step": 1580
},
{
"epoch": 0.480471405907683,
"grad_norm": 25.111557006835938,
"learning_rate": 5.5593012139169525e-06,
"loss": 4.296,
"step": 1590
},
{
"epoch": 0.48349323864924076,
"grad_norm": 23.77570343017578,
"learning_rate": 5.510632685365777e-06,
"loss": 4.4462,
"step": 1600
},
{
"epoch": 0.4865150713907985,
"grad_norm": 17.37128448486328,
"learning_rate": 5.461915216646938e-06,
"loss": 2.7426,
"step": 1610
},
{
"epoch": 0.4895369041323563,
"grad_norm": 23.484580993652344,
"learning_rate": 5.41315347695055e-06,
"loss": 4.2378,
"step": 1620
},
{
"epoch": 0.49255873687391405,
"grad_norm": 23.495826721191406,
"learning_rate": 5.364352139709758e-06,
"loss": 4.8879,
"step": 1630
},
{
"epoch": 0.4955805696154718,
"grad_norm": 16.23356819152832,
"learning_rate": 5.315515882152822e-06,
"loss": 3.5359,
"step": 1640
},
{
"epoch": 0.4986024023570295,
"grad_norm": 16.77799415588379,
"learning_rate": 5.266649384854842e-06,
"loss": 4.2516,
"step": 1650
},
{
"epoch": 0.5016242350985873,
"grad_norm": 21.264799118041992,
"learning_rate": 5.217757331289165e-06,
"loss": 3.6844,
"step": 1660
},
{
"epoch": 0.5046460678401451,
"grad_norm": 18.198184967041016,
"learning_rate": 5.168844407378506e-06,
"loss": 4.8485,
"step": 1670
},
{
"epoch": 0.5076679005817029,
"grad_norm": 13.497072219848633,
"learning_rate": 5.119915301045836e-06,
"loss": 2.8835,
"step": 1680
},
{
"epoch": 0.5106897333232605,
"grad_norm": 24.342716217041016,
"learning_rate": 5.070974701765089e-06,
"loss": 5.1527,
"step": 1690
},
{
"epoch": 0.5137115660648183,
"grad_norm": 25.917234420776367,
"learning_rate": 5.022027300111712e-06,
"loss": 4.3981,
"step": 1700
},
{
"epoch": 0.516733398806376,
"grad_norm": 15.280237197875977,
"learning_rate": 4.973077787313099e-06,
"loss": 4.4554,
"step": 1710
},
{
"epoch": 0.5197552315479338,
"grad_norm": 17.290264129638672,
"learning_rate": 4.924130854798983e-06,
"loss": 5.1108,
"step": 1720
},
{
"epoch": 0.5227770642894916,
"grad_norm": 15.63051700592041,
"learning_rate": 4.875191193751803e-06,
"loss": 2.8006,
"step": 1730
},
{
"epoch": 0.5257988970310493,
"grad_norm": 15.663633346557617,
"learning_rate": 4.826263494657077e-06,
"loss": 3.4979,
"step": 1740
},
{
"epoch": 0.5288207297726071,
"grad_norm": 35.42136001586914,
"learning_rate": 4.777352446853863e-06,
"loss": 4.9996,
"step": 1750
},
{
"epoch": 0.5318425625141648,
"grad_norm": 23.063594818115234,
"learning_rate": 4.72846273808533e-06,
"loss": 3.509,
"step": 1760
},
{
"epoch": 0.5348643952557226,
"grad_norm": 21.706233978271484,
"learning_rate": 4.679599054049458e-06,
"loss": 3.3899,
"step": 1770
},
{
"epoch": 0.5378862279972804,
"grad_norm": 20.82579231262207,
"learning_rate": 4.630766077949965e-06,
"loss": 5.9861,
"step": 1780
},
{
"epoch": 0.5409080607388381,
"grad_norm": 32.06898880004883,
"learning_rate": 4.5819684900474484e-06,
"loss": 4.3172,
"step": 1790
},
{
"epoch": 0.5439298934803959,
"grad_norm": 16.330984115600586,
"learning_rate": 4.5332109672108245e-06,
"loss": 4.4365,
"step": 1800
},
{
"epoch": 0.5469517262219536,
"grad_norm": 17.189834594726562,
"learning_rate": 4.484498182469085e-06,
"loss": 3.6319,
"step": 1810
},
{
"epoch": 0.5499735589635114,
"grad_norm": 19.211336135864258,
"learning_rate": 4.435834804563422e-06,
"loss": 5.8999,
"step": 1820
},
{
"epoch": 0.5529953917050692,
"grad_norm": 26.310638427734375,
"learning_rate": 4.387225497499767e-06,
"loss": 3.5792,
"step": 1830
},
{
"epoch": 0.5560172244466269,
"grad_norm": 20.680715560913086,
"learning_rate": 4.3386749201017856e-06,
"loss": 3.4555,
"step": 1840
},
{
"epoch": 0.5590390571881846,
"grad_norm": 15.533769607543945,
"learning_rate": 4.290187725564356e-06,
"loss": 6.0278,
"step": 1850
},
{
"epoch": 0.5620608899297423,
"grad_norm": 13.684257507324219,
"learning_rate": 4.2417685610076135e-06,
"loss": 3.4758,
"step": 1860
},
{
"epoch": 0.5650827226713001,
"grad_norm": 15.711587905883789,
"learning_rate": 4.193422067031535e-06,
"loss": 4.3166,
"step": 1870
},
{
"epoch": 0.5681045554128579,
"grad_norm": 18.764991760253906,
"learning_rate": 4.145152877271196e-06,
"loss": 4.1625,
"step": 1880
},
{
"epoch": 0.5711263881544156,
"grad_norm": 19.19873809814453,
"learning_rate": 4.096965617952667e-06,
"loss": 4.4233,
"step": 1890
},
{
"epoch": 0.5741482208959734,
"grad_norm": 20.817365646362305,
"learning_rate": 4.048864907449619e-06,
"loss": 3.5268,
"step": 1900
},
{
"epoch": 0.5771700536375312,
"grad_norm": 18.440645217895508,
"learning_rate": 4.000855355840695e-06,
"loss": 3.5747,
"step": 1910
},
{
"epoch": 0.5801918863790889,
"grad_norm": 15.997143745422363,
"learning_rate": 3.952941564467665e-06,
"loss": 4.2257,
"step": 1920
},
{
"epoch": 0.5832137191206467,
"grad_norm": 20.629562377929688,
"learning_rate": 3.905128125494427e-06,
"loss": 4.3136,
"step": 1930
},
{
"epoch": 0.5862355518622044,
"grad_norm": 33.730995178222656,
"learning_rate": 3.8574196214668876e-06,
"loss": 4.509,
"step": 1940
},
{
"epoch": 0.5892573846037622,
"grad_norm": 30.045576095581055,
"learning_rate": 3.8098206248737486e-06,
"loss": 5.139,
"step": 1950
},
{
"epoch": 0.59227921734532,
"grad_norm": 23.693470001220703,
"learning_rate": 3.7623356977082794e-06,
"loss": 2.5913,
"step": 1960
},
{
"epoch": 0.5953010500868777,
"grad_norm": 18.655092239379883,
"learning_rate": 3.714969391031084e-06,
"loss": 4.3328,
"step": 1970
},
{
"epoch": 0.5983228828284355,
"grad_norm": 15.45345687866211,
"learning_rate": 3.6677262445339136e-06,
"loss": 3.5691,
"step": 1980
},
{
"epoch": 0.6013447155699932,
"grad_norm": 21.302995681762695,
"learning_rate": 3.6206107861045803e-06,
"loss": 2.5934,
"step": 1990
},
{
"epoch": 0.604366548311551,
"grad_norm": 13.75935173034668,
"learning_rate": 3.5736275313929826e-06,
"loss": 4.3405,
"step": 2000
},
{
"epoch": 0.6073883810531087,
"grad_norm": 17.593429565429688,
"learning_rate": 3.5267809833783213e-06,
"loss": 4.8443,
"step": 2010
},
{
"epoch": 0.6104102137946664,
"grad_norm": 23.467853546142578,
"learning_rate": 3.4800756319375326e-06,
"loss": 3.4879,
"step": 2020
},
{
"epoch": 0.6134320465362242,
"grad_norm": 25.12725830078125,
"learning_rate": 3.433515953414953e-06,
"loss": 2.7966,
"step": 2030
},
{
"epoch": 0.616453879277782,
"grad_norm": 33.0245475769043,
"learning_rate": 3.387106410193308e-06,
"loss": 5.8078,
"step": 2040
},
{
"epoch": 0.6194757120193397,
"grad_norm": 18.8001651763916,
"learning_rate": 3.3408514502660195e-06,
"loss": 5.2049,
"step": 2050
},
{
"epoch": 0.6224975447608975,
"grad_norm": 16.787553787231445,
"learning_rate": 3.2947555068109057e-06,
"loss": 3.3988,
"step": 2060
},
{
"epoch": 0.6255193775024552,
"grad_norm": 21.532262802124023,
"learning_rate": 3.248822997765295e-06,
"loss": 2.815,
"step": 2070
},
{
"epoch": 0.628541210244013,
"grad_norm": 24.630603790283203,
"learning_rate": 3.203058325402599e-06,
"loss": 4.3332,
"step": 2080
},
{
"epoch": 0.6315630429855708,
"grad_norm": 16.667922973632812,
"learning_rate": 3.1574658759103904e-06,
"loss": 4.3038,
"step": 2090
},
{
"epoch": 0.6345848757271285,
"grad_norm": 20.671772003173828,
"learning_rate": 3.1120500189700204e-06,
"loss": 3.4132,
"step": 2100
},
{
"epoch": 0.6376067084686863,
"grad_norm": 21.932987213134766,
"learning_rate": 3.066815107337815e-06,
"loss": 4.1988,
"step": 2110
},
{
"epoch": 0.640628541210244,
"grad_norm": 17.348411560058594,
"learning_rate": 3.0217654764279114e-06,
"loss": 3.5937,
"step": 2120
},
{
"epoch": 0.6436503739518018,
"grad_norm": 25.625871658325195,
"learning_rate": 2.9769054438967192e-06,
"loss": 5.9817,
"step": 2130
},
{
"epoch": 0.6466722066933596,
"grad_norm": 31.0660457611084,
"learning_rate": 2.9322393092291256e-06,
"loss": 5.6772,
"step": 2140
},
{
"epoch": 0.6496940394349173,
"grad_norm": 20.511960983276367,
"learning_rate": 2.887771353326422e-06,
"loss": 4.2915,
"step": 2150
},
{
"epoch": 0.6527158721764751,
"grad_norm": 17.798234939575195,
"learning_rate": 2.8435058380959957e-06,
"loss": 2.642,
"step": 2160
},
{
"epoch": 0.6557377049180327,
"grad_norm": 18.133886337280273,
"learning_rate": 2.7994470060428835e-06,
"loss": 4.1208,
"step": 2170
},
{
"epoch": 0.6587595376595905,
"grad_norm": 18.74016571044922,
"learning_rate": 2.7555990798631436e-06,
"loss": 4.8817,
"step": 2180
},
{
"epoch": 0.6617813704011483,
"grad_norm": 15.885804176330566,
"learning_rate": 2.711966262039145e-06,
"loss": 3.3242,
"step": 2190
},
{
"epoch": 0.664803203142706,
"grad_norm": 24.100414276123047,
"learning_rate": 2.668552734436802e-06,
"loss": 4.3377,
"step": 2200
},
{
"epoch": 0.6678250358842638,
"grad_norm": 17.113306045532227,
"learning_rate": 2.6253626579047653e-06,
"loss": 5.7855,
"step": 2210
},
{
"epoch": 0.6708468686258215,
"grad_norm": 33.268699645996094,
"learning_rate": 2.582400171875638e-06,
"loss": 3.4326,
"step": 2220
},
{
"epoch": 0.6738687013673793,
"grad_norm": 29.673768997192383,
"learning_rate": 2.5396693939692474e-06,
"loss": 4.8596,
"step": 2230
},
{
"epoch": 0.6768905341089371,
"grad_norm": 14.550185203552246,
"learning_rate": 2.4971744195979985e-06,
"loss": 5.1031,
"step": 2240
},
{
"epoch": 0.6799123668504948,
"grad_norm": 32.16508102416992,
"learning_rate": 2.4549193215743706e-06,
"loss": 5.833,
"step": 2250
},
{
"epoch": 0.6829341995920526,
"grad_norm": 18.873088836669922,
"learning_rate": 2.4129081497205536e-06,
"loss": 3.3544,
"step": 2260
},
{
"epoch": 0.6859560323336104,
"grad_norm": 31.875137329101562,
"learning_rate": 2.3711449304803174e-06,
"loss": 4.0864,
"step": 2270
},
{
"epoch": 0.6889778650751681,
"grad_norm": 27.996572494506836,
"learning_rate": 2.329633666533103e-06,
"loss": 4.0582,
"step": 2280
},
{
"epoch": 0.6919996978167259,
"grad_norm": 19.299062728881836,
"learning_rate": 2.288378336410398e-06,
"loss": 4.2188,
"step": 2290
},
{
"epoch": 0.6950215305582836,
"grad_norm": 21.146148681640625,
"learning_rate": 2.2473828941144277e-06,
"loss": 4.8756,
"step": 2300
},
{
"epoch": 0.6980433632998414,
"grad_norm": 28.3226261138916,
"learning_rate": 2.20665126873919e-06,
"loss": 3.3593,
"step": 2310
},
{
"epoch": 0.7010651960413992,
"grad_norm": 16.02470588684082,
"learning_rate": 2.1661873640938818e-06,
"loss": 4.1255,
"step": 2320
},
{
"epoch": 0.7040870287829568,
"grad_norm": 21.263837814331055,
"learning_rate": 2.1259950583287633e-06,
"loss": 4.145,
"step": 2330
},
{
"epoch": 0.7071088615245146,
"grad_norm": 22.879661560058594,
"learning_rate": 2.086078203563439e-06,
"loss": 4.7453,
"step": 2340
},
{
"epoch": 0.7101306942660723,
"grad_norm": 15.726652145385742,
"learning_rate": 2.0464406255176967e-06,
"loss": 4.019,
"step": 2350
},
{
"epoch": 0.7131525270076301,
"grad_norm": 30.606904983520508,
"learning_rate": 2.0070861231448142e-06,
"loss": 4.9014,
"step": 2360
},
{
"epoch": 0.7161743597491879,
"grad_norm": 17.185054779052734,
"learning_rate": 1.968018468267472e-06,
"loss": 4.1918,
"step": 2370
},
{
"epoch": 0.7191961924907456,
"grad_norm": 15.510167121887207,
"learning_rate": 1.929241405216254e-06,
"loss": 4.0934,
"step": 2380
},
{
"epoch": 0.7222180252323034,
"grad_norm": 20.12055206298828,
"learning_rate": 1.8907586504707776e-06,
"loss": 4.701,
"step": 2390
},
{
"epoch": 0.7252398579738611,
"grad_norm": 19.135282516479492,
"learning_rate": 1.8525738923035002e-06,
"loss": 2.5439,
"step": 2400
},
{
"epoch": 0.7282616907154189,
"grad_norm": 19.167003631591797,
"learning_rate": 1.8146907904262268e-06,
"loss": 4.2791,
"step": 2410
},
{
"epoch": 0.7312835234569767,
"grad_norm": 24.79986572265625,
"learning_rate": 1.7771129756393545e-06,
"loss": 3.4256,
"step": 2420
},
{
"epoch": 0.7343053561985344,
"grad_norm": 20.59393310546875,
"learning_rate": 1.7398440494838947e-06,
"loss": 3.5206,
"step": 2430
},
{
"epoch": 0.7373271889400922,
"grad_norm": 25.903627395629883,
"learning_rate": 1.7028875838962822e-06,
"loss": 4.1281,
"step": 2440
},
{
"epoch": 0.74034902168165,
"grad_norm": 35.45489501953125,
"learning_rate": 1.6662471208660392e-06,
"loss": 4.0468,
"step": 2450
},
{
"epoch": 0.7433708544232077,
"grad_norm": 20.3117618560791,
"learning_rate": 1.6299261720963095e-06,
"loss": 4.1749,
"step": 2460
},
{
"epoch": 0.7463926871647655,
"grad_norm": 15.878867149353027,
"learning_rate": 1.5939282186672705e-06,
"loss": 4.8916,
"step": 2470
},
{
"epoch": 0.7494145199063232,
"grad_norm": 19.15277099609375,
"learning_rate": 1.5582567107025237e-06,
"loss": 4.8288,
"step": 2480
},
{
"epoch": 0.7524363526478809,
"grad_norm": 29.44374656677246,
"learning_rate": 1.5229150670384057e-06,
"loss": 3.3806,
"step": 2490
},
{
"epoch": 0.7554581853894387,
"grad_norm": 23.206140518188477,
"learning_rate": 1.4879066748963295e-06,
"loss": 2.5563,
"step": 2500
},
{
"epoch": 0.7584800181309964,
"grad_norm": 27.133193969726562,
"learning_rate": 1.4532348895581466e-06,
"loss": 3.4434,
"step": 2510
},
{
"epoch": 0.7615018508725542,
"grad_norm": 29.599319458007812,
"learning_rate": 1.4189030340445648e-06,
"loss": 6.7087,
"step": 2520
},
{
"epoch": 0.7645236836141119,
"grad_norm": 17.123348236083984,
"learning_rate": 1.3849143987966646e-06,
"loss": 4.9595,
"step": 2530
},
{
"epoch": 0.7675455163556697,
"grad_norm": 16.49233627319336,
"learning_rate": 1.3512722413605356e-06,
"loss": 4.0857,
"step": 2540
},
{
"epoch": 0.7705673490972275,
"grad_norm": 16.6666316986084,
"learning_rate": 1.3179797860750654e-06,
"loss": 4.8943,
"step": 2550
},
{
"epoch": 0.7735891818387852,
"grad_norm": 19.440494537353516,
"learning_rate": 1.2850402237629184e-06,
"loss": 4.1448,
"step": 2560
},
{
"epoch": 0.776611014580343,
"grad_norm": 14.674943923950195,
"learning_rate": 1.2524567114247083e-06,
"loss": 3.3491,
"step": 2570
},
{
"epoch": 0.7796328473219007,
"grad_norm": 16.349637985229492,
"learning_rate": 1.2202323719364324e-06,
"loss": 3.2897,
"step": 2580
},
{
"epoch": 0.7826546800634585,
"grad_norm": 19.67890739440918,
"learning_rate": 1.1883702937501708e-06,
"loss": 4.0901,
"step": 2590
},
{
"epoch": 0.7856765128050163,
"grad_norm": 21.339618682861328,
"learning_rate": 1.1568735305980694e-06,
"loss": 4.1003,
"step": 2600
},
{
"epoch": 0.788698345546574,
"grad_norm": 21.269119262695312,
"learning_rate": 1.1257451011996807e-06,
"loss": 3.4165,
"step": 2610
},
{
"epoch": 0.7917201782881318,
"grad_norm": 33.041419982910156,
"learning_rate": 1.0949879889726295e-06,
"loss": 3.4622,
"step": 2620
},
{
"epoch": 0.7947420110296896,
"grad_norm": 28.960115432739258,
"learning_rate": 1.0646051417466801e-06,
"loss": 3.4136,
"step": 2630
},
{
"epoch": 0.7977638437712473,
"grad_norm": 24.76239013671875,
"learning_rate": 1.0345994714812135e-06,
"loss": 4.1335,
"step": 2640
},
{
"epoch": 0.800785676512805,
"grad_norm": 15.773963928222656,
"learning_rate": 1.0049738539861332e-06,
"loss": 3.2818,
"step": 2650
},
{
"epoch": 0.8038075092543627,
"grad_norm": 21.248395919799805,
"learning_rate": 9.757311286462428e-07,
"loss": 4.1348,
"step": 2660
},
{
"epoch": 0.8068293419959205,
"grad_norm": 23.75290298461914,
"learning_rate": 9.468740981491143e-07,
"loss": 4.1947,
"step": 2670
},
{
"epoch": 0.8098511747374783,
"grad_norm": 16.7280330657959,
"learning_rate": 9.1840552821647e-07,
"loss": 4.0364,
"step": 2680
},
{
"epoch": 0.812873007479036,
"grad_norm": 17.696247100830078,
"learning_rate": 8.903281473391152e-07,
"loss": 3.3641,
"step": 2690
},
{
"epoch": 0.8158948402205938,
"grad_norm": 16.840299606323242,
"learning_rate": 8.62644646515427e-07,
"loss": 5.7446,
"step": 2700
},
{
"epoch": 0.8189166729621515,
"grad_norm": 13.25534725189209,
"learning_rate": 8.353576789934436e-07,
"loss": 3.3763,
"step": 2710
},
{
"epoch": 0.8219385057037093,
"grad_norm": 19.88932991027832,
"learning_rate": 8.084698600165797e-07,
"loss": 3.5133,
"step": 2720
},
{
"epoch": 0.8249603384452671,
"grad_norm": 17.921199798583984,
"learning_rate": 7.819837665729596e-07,
"loss": 4.1018,
"step": 2730
},
{
"epoch": 0.8279821711868248,
"grad_norm": 29.57664680480957,
"learning_rate": 7.559019371484521e-07,
"loss": 3.3378,
"step": 2740
},
{
"epoch": 0.8310040039283826,
"grad_norm": 17.720863342285156,
"learning_rate": 7.302268714833622e-07,
"loss": 4.1487,
"step": 2750
},
{
"epoch": 0.8340258366699403,
"grad_norm": 17.34684944152832,
"learning_rate": 7.049610303328541e-07,
"loss": 3.5199,
"step": 2760
},
{
"epoch": 0.8370476694114981,
"grad_norm": 16.739910125732422,
"learning_rate": 6.80106835231113e-07,
"loss": 4.2899,
"step": 2770
},
{
"epoch": 0.8400695021530559,
"grad_norm": 17.1294002532959,
"learning_rate": 6.556666682592494e-07,
"loss": 3.3016,
"step": 2780
},
{
"epoch": 0.8430913348946136,
"grad_norm": 14.801079750061035,
"learning_rate": 6.316428718170037e-07,
"loss": 2.4169,
"step": 2790
},
{
"epoch": 0.8461131676361714,
"grad_norm": 19.354856491088867,
"learning_rate": 6.080377483982425e-07,
"loss": 3.2883,
"step": 2800
},
{
"epoch": 0.849135000377729,
"grad_norm": 17.925838470458984,
"learning_rate": 5.848535603702798e-07,
"loss": 3.3497,
"step": 2810
},
{
"epoch": 0.8521568331192868,
"grad_norm": 20.340959548950195,
"learning_rate": 5.62092529757054e-07,
"loss": 6.4132,
"step": 2820
},
{
"epoch": 0.8551786658608446,
"grad_norm": 21.507797241210938,
"learning_rate": 5.397568380261559e-07,
"loss": 2.3404,
"step": 2830
},
{
"epoch": 0.8582004986024023,
"grad_norm": 16.9514102935791,
"learning_rate": 5.178486258797555e-07,
"loss": 4.0876,
"step": 2840
},
{
"epoch": 0.8612223313439601,
"grad_norm": 14.505171775817871,
"learning_rate": 4.963699930494365e-07,
"loss": 3.3715,
"step": 2850
},
{
"epoch": 0.8642441640855179,
"grad_norm": 22.551313400268555,
"learning_rate": 4.75322998094942e-07,
"loss": 4.2347,
"step": 2860
},
{
"epoch": 0.8672659968270756,
"grad_norm": 20.145078659057617,
"learning_rate": 4.5470965820689384e-07,
"loss": 2.5903,
"step": 2870
},
{
"epoch": 0.8702878295686334,
"grad_norm": 17.447914123535156,
"learning_rate": 4.345319490134453e-07,
"loss": 3.0177,
"step": 2880
},
{
"epoch": 0.8733096623101911,
"grad_norm": 16.10365104675293,
"learning_rate": 4.147918043909405e-07,
"loss": 4.764,
"step": 2890
},
{
"epoch": 0.8763314950517489,
"grad_norm": 19.066129684448242,
"learning_rate": 3.9549111627856794e-07,
"loss": 4.7699,
"step": 2900
},
{
"epoch": 0.8793533277933067,
"grad_norm": 19.604887008666992,
"learning_rate": 3.766317344970288e-07,
"loss": 4.1165,
"step": 2910
},
{
"epoch": 0.8823751605348644,
"grad_norm": 17.465734481811523,
"learning_rate": 3.582154665712473e-07,
"loss": 2.4443,
"step": 2920
},
{
"epoch": 0.8853969932764222,
"grad_norm": 22.400236129760742,
"learning_rate": 3.402440775571364e-07,
"loss": 4.0664,
"step": 2930
},
{
"epoch": 0.88841882601798,
"grad_norm": 21.420312881469727,
"learning_rate": 3.227192898724252e-07,
"loss": 5.7203,
"step": 2940
},
{
"epoch": 0.8914406587595377,
"grad_norm": 23.331478118896484,
"learning_rate": 3.056427831315878e-07,
"loss": 3.367,
"step": 2950
},
{
"epoch": 0.8944624915010955,
"grad_norm": 21.29648208618164,
"learning_rate": 2.890161939848535e-07,
"loss": 4.1604,
"step": 2960
},
{
"epoch": 0.8974843242426531,
"grad_norm": 15.172201156616211,
"learning_rate": 2.72841115961357e-07,
"loss": 4.2335,
"step": 2970
},
{
"epoch": 0.9005061569842109,
"grad_norm": 16.736038208007812,
"learning_rate": 2.5711909931640633e-07,
"loss": 3.9793,
"step": 2980
},
{
"epoch": 0.9035279897257686,
"grad_norm": 22.6779727935791,
"learning_rate": 2.418516508829e-07,
"loss": 2.4922,
"step": 2990
},
{
"epoch": 0.9065498224673264,
"grad_norm": 32.2912712097168,
"learning_rate": 2.270402339269162e-07,
"loss": 5.6454,
"step": 3000
},
{
"epoch": 0.9095716552088842,
"grad_norm": 18.107574462890625,
"learning_rate": 2.126862680074643e-07,
"loss": 5.0056,
"step": 3010
},
{
"epoch": 0.9125934879504419,
"grad_norm": 32.63033676147461,
"learning_rate": 1.9879112884043317e-07,
"loss": 2.5369,
"step": 3020
},
{
"epoch": 0.9156153206919997,
"grad_norm": 18.089956283569336,
"learning_rate": 1.853561481667404e-07,
"loss": 2.4556,
"step": 3030
},
{
"epoch": 0.9186371534335575,
"grad_norm": 13.772138595581055,
"learning_rate": 1.7238261362469256e-07,
"loss": 3.0884,
"step": 3040
},
{
"epoch": 0.9216589861751152,
"grad_norm": 22.537776947021484,
"learning_rate": 1.5987176862657883e-07,
"loss": 3.2805,
"step": 3050
},
{
"epoch": 0.924680818916673,
"grad_norm": 30.13243865966797,
"learning_rate": 1.4782481223949597e-07,
"loss": 3.2507,
"step": 3060
},
{
"epoch": 0.9277026516582307,
"grad_norm": 20.858510971069336,
"learning_rate": 1.3624289907042787e-07,
"loss": 4.1981,
"step": 3070
},
{
"epoch": 0.9307244843997885,
"grad_norm": 30.669658660888672,
"learning_rate": 1.2512713915559027e-07,
"loss": 4.9341,
"step": 3080
},
{
"epoch": 0.9337463171413463,
"grad_norm": 32.03891372680664,
"learning_rate": 1.1447859785403359e-07,
"loss": 4.8266,
"step": 3090
},
{
"epoch": 0.936768149882904,
"grad_norm": 18.382429122924805,
"learning_rate": 1.0429829574554573e-07,
"loss": 3.4044,
"step": 3100
},
{
"epoch": 0.9397899826244618,
"grad_norm": 16.341550827026367,
"learning_rate": 9.458720853282977e-08,
"loss": 4.1438,
"step": 3110
},
{
"epoch": 0.9428118153660195,
"grad_norm": 32.575286865234375,
"learning_rate": 8.534626694799485e-08,
"loss": 5.6917,
"step": 3120
},
{
"epoch": 0.9458336481075772,
"grad_norm": 19.515989303588867,
"learning_rate": 7.657635666335317e-08,
"loss": 2.5437,
"step": 3130
},
{
"epoch": 0.948855480849135,
"grad_norm": 18.81734275817871,
"learning_rate": 6.827831820653163e-08,
"loss": 2.5297,
"step": 3140
},
{
"epoch": 0.9518773135906927,
"grad_norm": 20.44892120361328,
"learning_rate": 6.045294687991643e-08,
"loss": 5.3046,
"step": 3150
},
{
"epoch": 0.9548991463322505,
"grad_norm": 30.222261428833008,
"learning_rate": 5.310099268443114e-08,
"loss": 7.1585,
"step": 3160
},
{
"epoch": 0.9579209790738082,
"grad_norm": 22.93487548828125,
"learning_rate": 4.622316024765039e-08,
"loss": 3.9296,
"step": 3170
},
{
"epoch": 0.960942811815366,
"grad_norm": 20.129398345947266,
"learning_rate": 3.982010875626885e-08,
"loss": 3.2971,
"step": 3180
},
{
"epoch": 0.9639646445569238,
"grad_norm": 20.64815330505371,
"learning_rate": 3.389245189292622e-08,
"loss": 4.1501,
"step": 3190
},
{
"epoch": 0.9669864772984815,
"grad_norm": 19.435129165649414,
"learning_rate": 2.8440757777385976e-08,
"loss": 4.9552,
"step": 3200
},
{
"epoch": 0.9700083100400393,
"grad_norm": 17.719867706298828,
"learning_rate": 2.3465548912088298e-08,
"loss": 2.6329,
"step": 3210
},
{
"epoch": 0.973030142781597,
"grad_norm": 21.178937911987305,
"learning_rate": 1.896730213207132e-08,
"loss": 4.0836,
"step": 3220
},
{
"epoch": 0.9760519755231548,
"grad_norm": 16.906330108642578,
"learning_rate": 1.4946448559270964e-08,
"loss": 2.397,
"step": 3230
},
{
"epoch": 0.9790738082647126,
"grad_norm": 23.301292419433594,
"learning_rate": 1.1403373561199583e-08,
"loss": 4.2365,
"step": 3240
},
{
"epoch": 0.9820956410062703,
"grad_norm": 20.07245635986328,
"learning_rate": 8.338416714013254e-09,
"loss": 3.444,
"step": 3250
},
{
"epoch": 0.9851174737478281,
"grad_norm": 16.27911949157715,
"learning_rate": 5.751871769965056e-09,
"loss": 5.5038,
"step": 3260
},
{
"epoch": 0.9881393064893859,
"grad_norm": 21.404827117919922,
"learning_rate": 3.643986629253138e-09,
"loss": 4.1734,
"step": 3270
},
{
"epoch": 0.9911611392309436,
"grad_norm": 32.63972473144531,
"learning_rate": 2.014963316257501e-09,
"loss": 4.9837,
"step": 3280
},
{
"epoch": 0.9941829719725013,
"grad_norm": 19.831165313720703,
"learning_rate": 8.649579601810454e-10,
"loss": 4.9867,
"step": 3290
},
{
"epoch": 0.997204804714059,
"grad_norm": 21.53673553466797,
"learning_rate": 1.9408078008431587e-10,
"loss": 3.3738,
"step": 3300
},
{
"epoch": 0.9996222709073053,
"step": 3308,
"total_flos": 1.175877708593234e+19,
"train_loss": 4.4771003486744005,
"train_runtime": 52781.7624,
"train_samples_per_second": 4.013,
"train_steps_per_second": 0.063
}
],
"logging_steps": 10,
"max_steps": 3309,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.175877708593234e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}