{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992103185048697,
"eval_steps": 500,
"global_step": 949,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010529086601737299,
"grad_norm": 4.026114469450001,
"learning_rate": 2.105263157894737e-06,
"loss": 1.3755,
"step": 1
},
{
"epoch": 0.0052645433008686494,
"grad_norm": 1.1251945428540173,
"learning_rate": 1.0526315789473684e-05,
"loss": 1.3102,
"step": 5
},
{
"epoch": 0.010529086601737299,
"grad_norm": 0.563945023031292,
"learning_rate": 2.105263157894737e-05,
"loss": 1.2626,
"step": 10
},
{
"epoch": 0.01579362990260595,
"grad_norm": 0.4336837922055097,
"learning_rate": 3.157894736842105e-05,
"loss": 1.2133,
"step": 15
},
{
"epoch": 0.021058173203474598,
"grad_norm": 0.33707427690007363,
"learning_rate": 4.210526315789474e-05,
"loss": 1.1636,
"step": 20
},
{
"epoch": 0.026322716504343247,
"grad_norm": 0.2412443925117212,
"learning_rate": 5.2631578947368424e-05,
"loss": 1.1845,
"step": 25
},
{
"epoch": 0.0315872598052119,
"grad_norm": 0.2414606698915264,
"learning_rate": 6.31578947368421e-05,
"loss": 1.1401,
"step": 30
},
{
"epoch": 0.03685180310608055,
"grad_norm": 0.22628642837844729,
"learning_rate": 7.368421052631579e-05,
"loss": 1.1591,
"step": 35
},
{
"epoch": 0.042116346406949196,
"grad_norm": 0.2208133931977146,
"learning_rate": 8.421052631578948e-05,
"loss": 1.1397,
"step": 40
},
{
"epoch": 0.04738088970781785,
"grad_norm": 0.21905054641153135,
"learning_rate": 9.473684210526316e-05,
"loss": 1.1295,
"step": 45
},
{
"epoch": 0.052645433008686494,
"grad_norm": 0.1951061692287286,
"learning_rate": 0.00010526315789473685,
"loss": 1.141,
"step": 50
},
{
"epoch": 0.05790997630955515,
"grad_norm": 0.18792729468212402,
"learning_rate": 0.00011578947368421053,
"loss": 1.121,
"step": 55
},
{
"epoch": 0.0631745196104238,
"grad_norm": 0.189919407852987,
"learning_rate": 0.0001263157894736842,
"loss": 1.1347,
"step": 60
},
{
"epoch": 0.06843906291129244,
"grad_norm": 0.18793881851552416,
"learning_rate": 0.0001368421052631579,
"loss": 1.0961,
"step": 65
},
{
"epoch": 0.0737036062121611,
"grad_norm": 0.18263188517758086,
"learning_rate": 0.00014736842105263158,
"loss": 1.0937,
"step": 70
},
{
"epoch": 0.07896814951302975,
"grad_norm": 0.18520098125405152,
"learning_rate": 0.00015789473684210527,
"loss": 1.1419,
"step": 75
},
{
"epoch": 0.08423269281389839,
"grad_norm": 0.18675524775004465,
"learning_rate": 0.00016842105263157895,
"loss": 1.1094,
"step": 80
},
{
"epoch": 0.08949723611476705,
"grad_norm": 0.18469057661828525,
"learning_rate": 0.00017894736842105264,
"loss": 1.0952,
"step": 85
},
{
"epoch": 0.0947617794156357,
"grad_norm": 0.17860571701450936,
"learning_rate": 0.00018947368421052632,
"loss": 1.1035,
"step": 90
},
{
"epoch": 0.10002632271650434,
"grad_norm": 0.2032976356381528,
"learning_rate": 0.0002,
"loss": 1.1329,
"step": 95
},
{
"epoch": 0.10529086601737299,
"grad_norm": 0.18932762375677964,
"learning_rate": 0.0001999830846194422,
"loss": 1.0902,
"step": 100
},
{
"epoch": 0.11055540931824165,
"grad_norm": 0.17823414518835126,
"learning_rate": 0.00019993234420037073,
"loss": 1.0951,
"step": 105
},
{
"epoch": 0.1158199526191103,
"grad_norm": 0.19033211796864122,
"learning_rate": 0.00019984779590865556,
"loss": 1.11,
"step": 110
},
{
"epoch": 0.12108449591997894,
"grad_norm": 0.1781090004150184,
"learning_rate": 0.0001997294683476273,
"loss": 1.1216,
"step": 115
},
{
"epoch": 0.1263490392208476,
"grad_norm": 0.20142566240295628,
"learning_rate": 0.0001995774015484005,
"loss": 1.088,
"step": 120
},
{
"epoch": 0.13161358252171623,
"grad_norm": 0.16738672746077932,
"learning_rate": 0.00019939164695633067,
"loss": 1.1069,
"step": 125
},
{
"epoch": 0.13687812582258488,
"grad_norm": 0.17141306079033702,
"learning_rate": 0.00019917226741361015,
"loss": 1.1178,
"step": 130
},
{
"epoch": 0.14214266912345355,
"grad_norm": 0.18242919862111662,
"learning_rate": 0.00019891933713800798,
"loss": 1.115,
"step": 135
},
{
"epoch": 0.1474072124243222,
"grad_norm": 0.18858703761293544,
"learning_rate": 0.00019863294169776148,
"loss": 1.092,
"step": 140
},
{
"epoch": 0.15267175572519084,
"grad_norm": 0.1851910906506613,
"learning_rate": 0.00019831317798262786,
"loss": 1.1015,
"step": 145
},
{
"epoch": 0.1579362990260595,
"grad_norm": 0.17061718616532065,
"learning_rate": 0.00019796015417110577,
"loss": 1.0834,
"step": 150
},
{
"epoch": 0.16320084232692814,
"grad_norm": 0.19083175263550564,
"learning_rate": 0.0001975739896938375,
"loss": 1.0915,
"step": 155
},
{
"epoch": 0.16846538562779678,
"grad_norm": 0.17041981222039004,
"learning_rate": 0.00019715481519320496,
"loss": 1.1045,
"step": 160
},
{
"epoch": 0.17372992892866543,
"grad_norm": 0.17539080495334333,
"learning_rate": 0.00019670277247913205,
"loss": 1.0822,
"step": 165
},
{
"epoch": 0.1789944722295341,
"grad_norm": 0.16847918243353582,
"learning_rate": 0.00019621801448110952,
"loss": 1.1113,
"step": 170
},
{
"epoch": 0.18425901553040275,
"grad_norm": 0.16577520965121645,
"learning_rate": 0.00019570070519645767,
"loss": 1.0726,
"step": 175
},
{
"epoch": 0.1895235588312714,
"grad_norm": 0.17216940817918563,
"learning_rate": 0.00019515101963484485,
"loss": 1.1214,
"step": 180
},
{
"epoch": 0.19478810213214004,
"grad_norm": 0.16717603329959776,
"learning_rate": 0.00019456914375908023,
"loss": 1.0749,
"step": 185
},
{
"epoch": 0.20005264543300869,
"grad_norm": 0.16743311436795275,
"learning_rate": 0.0001939552744222014,
"loss": 1.0856,
"step": 190
},
{
"epoch": 0.20531718873387733,
"grad_norm": 0.16628473925396028,
"learning_rate": 0.00019330961930087725,
"loss": 1.1088,
"step": 195
},
{
"epoch": 0.21058173203474598,
"grad_norm": 0.1735673710306468,
"learning_rate": 0.00019263239682514952,
"loss": 1.094,
"step": 200
},
{
"epoch": 0.21584627533561462,
"grad_norm": 0.16463208491106188,
"learning_rate": 0.00019192383610453618,
"loss": 1.1191,
"step": 205
},
{
"epoch": 0.2211108186364833,
"grad_norm": 0.1697201646634803,
"learning_rate": 0.00019118417685052194,
"loss": 1.1188,
"step": 210
},
{
"epoch": 0.22637536193735194,
"grad_norm": 0.15930348674889006,
"learning_rate": 0.00019041366929546219,
"loss": 1.1132,
"step": 215
},
{
"epoch": 0.2316399052382206,
"grad_norm": 0.16154587528605638,
"learning_rate": 0.0001896125741079272,
"loss": 1.1029,
"step": 220
},
{
"epoch": 0.23690444853908924,
"grad_norm": 0.1689593754891321,
"learning_rate": 0.00018878116230451613,
"loss": 1.1196,
"step": 225
},
{
"epoch": 0.24216899183995788,
"grad_norm": 0.158620581331537,
"learning_rate": 0.0001879197151581702,
"loss": 1.0786,
"step": 230
},
{
"epoch": 0.24743353514082653,
"grad_norm": 0.1591976970503649,
"learning_rate": 0.00018702852410301554,
"loss": 1.0861,
"step": 235
},
{
"epoch": 0.2526980784416952,
"grad_norm": 0.16271741933565712,
"learning_rate": 0.00018610789063576913,
"loss": 1.077,
"step": 240
},
{
"epoch": 0.25796262174256385,
"grad_norm": 0.16691583214883504,
"learning_rate": 0.00018515812621373997,
"loss": 1.0931,
"step": 245
},
{
"epoch": 0.26322716504343247,
"grad_norm": 0.15795453416798677,
"learning_rate": 0.00018417955214946092,
"loss": 1.0929,
"step": 250
},
{
"epoch": 0.26849170834430114,
"grad_norm": 0.15734119895920037,
"learning_rate": 0.00018317249950198597,
"loss": 1.086,
"step": 255
},
{
"epoch": 0.27375625164516976,
"grad_norm": 0.15815121783491273,
"learning_rate": 0.0001821373089648906,
"loss": 1.1142,
"step": 260
},
{
"epoch": 0.27902079494603843,
"grad_norm": 0.15790684873329372,
"learning_rate": 0.00018107433075101252,
"loss": 1.0907,
"step": 265
},
{
"epoch": 0.2842853382469071,
"grad_norm": 0.1603612919235376,
"learning_rate": 0.00017998392447397197,
"loss": 1.103,
"step": 270
},
{
"epoch": 0.2895498815477757,
"grad_norm": 0.1935643212000403,
"learning_rate": 0.00017886645902651167,
"loss": 1.1207,
"step": 275
},
{
"epoch": 0.2948144248486444,
"grad_norm": 0.16197395404790052,
"learning_rate": 0.0001777223124556978,
"loss": 1.1036,
"step": 280
},
{
"epoch": 0.300078968149513,
"grad_norm": 0.16503760296000086,
"learning_rate": 0.00017655187183502344,
"loss": 1.0647,
"step": 285
},
{
"epoch": 0.3053435114503817,
"grad_norm": 0.1772283442967409,
"learning_rate": 0.00017535553313345904,
"loss": 1.1075,
"step": 290
},
{
"epoch": 0.3106080547512503,
"grad_norm": 0.16282645295325013,
"learning_rate": 0.00017413370108149286,
"loss": 1.1094,
"step": 295
},
{
"epoch": 0.315872598052119,
"grad_norm": 0.15561402718068354,
"learning_rate": 0.00017288678903420762,
"loss": 1.0776,
"step": 300
},
{
"epoch": 0.32113714135298765,
"grad_norm": 0.15594031474920508,
"learning_rate": 0.00017161521883143934,
"loss": 1.1078,
"step": 305
},
{
"epoch": 0.32640168465385627,
"grad_norm": 0.1570395383591175,
"learning_rate": 0.00017031942065506576,
"loss": 1.1124,
"step": 310
},
{
"epoch": 0.33166622795472495,
"grad_norm": 0.15773152315944608,
"learning_rate": 0.00016899983288347248,
"loss": 1.0913,
"step": 315
},
{
"epoch": 0.33693077125559356,
"grad_norm": 0.15310806808595664,
"learning_rate": 0.00016765690194324616,
"loss": 1.0845,
"step": 320
},
{
"epoch": 0.34219531455646224,
"grad_norm": 0.16384678369715433,
"learning_rate": 0.00016629108215814525,
"loss": 1.1173,
"step": 325
},
{
"epoch": 0.34745985785733086,
"grad_norm": 0.165818325184464,
"learning_rate": 0.00016490283559539838,
"loss": 1.1014,
"step": 330
},
{
"epoch": 0.35272440115819953,
"grad_norm": 0.15456003221800826,
"learning_rate": 0.000163492631909384,
"loss": 1.0915,
"step": 335
},
{
"epoch": 0.3579889444590682,
"grad_norm": 0.16059867173233644,
"learning_rate": 0.00016206094818274229,
"loss": 1.0969,
"step": 340
},
{
"epoch": 0.3632534877599368,
"grad_norm": 0.17415674474557066,
"learning_rate": 0.00016060826876497478,
"loss": 1.1145,
"step": 345
},
{
"epoch": 0.3685180310608055,
"grad_norm": 0.16440403677512835,
"learning_rate": 0.0001591350851085851,
"loss": 1.0683,
"step": 350
},
{
"epoch": 0.3737825743616741,
"grad_norm": 0.15901493438320982,
"learning_rate": 0.00015764189560281677,
"loss": 1.1199,
"step": 355
},
{
"epoch": 0.3790471176625428,
"grad_norm": 0.15988293404570103,
"learning_rate": 0.00015612920540504453,
"loss": 1.0709,
"step": 360
},
{
"epoch": 0.3843116609634114,
"grad_norm": 0.1616109424204681,
"learning_rate": 0.00015459752626987563,
"loss": 1.1027,
"step": 365
},
{
"epoch": 0.3895762042642801,
"grad_norm": 0.1513607201651111,
"learning_rate": 0.00015304737637601926,
"loss": 1.0956,
"step": 370
},
{
"epoch": 0.3948407475651487,
"grad_norm": 0.15452619863423803,
"learning_rate": 0.0001514792801509831,
"loss": 1.0952,
"step": 375
},
{
"epoch": 0.40010529086601737,
"grad_norm": 0.15418975657555584,
"learning_rate": 0.00014989376809365493,
"loss": 1.0934,
"step": 380
},
{
"epoch": 0.40536983416688604,
"grad_norm": 0.15158447263390024,
"learning_rate": 0.00014829137659483143,
"loss": 1.0981,
"step": 385
},
{
"epoch": 0.41063437746775466,
"grad_norm": 0.15420702474431047,
"learning_rate": 0.0001466726477557527,
"loss": 1.1013,
"step": 390
},
{
"epoch": 0.41589892076862334,
"grad_norm": 0.1513401762569788,
"learning_rate": 0.00014503812920470534,
"loss": 1.1128,
"step": 395
},
{
"epoch": 0.42116346406949196,
"grad_norm": 0.1759021276212348,
"learning_rate": 0.00014338837391175582,
"loss": 1.0793,
"step": 400
},
{
"epoch": 0.42642800737036063,
"grad_norm": 0.15639002655528358,
"learning_rate": 0.00014172394000167623,
"loss": 1.1126,
"step": 405
},
{
"epoch": 0.43169255067122925,
"grad_norm": 0.1558922751326013,
"learning_rate": 0.00014004539056512667,
"loss": 1.0864,
"step": 410
},
{
"epoch": 0.4369570939720979,
"grad_norm": 0.15449223519766864,
"learning_rate": 0.00013835329346815716,
"loss": 1.1161,
"step": 415
},
{
"epoch": 0.4422216372729666,
"grad_norm": 0.15398779214531882,
"learning_rate": 0.0001366482211600945,
"loss": 1.113,
"step": 420
},
{
"epoch": 0.4474861805738352,
"grad_norm": 0.15902962443654645,
"learning_rate": 0.000134930750479878,
"loss": 1.0783,
"step": 425
},
{
"epoch": 0.4527507238747039,
"grad_norm": 0.15614703146804315,
"learning_rate": 0.00013320146246091074,
"loss": 1.0891,
"step": 430
},
{
"epoch": 0.4580152671755725,
"grad_norm": 0.151735228198923,
"learning_rate": 0.00013146094213449148,
"loss": 1.1006,
"step": 435
},
{
"epoch": 0.4632798104764412,
"grad_norm": 0.1633743946888902,
"learning_rate": 0.00012970977833189393,
"loss": 1.0717,
"step": 440
},
{
"epoch": 0.4685443537773098,
"grad_norm": 0.16534257355481496,
"learning_rate": 0.00012794856348516095,
"loss": 1.0778,
"step": 445
},
{
"epoch": 0.47380889707817847,
"grad_norm": 0.1856142828881669,
"learning_rate": 0.00012617789342668004,
"loss": 1.0859,
"step": 450
},
{
"epoch": 0.47907344037904714,
"grad_norm": 0.15229515578033356,
"learning_rate": 0.00012439836718760886,
"loss": 1.0761,
"step": 455
},
{
"epoch": 0.48433798367991576,
"grad_norm": 0.15984985984562605,
"learning_rate": 0.00012261058679521834,
"loss": 1.0926,
"step": 460
},
{
"epoch": 0.48960252698078444,
"grad_norm": 0.14896040772758903,
"learning_rate": 0.00012081515706922227,
"loss": 1.0834,
"step": 465
},
{
"epoch": 0.49486707028165305,
"grad_norm": 0.1514924492192347,
"learning_rate": 0.00011901268541716224,
"loss": 1.0885,
"step": 470
},
{
"epoch": 0.5001316135825217,
"grad_norm": 0.1513889418015892,
"learning_rate": 0.00011720378162891708,
"loss": 1.1001,
"step": 475
},
{
"epoch": 0.5053961568833903,
"grad_norm": 0.15159825336613816,
"learning_rate": 0.0001153890576704062,
"loss": 1.1082,
"step": 480
},
{
"epoch": 0.510660700184259,
"grad_norm": 0.15427722774659086,
"learning_rate": 0.00011356912747655685,
"loss": 1.0843,
"step": 485
},
{
"epoch": 0.5159252434851277,
"grad_norm": 0.14639500931900093,
"learning_rate": 0.00011174460674360549,
"loss": 1.1058,
"step": 490
},
{
"epoch": 0.5211897867859964,
"grad_norm": 0.15320269723203808,
"learning_rate": 0.00010991611272080269,
"loss": 1.1125,
"step": 495
},
{
"epoch": 0.5264543300868649,
"grad_norm": 0.15092814943890553,
"learning_rate": 0.00010808426400159338,
"loss": 1.0898,
"step": 500
},
{
"epoch": 0.5317188733877336,
"grad_norm": 0.14712598563479434,
"learning_rate": 0.00010624968031434173,
"loss": 1.0975,
"step": 505
},
{
"epoch": 0.5369834166886023,
"grad_norm": 0.1506174008648404,
"learning_rate": 0.00010441298231267242,
"loss": 1.0789,
"step": 510
},
{
"epoch": 0.542247959989471,
"grad_norm": 0.14915164476738402,
"learning_rate": 0.00010257479136549889,
"loss": 1.088,
"step": 515
},
{
"epoch": 0.5475125032903395,
"grad_norm": 0.14933216158522156,
"learning_rate": 0.00010073572934680919,
"loss": 1.1012,
"step": 520
},
{
"epoch": 0.5527770465912082,
"grad_norm": 0.1623395783916047,
"learning_rate": 9.889641842528178e-05,
"loss": 1.0992,
"step": 525
},
{
"epoch": 0.5580415898920769,
"grad_norm": 0.15524883773019818,
"learning_rate": 9.70574808538006e-05,
"loss": 1.0558,
"step": 530
},
{
"epoch": 0.5633061331929455,
"grad_norm": 0.14879516385003932,
"learning_rate": 9.521953875894257e-05,
"loss": 1.0634,
"step": 535
},
{
"epoch": 0.5685706764938142,
"grad_norm": 0.14856407933911947,
"learning_rate": 9.338321393050719e-05,
"loss": 1.0513,
"step": 540
},
{
"epoch": 0.5738352197946828,
"grad_norm": 0.1514919636398635,
"learning_rate": 9.154912761116056e-05,
"loss": 1.0899,
"step": 545
},
{
"epoch": 0.5790997630955514,
"grad_norm": 0.15005939408454377,
"learning_rate": 8.971790028626395e-05,
"loss": 1.09,
"step": 550
},
{
"epoch": 0.5843643063964201,
"grad_norm": 0.1541140355049706,
"learning_rate": 8.789015147395919e-05,
"loss": 1.072,
"step": 555
},
{
"epoch": 0.5896288496972888,
"grad_norm": 0.14756189100480177,
"learning_rate": 8.606649951558073e-05,
"loss": 1.0548,
"step": 560
},
{
"epoch": 0.5948933929981574,
"grad_norm": 0.14468591274130843,
"learning_rate": 8.424756136646623e-05,
"loss": 1.056,
"step": 565
},
{
"epoch": 0.600157936299026,
"grad_norm": 0.1510683202100121,
"learning_rate": 8.243395238723571e-05,
"loss": 1.0999,
"step": 570
},
{
"epoch": 0.6054224795998947,
"grad_norm": 0.14942489035639112,
"learning_rate": 8.062628613561051e-05,
"loss": 1.08,
"step": 575
},
{
"epoch": 0.6106870229007634,
"grad_norm": 0.14792710995590722,
"learning_rate": 7.8825174158842e-05,
"loss": 1.0916,
"step": 580
},
{
"epoch": 0.615951566201632,
"grad_norm": 0.14543568608581728,
"learning_rate": 7.703122578682046e-05,
"loss": 1.061,
"step": 585
},
{
"epoch": 0.6212161095025006,
"grad_norm": 0.14792849899325772,
"learning_rate": 7.524504792593419e-05,
"loss": 1.1101,
"step": 590
},
{
"epoch": 0.6264806528033693,
"grad_norm": 0.14574924924348462,
"learning_rate": 7.346724485374837e-05,
"loss": 1.0687,
"step": 595
},
{
"epoch": 0.631745196104238,
"grad_norm": 0.1434166906369258,
"learning_rate": 7.169841801457347e-05,
"loss": 1.0825,
"step": 600
},
{
"epoch": 0.6370097394051066,
"grad_norm": 0.14254720323207454,
"learning_rate": 6.993916581599202e-05,
"loss": 1.0896,
"step": 605
},
{
"epoch": 0.6422742827059753,
"grad_norm": 0.14534591022474969,
"learning_rate": 6.819008342641273e-05,
"loss": 1.0805,
"step": 610
},
{
"epoch": 0.6475388260068439,
"grad_norm": 0.1471482502229213,
"learning_rate": 6.645176257372055e-05,
"loss": 1.0933,
"step": 615
},
{
"epoch": 0.6528033693077125,
"grad_norm": 0.14967562406928056,
"learning_rate": 6.472479134509052e-05,
"loss": 1.0987,
"step": 620
},
{
"epoch": 0.6580679126085812,
"grad_norm": 0.14756218985788289,
"learning_rate": 6.300975398803362e-05,
"loss": 1.0862,
"step": 625
},
{
"epoch": 0.6633324559094499,
"grad_norm": 0.14358810278632364,
"learning_rate": 6.130723071274107e-05,
"loss": 1.0736,
"step": 630
},
{
"epoch": 0.6685969992103185,
"grad_norm": 0.14508119820046267,
"learning_rate": 5.961779749579516e-05,
"loss": 1.077,
"step": 635
},
{
"epoch": 0.6738615425111871,
"grad_norm": 0.14868475648668983,
"learning_rate": 5.794202588531166e-05,
"loss": 1.0921,
"step": 640
},
{
"epoch": 0.6791260858120558,
"grad_norm": 0.14136660751737096,
"learning_rate": 5.628048280758096e-05,
"loss": 1.0967,
"step": 645
},
{
"epoch": 0.6843906291129245,
"grad_norm": 0.14429824406995242,
"learning_rate": 5.4633730375272594e-05,
"loss": 1.094,
"step": 650
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.1435583500936634,
"learning_rate": 5.300232569726804e-05,
"loss": 1.0796,
"step": 655
},
{
"epoch": 0.6949197157146617,
"grad_norm": 0.14917594264214823,
"learning_rate": 5.13868206901867e-05,
"loss": 1.0813,
"step": 660
},
{
"epoch": 0.7001842590155304,
"grad_norm": 0.14484547003342338,
"learning_rate": 4.9787761891668397e-05,
"loss": 1.0833,
"step": 665
},
{
"epoch": 0.7054488023163991,
"grad_norm": 0.14125281408090304,
"learning_rate": 4.820569027547533e-05,
"loss": 1.0813,
"step": 670
},
{
"epoch": 0.7107133456172677,
"grad_norm": 0.1408995053360923,
"learning_rate": 4.6641141068476666e-05,
"loss": 1.0752,
"step": 675
},
{
"epoch": 0.7159778889181364,
"grad_norm": 0.1414179653044325,
"learning_rate": 4.5094643569577186e-05,
"loss": 1.054,
"step": 680
},
{
"epoch": 0.721242432219005,
"grad_norm": 0.14582058548503438,
"learning_rate": 4.356672097065134e-05,
"loss": 1.1048,
"step": 685
},
{
"epoch": 0.7265069755198736,
"grad_norm": 0.14009606861616825,
"learning_rate": 4.205789017954364e-05,
"loss": 1.0683,
"step": 690
},
{
"epoch": 0.7317715188207423,
"grad_norm": 0.14586506040118713,
"learning_rate": 4.056866164519465e-05,
"loss": 1.0728,
"step": 695
},
{
"epoch": 0.737036062121611,
"grad_norm": 0.14168474565307407,
"learning_rate": 3.909953918495234e-05,
"loss": 1.0476,
"step": 700
},
{
"epoch": 0.7423006054224796,
"grad_norm": 0.14476382479542646,
"learning_rate": 3.7651019814126654e-05,
"loss": 1.05,
"step": 705
},
{
"epoch": 0.7475651487233482,
"grad_norm": 0.14528550784733454,
"learning_rate": 3.622359357784569e-05,
"loss": 1.0611,
"step": 710
},
{
"epoch": 0.7528296920242169,
"grad_norm": 0.14781069746763306,
"learning_rate": 3.481774338526954e-05,
"loss": 1.0952,
"step": 715
},
{
"epoch": 0.7580942353250856,
"grad_norm": 0.15618197530507127,
"learning_rate": 3.343394484621855e-05,
"loss": 1.0836,
"step": 720
},
{
"epoch": 0.7633587786259542,
"grad_norm": 0.22087793925041818,
"learning_rate": 3.207266611027069e-05,
"loss": 1.0727,
"step": 725
},
{
"epoch": 0.7686233219268228,
"grad_norm": 0.14674869869141435,
"learning_rate": 3.0734367708383294e-05,
"loss": 1.0712,
"step": 730
},
{
"epoch": 0.7738878652276915,
"grad_norm": 0.14673826341334423,
"learning_rate": 2.9419502397091713e-05,
"loss": 1.0852,
"step": 735
},
{
"epoch": 0.7791524085285602,
"grad_norm": 0.1426087824509766,
"learning_rate": 2.812851500533843e-05,
"loss": 1.0604,
"step": 740
},
{
"epoch": 0.7844169518294288,
"grad_norm": 0.1446320144127932,
"learning_rate": 2.6861842283983953e-05,
"loss": 1.0537,
"step": 745
},
{
"epoch": 0.7896814951302974,
"grad_norm": 0.14326111319394175,
"learning_rate": 2.5619912758050725e-05,
"loss": 1.0942,
"step": 750
},
{
"epoch": 0.7949460384311661,
"grad_norm": 0.14149919988871043,
"learning_rate": 2.4403146581749925e-05,
"loss": 1.0578,
"step": 755
},
{
"epoch": 0.8002105817320347,
"grad_norm": 0.14034086298796508,
"learning_rate": 2.3211955396340002e-05,
"loss": 1.0808,
"step": 760
},
{
"epoch": 0.8054751250329034,
"grad_norm": 0.1433790314655123,
"learning_rate": 2.204674219086531e-05,
"loss": 1.0906,
"step": 765
},
{
"epoch": 0.8107396683337721,
"grad_norm": 0.138618618401559,
"learning_rate": 2.090790116582191e-05,
"loss": 1.0559,
"step": 770
},
{
"epoch": 0.8160042116346407,
"grad_norm": 0.1429827381187093,
"learning_rate": 1.9795817599796418e-05,
"loss": 1.0792,
"step": 775
},
{
"epoch": 0.8212687549355093,
"grad_norm": 0.14200271718072968,
"learning_rate": 1.871086771912348e-05,
"loss": 1.0702,
"step": 780
},
{
"epoch": 0.826533298236378,
"grad_norm": 0.1429932480295589,
"learning_rate": 1.7653418570605475e-05,
"loss": 1.0715,
"step": 785
},
{
"epoch": 0.8317978415372467,
"grad_norm": 0.14431467515210814,
"learning_rate": 1.6623827897337762e-05,
"loss": 1.0713,
"step": 790
},
{
"epoch": 0.8370623848381153,
"grad_norm": 0.15238820455432608,
"learning_rate": 1.562244401768144e-05,
"loss": 1.0824,
"step": 795
},
{
"epoch": 0.8423269281389839,
"grad_norm": 0.14830242766673976,
"learning_rate": 1.4649605707424707e-05,
"loss": 1.0787,
"step": 800
},
{
"epoch": 0.8475914714398526,
"grad_norm": 0.14468170557092047,
"learning_rate": 1.3705642085172366e-05,
"loss": 1.0737,
"step": 805
},
{
"epoch": 0.8528560147407213,
"grad_norm": 0.14674968769736463,
"learning_rate": 1.2790872501002472e-05,
"loss": 1.0577,
"step": 810
},
{
"epoch": 0.8581205580415899,
"grad_norm": 0.14311627432536864,
"learning_rate": 1.1905606428427774e-05,
"loss": 1.0692,
"step": 815
},
{
"epoch": 0.8633851013424585,
"grad_norm": 0.14558376197107287,
"learning_rate": 1.105014335969855e-05,
"loss": 1.0934,
"step": 820
},
{
"epoch": 0.8686496446433272,
"grad_norm": 0.14414555681497093,
"learning_rate": 1.0224772704482033e-05,
"loss": 1.0875,
"step": 825
},
{
"epoch": 0.8739141879441958,
"grad_norm": 0.1399627142514978,
"learning_rate": 9.429773691952858e-06,
"loss": 1.082,
"step": 830
},
{
"epoch": 0.8791787312450645,
"grad_norm": 0.1392001373823857,
"learning_rate": 8.665415276327871e-06,
"loss": 1.0573,
"step": 835
},
{
"epoch": 0.8844432745459332,
"grad_norm": 0.13993969105859186,
"learning_rate": 7.931956045876688e-06,
"loss": 1.0448,
"step": 840
},
{
"epoch": 0.8897078178468018,
"grad_norm": 0.16741517197447736,
"learning_rate": 7.229644135439473e-06,
"loss": 1.104,
"step": 845
},
{
"epoch": 0.8949723611476704,
"grad_norm": 0.14123729142229655,
"learning_rate": 6.558717142480919e-06,
"loss": 1.0808,
"step": 850
},
{
"epoch": 0.9002369044485391,
"grad_norm": 0.1424278055064695,
"learning_rate": 5.919402046709288e-06,
"loss": 1.0709,
"step": 855
},
{
"epoch": 0.9055014477494078,
"grad_norm": 0.13993993967003346,
"learning_rate": 5.311915133287415e-06,
"loss": 1.0941,
"step": 860
},
{
"epoch": 0.9107659910502763,
"grad_norm": 0.14557850289664284,
"learning_rate": 4.7364619196617495e-06,
"loss": 1.0492,
"step": 865
},
{
"epoch": 0.916030534351145,
"grad_norm": 0.1450177459066908,
"learning_rate": 4.193237086034351e-06,
"loss": 1.0972,
"step": 870
},
{
"epoch": 0.9212950776520137,
"grad_norm": 0.1570091074884799,
"learning_rate": 3.6824244095010065e-06,
"loss": 1.0695,
"step": 875
},
{
"epoch": 0.9265596209528824,
"grad_norm": 0.14097561405495265,
"learning_rate": 3.2041967018780707e-06,
"loss": 1.0948,
"step": 880
},
{
"epoch": 0.931824164253751,
"grad_norm": 0.1420984285291773,
"learning_rate": 2.7587157512388718e-06,
"loss": 1.0573,
"step": 885
},
{
"epoch": 0.9370887075546196,
"grad_norm": 0.1545471738706476,
"learning_rate": 2.346132267179646e-06,
"loss": 1.0786,
"step": 890
},
{
"epoch": 0.9423532508554883,
"grad_norm": 0.14481364480205125,
"learning_rate": 1.9665858298333005e-06,
"loss": 1.0939,
"step": 895
},
{
"epoch": 0.9476177941563569,
"grad_norm": 0.1446556897144525,
"learning_rate": 1.6202048426483651e-06,
"loss": 1.0752,
"step": 900
},
{
"epoch": 0.9528823374572256,
"grad_norm": 0.13840641658264988,
"learning_rate": 1.3071064889491724e-06,
"loss": 1.0757,
"step": 905
},
{
"epoch": 0.9581468807580943,
"grad_norm": 0.1405867091258211,
"learning_rate": 1.0273966922918155e-06,
"loss": 1.0886,
"step": 910
},
{
"epoch": 0.9634114240589629,
"grad_norm": 0.15143973079201015,
"learning_rate": 7.81170080629412e-07,
"loss": 1.0337,
"step": 915
},
{
"epoch": 0.9686759673598315,
"grad_norm": 0.15113893856195346,
"learning_rate": 5.68509954298757e-07,
"loss": 1.099,
"step": 920
},
{
"epoch": 0.9739405106607002,
"grad_norm": 0.1436446854214333,
"learning_rate": 3.8948825783918784e-07,
"loss": 1.0595,
"step": 925
},
{
"epoch": 0.9792050539615689,
"grad_norm": 0.14373165990559605,
"learning_rate": 2.4416555565318635e-07,
"loss": 1.0815,
"step": 930
},
{
"epoch": 0.9844695972624374,
"grad_norm": 0.14233020784379563,
"learning_rate": 1.3259101151694708e-07,
"loss": 1.0569,
"step": 935
},
{
"epoch": 0.9897341405633061,
"grad_norm": 0.13823967108377017,
"learning_rate": 5.480237194799287e-08,
"loss": 1.0689,
"step": 940
},
{
"epoch": 0.9949986838641748,
"grad_norm": 0.1431568671824589,
"learning_rate": 1.0825953435122938e-08,
"loss": 1.0709,
"step": 945
},
{
"epoch": 0.9992103185048697,
"eval_loss": 1.07915198802948,
"eval_runtime": 3821.2872,
"eval_samples_per_second": 3.522,
"eval_steps_per_second": 0.881,
"step": 949
},
{
"epoch": 0.9992103185048697,
"step": 949,
"total_flos": 1959448100732928.0,
"train_loss": 1.0930153100081064,
"train_runtime": 22340.3866,
"train_samples_per_second": 2.72,
"train_steps_per_second": 0.042
}
],
"logging_steps": 5,
"max_steps": 949,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1959448100732928.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}