{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999162712810494,
"eval_steps": 500,
"global_step": 895,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011163829193413341,
"grad_norm": 0.3974737008844776,
"learning_rate": 2.2222222222222225e-06,
"loss": 1.607,
"step": 1
},
{
"epoch": 0.0055819145967066705,
"grad_norm": 0.4252789938746273,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.5942,
"step": 5
},
{
"epoch": 0.011163829193413341,
"grad_norm": 0.4658525758416883,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.5877,
"step": 10
},
{
"epoch": 0.01674574379012001,
"grad_norm": 0.27282017063503095,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.5695,
"step": 15
},
{
"epoch": 0.022327658386826682,
"grad_norm": 0.24165395076839943,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.558,
"step": 20
},
{
"epoch": 0.027909572983533353,
"grad_norm": 0.1767403193777301,
"learning_rate": 5.555555555555556e-05,
"loss": 1.4678,
"step": 25
},
{
"epoch": 0.03349148758024002,
"grad_norm": 0.16356442786177314,
"learning_rate": 6.666666666666667e-05,
"loss": 1.467,
"step": 30
},
{
"epoch": 0.039073402176946694,
"grad_norm": 0.15556520577978836,
"learning_rate": 7.777777777777778e-05,
"loss": 1.429,
"step": 35
},
{
"epoch": 0.044655316773653364,
"grad_norm": 0.1263609432879071,
"learning_rate": 8.888888888888889e-05,
"loss": 1.4253,
"step": 40
},
{
"epoch": 0.050237231370360035,
"grad_norm": 0.1696978939183065,
"learning_rate": 0.0001,
"loss": 1.3895,
"step": 45
},
{
"epoch": 0.055819145967066705,
"grad_norm": 0.10830406775154863,
"learning_rate": 0.00011111111111111112,
"loss": 1.3645,
"step": 50
},
{
"epoch": 0.061401060563773376,
"grad_norm": 0.08414898733986972,
"learning_rate": 0.00012222222222222224,
"loss": 1.3082,
"step": 55
},
{
"epoch": 0.06698297516048005,
"grad_norm": 0.07973185533121883,
"learning_rate": 0.00013333333333333334,
"loss": 1.2962,
"step": 60
},
{
"epoch": 0.07256488975718671,
"grad_norm": 0.09811845100733502,
"learning_rate": 0.00014444444444444444,
"loss": 1.3061,
"step": 65
},
{
"epoch": 0.07814680435389339,
"grad_norm": 0.08298371354138047,
"learning_rate": 0.00015555555555555556,
"loss": 1.3017,
"step": 70
},
{
"epoch": 0.08372871895060005,
"grad_norm": 0.07510078793315819,
"learning_rate": 0.0001666666666666667,
"loss": 1.2989,
"step": 75
},
{
"epoch": 0.08931063354730673,
"grad_norm": 0.07085309149624731,
"learning_rate": 0.00017777777777777779,
"loss": 1.2787,
"step": 80
},
{
"epoch": 0.09489254814401339,
"grad_norm": 0.09400917029194135,
"learning_rate": 0.00018888888888888888,
"loss": 1.2843,
"step": 85
},
{
"epoch": 0.10047446274072007,
"grad_norm": 0.09230059652672952,
"learning_rate": 0.0002,
"loss": 1.262,
"step": 90
},
{
"epoch": 0.10605637733742673,
"grad_norm": 0.10009657676945562,
"learning_rate": 0.00019998096274980728,
"loss": 1.2821,
"step": 95
},
{
"epoch": 0.11163829193413341,
"grad_norm": 0.12201167887174731,
"learning_rate": 0.000199923858247567,
"loss": 1.2668,
"step": 100
},
{
"epoch": 0.11722020653084007,
"grad_norm": 0.09628889966493127,
"learning_rate": 0.00019982870823553308,
"loss": 1.2503,
"step": 105
},
{
"epoch": 0.12280212112754675,
"grad_norm": 0.10028621820088561,
"learning_rate": 0.00019969554894159723,
"loss": 1.2632,
"step": 110
},
{
"epoch": 0.12838403572425341,
"grad_norm": 0.08593461106683208,
"learning_rate": 0.00019952443106549533,
"loss": 1.2396,
"step": 115
},
{
"epoch": 0.1339659503209601,
"grad_norm": 0.08827739693201113,
"learning_rate": 0.00019931541975950378,
"loss": 1.2784,
"step": 120
},
{
"epoch": 0.13954786491766677,
"grad_norm": 0.0911508607290428,
"learning_rate": 0.00019906859460363307,
"loss": 1.2689,
"step": 125
},
{
"epoch": 0.14512977951437342,
"grad_norm": 0.12157025851983183,
"learning_rate": 0.00019878404957532814,
"loss": 1.2563,
"step": 130
},
{
"epoch": 0.1507116941110801,
"grad_norm": 0.10772740664174668,
"learning_rate": 0.0001984618930136869,
"loss": 1.2853,
"step": 135
},
{
"epoch": 0.15629360870778677,
"grad_norm": 0.09940063564218579,
"learning_rate": 0.00019810224757821064,
"loss": 1.241,
"step": 140
},
{
"epoch": 0.16187552330449345,
"grad_norm": 0.09118466185918958,
"learning_rate": 0.00019770525020210204,
"loss": 1.2746,
"step": 145
},
{
"epoch": 0.1674574379012001,
"grad_norm": 0.09674538853934604,
"learning_rate": 0.0001972710520401287,
"loss": 1.2561,
"step": 150
},
{
"epoch": 0.17303935249790678,
"grad_norm": 0.1126652956332537,
"learning_rate": 0.0001967998184110713,
"loss": 1.257,
"step": 155
},
{
"epoch": 0.17862126709461346,
"grad_norm": 0.0869341846350413,
"learning_rate": 0.00019629172873477995,
"loss": 1.2529,
"step": 160
},
{
"epoch": 0.18420318169132013,
"grad_norm": 0.09888626799953022,
"learning_rate": 0.00019574697646386027,
"loss": 1.244,
"step": 165
},
{
"epoch": 0.18978509628802678,
"grad_norm": 0.09785278620381999,
"learning_rate": 0.0001951657690100178,
"loss": 1.2334,
"step": 170
},
{
"epoch": 0.19536701088473346,
"grad_norm": 0.07378537831469305,
"learning_rate": 0.0001945483276650868,
"loss": 1.2415,
"step": 175
},
{
"epoch": 0.20094892548144014,
"grad_norm": 0.08814263560160436,
"learning_rate": 0.0001938948875167745,
"loss": 1.2512,
"step": 180
},
{
"epoch": 0.20653084007814682,
"grad_norm": 0.09775538276417937,
"learning_rate": 0.00019320569735915271,
"loss": 1.2213,
"step": 185
},
{
"epoch": 0.21211275467485347,
"grad_norm": 0.09538626874304115,
"learning_rate": 0.00019248101959793066,
"loss": 1.2354,
"step": 190
},
{
"epoch": 0.21769466927156014,
"grad_norm": 0.08332625788355251,
"learning_rate": 0.00019172113015054532,
"loss": 1.2444,
"step": 195
},
{
"epoch": 0.22327658386826682,
"grad_norm": 0.08309090570657847,
"learning_rate": 0.00019092631834110723,
"loss": 1.2316,
"step": 200
},
{
"epoch": 0.2288584984649735,
"grad_norm": 0.09054323693110126,
"learning_rate": 0.0001900968867902419,
"loss": 1.27,
"step": 205
},
{
"epoch": 0.23444041306168015,
"grad_norm": 0.08549436898181585,
"learning_rate": 0.00018923315129986835,
"loss": 1.2348,
"step": 210
},
{
"epoch": 0.24002232765838682,
"grad_norm": 0.086610993256363,
"learning_rate": 0.00018833544073295917,
"loss": 1.2461,
"step": 215
},
{
"epoch": 0.2456042422550935,
"grad_norm": 0.08146109722648563,
"learning_rate": 0.00018740409688832764,
"loss": 1.2431,
"step": 220
},
{
"epoch": 0.2511861568518002,
"grad_norm": 0.08232534290451142,
"learning_rate": 0.00018643947437048944,
"loss": 1.2408,
"step": 225
},
{
"epoch": 0.25676807144850683,
"grad_norm": 0.08507739560575232,
"learning_rate": 0.00018544194045464886,
"loss": 1.243,
"step": 230
},
{
"epoch": 0.26234998604521353,
"grad_norm": 0.09782665661618925,
"learning_rate": 0.00018441187494686053,
"loss": 1.2426,
"step": 235
},
{
"epoch": 0.2679319006419202,
"grad_norm": 0.0809973818897895,
"learning_rate": 0.0001833496700394202,
"loss": 1.2345,
"step": 240
},
{
"epoch": 0.27351381523862683,
"grad_norm": 0.09269081567542259,
"learning_rate": 0.00018225573016153945,
"loss": 1.2343,
"step": 245
},
{
"epoch": 0.27909572983533354,
"grad_norm": 0.09671785308848269,
"learning_rate": 0.00018113047182536127,
"loss": 1.2327,
"step": 250
},
{
"epoch": 0.2846776444320402,
"grad_norm": 0.0906432644454991,
"learning_rate": 0.00017997432346737524,
"loss": 1.2532,
"step": 255
},
{
"epoch": 0.29025955902874684,
"grad_norm": 0.08371586611488784,
"learning_rate": 0.00017878772528529232,
"loss": 1.2384,
"step": 260
},
{
"epoch": 0.29584147362545354,
"grad_norm": 0.08640773776491195,
"learning_rate": 0.000177571129070442,
"loss": 1.2193,
"step": 265
},
{
"epoch": 0.3014233882221602,
"grad_norm": 0.08164649256677078,
"learning_rate": 0.00017632499803575474,
"loss": 1.2327,
"step": 270
},
{
"epoch": 0.3070053028188669,
"grad_norm": 0.09156690890905773,
"learning_rate": 0.00017504980663939613,
"loss": 1.2534,
"step": 275
},
{
"epoch": 0.31258721741557355,
"grad_norm": 0.08393163680296412,
"learning_rate": 0.00017374604040411935,
"loss": 1.2411,
"step": 280
},
{
"epoch": 0.3181691320122802,
"grad_norm": 0.08340859881557235,
"learning_rate": 0.00017241419573240462,
"loss": 1.2398,
"step": 285
},
{
"epoch": 0.3237510466089869,
"grad_norm": 0.08622506272483123,
"learning_rate": 0.00017105477971745666,
"loss": 1.2321,
"step": 290
},
{
"epoch": 0.32933296120569355,
"grad_norm": 0.08338497396964428,
"learning_rate": 0.00016966830995013133,
"loss": 1.2453,
"step": 295
},
{
"epoch": 0.3349148758024002,
"grad_norm": 0.08718794446584939,
"learning_rate": 0.00016825531432186543,
"loss": 1.2134,
"step": 300
},
{
"epoch": 0.3404967903991069,
"grad_norm": 0.09158015865602193,
"learning_rate": 0.00016681633082368498,
"loss": 1.223,
"step": 305
},
{
"epoch": 0.34607870499581356,
"grad_norm": 0.08768121171152027,
"learning_rate": 0.0001653519073413675,
"loss": 1.235,
"step": 310
},
{
"epoch": 0.3516606195925202,
"grad_norm": 0.08907125432704804,
"learning_rate": 0.00016386260144683745,
"loss": 1.2169,
"step": 315
},
{
"epoch": 0.3572425341892269,
"grad_norm": 0.08767993008424768,
"learning_rate": 0.00016234898018587337,
"loss": 1.2435,
"step": 320
},
{
"epoch": 0.36282444878593356,
"grad_norm": 0.08991663909567185,
"learning_rate": 0.00016081161986220807,
"loss": 1.2371,
"step": 325
},
{
"epoch": 0.36840636338264027,
"grad_norm": 0.07876061570647706,
"learning_rate": 0.00015925110581810394,
"loss": 1.2118,
"step": 330
},
{
"epoch": 0.3739882779793469,
"grad_norm": 0.09088539514665886,
"learning_rate": 0.00015766803221148673,
"loss": 1.2333,
"step": 335
},
{
"epoch": 0.37957019257605357,
"grad_norm": 0.09371191064756335,
"learning_rate": 0.00015606300178972287,
"loss": 1.2192,
"step": 340
},
{
"epoch": 0.38515210717276027,
"grad_norm": 0.0988524027231739,
"learning_rate": 0.00015443662566012645,
"loss": 1.2201,
"step": 345
},
{
"epoch": 0.3907340217694669,
"grad_norm": 0.08068655015289312,
"learning_rate": 0.00015278952305728324,
"loss": 1.2312,
"step": 350
},
{
"epoch": 0.39631593636617357,
"grad_norm": 0.08530580419429784,
"learning_rate": 0.00015112232110728015,
"loss": 1.2103,
"step": 355
},
{
"epoch": 0.4018978509628803,
"grad_norm": 0.0832856621155852,
"learning_rate": 0.00014943565458893,
"loss": 1.2049,
"step": 360
},
{
"epoch": 0.4074797655595869,
"grad_norm": 0.10112900442930213,
"learning_rate": 0.00014773016569208283,
"loss": 1.2381,
"step": 365
},
{
"epoch": 0.41306168015629363,
"grad_norm": 0.08250019530921109,
"learning_rate": 0.00014600650377311522,
"loss": 1.2185,
"step": 370
},
{
"epoch": 0.4186435947530003,
"grad_norm": 0.0987578329954232,
"learning_rate": 0.0001442653251076912,
"loss": 1.2222,
"step": 375
},
{
"epoch": 0.42422550934970693,
"grad_norm": 0.08530899013880136,
"learning_rate": 0.00014250729264088843,
"loss": 1.2556,
"step": 380
},
{
"epoch": 0.42980742394641364,
"grad_norm": 0.10267562745822716,
"learning_rate": 0.00014073307573478526,
"loss": 1.2146,
"step": 385
},
{
"epoch": 0.4353893385431203,
"grad_norm": 0.09189285950155643,
"learning_rate": 0.00013894334991360448,
"loss": 1.2206,
"step": 390
},
{
"epoch": 0.44097125313982694,
"grad_norm": 0.08370196846674145,
"learning_rate": 0.00013713879660651068,
"loss": 1.2076,
"step": 395
},
{
"epoch": 0.44655316773653364,
"grad_norm": 0.08423557906306067,
"learning_rate": 0.0001353201028881598,
"loss": 1.2223,
"step": 400
},
{
"epoch": 0.4521350823332403,
"grad_norm": 0.08292081122541138,
"learning_rate": 0.00013348796121709862,
"loss": 1.2294,
"step": 405
},
{
"epoch": 0.457716996929947,
"grad_norm": 0.08767079524531268,
"learning_rate": 0.00013164306917211476,
"loss": 1.2229,
"step": 410
},
{
"epoch": 0.46329891152665365,
"grad_norm": 0.0865942463810843,
"learning_rate": 0.000129786129186637,
"loss": 1.2163,
"step": 415
},
{
"epoch": 0.4688808261233603,
"grad_norm": 0.08101515714055764,
"learning_rate": 0.00012791784828128724,
"loss": 1.2337,
"step": 420
},
{
"epoch": 0.474462740720067,
"grad_norm": 0.09009147490161429,
"learning_rate": 0.00012603893779468604,
"loss": 1.2148,
"step": 425
},
{
"epoch": 0.48004465531677365,
"grad_norm": 0.08757351279515291,
"learning_rate": 0.0001241501131126138,
"loss": 1.2056,
"step": 430
},
{
"epoch": 0.4856265699134803,
"grad_norm": 0.08418609867162384,
"learning_rate": 0.00012225209339563145,
"loss": 1.2419,
"step": 435
},
{
"epoch": 0.491208484510187,
"grad_norm": 0.08790367723325618,
"learning_rate": 0.0001203456013052634,
"loss": 1.2115,
"step": 440
},
{
"epoch": 0.49679039910689365,
"grad_norm": 0.08071789319204539,
"learning_rate": 0.00011843136272884794,
"loss": 1.2072,
"step": 445
},
{
"epoch": 0.5023723137036004,
"grad_norm": 0.0879278395825441,
"learning_rate": 0.00011651010650315923,
"loss": 1.2194,
"step": 450
},
{
"epoch": 0.507954228300307,
"grad_norm": 0.08506166782358492,
"learning_rate": 0.00011458256413690633,
"loss": 1.2077,
"step": 455
},
{
"epoch": 0.5135361428970137,
"grad_norm": 0.08984730610411729,
"learning_rate": 0.00011264946953221496,
"loss": 1.2484,
"step": 460
},
{
"epoch": 0.5191180574937203,
"grad_norm": 0.2978083078661545,
"learning_rate": 0.00011071155870519777,
"loss": 1.2491,
"step": 465
},
{
"epoch": 0.5246999720904271,
"grad_norm": 0.08504227931172395,
"learning_rate": 0.00010876956950572006,
"loss": 1.2268,
"step": 470
},
{
"epoch": 0.5302818866871337,
"grad_norm": 0.08620167875904892,
"learning_rate": 0.0001068242413364671,
"loss": 1.2252,
"step": 475
},
{
"epoch": 0.5358638012838404,
"grad_norm": 0.08669957736640198,
"learning_rate": 0.00010487631487142017,
"loss": 1.217,
"step": 480
},
{
"epoch": 0.541445715880547,
"grad_norm": 0.08577871896034497,
"learning_rate": 0.00010292653177384876,
"loss": 1.2169,
"step": 485
},
{
"epoch": 0.5470276304772537,
"grad_norm": 0.08417260057895289,
"learning_rate": 0.00010097563441392581,
"loss": 1.2354,
"step": 490
},
{
"epoch": 0.5526095450739603,
"grad_norm": 0.08676422431924583,
"learning_rate": 9.90243655860742e-05,
"loss": 1.2039,
"step": 495
},
{
"epoch": 0.5581914596706671,
"grad_norm": 0.09103906295111437,
"learning_rate": 9.707346822615128e-05,
"loss": 1.2194,
"step": 500
},
{
"epoch": 0.5637733742673737,
"grad_norm": 0.08594537537719427,
"learning_rate": 9.512368512857984e-05,
"loss": 1.1949,
"step": 505
},
{
"epoch": 0.5693552888640804,
"grad_norm": 0.08392759057088481,
"learning_rate": 9.317575866353292e-05,
"loss": 1.2196,
"step": 510
},
{
"epoch": 0.574937203460787,
"grad_norm": 0.08201912454761111,
"learning_rate": 9.123043049427995e-05,
"loss": 1.2131,
"step": 515
},
{
"epoch": 0.5805191180574937,
"grad_norm": 0.08925291750313868,
"learning_rate": 8.928844129480227e-05,
"loss": 1.2369,
"step": 520
},
{
"epoch": 0.5861010326542004,
"grad_norm": 0.08954980070951671,
"learning_rate": 8.735053046778506e-05,
"loss": 1.2175,
"step": 525
},
{
"epoch": 0.5916829472509071,
"grad_norm": 0.08574100993825345,
"learning_rate": 8.541743586309365e-05,
"loss": 1.2166,
"step": 530
},
{
"epoch": 0.5972648618476137,
"grad_norm": 0.08840883290578404,
"learning_rate": 8.348989349684076e-05,
"loss": 1.2271,
"step": 535
},
{
"epoch": 0.6028467764443204,
"grad_norm": 0.08443946017557556,
"learning_rate": 8.156863727115211e-05,
"loss": 1.2329,
"step": 540
},
{
"epoch": 0.608428691041027,
"grad_norm": 0.0902640782545258,
"learning_rate": 7.965439869473664e-05,
"loss": 1.2253,
"step": 545
},
{
"epoch": 0.6140106056377338,
"grad_norm": 0.08988630625422679,
"learning_rate": 7.774790660436858e-05,
"loss": 1.1785,
"step": 550
},
{
"epoch": 0.6195925202344404,
"grad_norm": 0.08134808753957644,
"learning_rate": 7.584988688738622e-05,
"loss": 1.2261,
"step": 555
},
{
"epoch": 0.6251744348311471,
"grad_norm": 0.08768193779762151,
"learning_rate": 7.396106220531398e-05,
"loss": 1.2463,
"step": 560
},
{
"epoch": 0.6307563494278537,
"grad_norm": 0.0885816930556393,
"learning_rate": 7.208215171871277e-05,
"loss": 1.2141,
"step": 565
},
{
"epoch": 0.6363382640245604,
"grad_norm": 0.08553683878588977,
"learning_rate": 7.021387081336301e-05,
"loss": 1.2026,
"step": 570
},
{
"epoch": 0.641920178621267,
"grad_norm": 0.09505838067263224,
"learning_rate": 6.835693082788525e-05,
"loss": 1.2168,
"step": 575
},
{
"epoch": 0.6475020932179738,
"grad_norm": 0.08769224685329463,
"learning_rate": 6.651203878290139e-05,
"loss": 1.2493,
"step": 580
},
{
"epoch": 0.6530840078146805,
"grad_norm": 0.07990213288377576,
"learning_rate": 6.46798971118402e-05,
"loss": 1.2308,
"step": 585
},
{
"epoch": 0.6586659224113871,
"grad_norm": 0.08133261350163556,
"learning_rate": 6.286120339348935e-05,
"loss": 1.2014,
"step": 590
},
{
"epoch": 0.6642478370080938,
"grad_norm": 0.09363089434544866,
"learning_rate": 6.105665008639557e-05,
"loss": 1.2238,
"step": 595
},
{
"epoch": 0.6698297516048004,
"grad_norm": 0.07910287951552411,
"learning_rate": 5.926692426521474e-05,
"loss": 1.2473,
"step": 600
},
{
"epoch": 0.6754116662015072,
"grad_norm": 0.0801209902764544,
"learning_rate": 5.749270735911158e-05,
"loss": 1.1975,
"step": 605
},
{
"epoch": 0.6809935807982138,
"grad_norm": 0.08087293360533905,
"learning_rate": 5.573467489230879e-05,
"loss": 1.1966,
"step": 610
},
{
"epoch": 0.6865754953949205,
"grad_norm": 0.08220997258417966,
"learning_rate": 5.399349622688479e-05,
"loss": 1.2345,
"step": 615
},
{
"epoch": 0.6921574099916271,
"grad_norm": 0.0825575277760057,
"learning_rate": 5.226983430791722e-05,
"loss": 1.2289,
"step": 620
},
{
"epoch": 0.6977393245883338,
"grad_norm": 0.08305460425818378,
"learning_rate": 5.0564345411070025e-05,
"loss": 1.204,
"step": 625
},
{
"epoch": 0.7033212391850404,
"grad_norm": 0.08011105262542664,
"learning_rate": 4.8877678892719866e-05,
"loss": 1.1946,
"step": 630
},
{
"epoch": 0.7089031537817472,
"grad_norm": 0.08686069747720479,
"learning_rate": 4.721047694271676e-05,
"loss": 1.2,
"step": 635
},
{
"epoch": 0.7144850683784538,
"grad_norm": 0.08537977661965272,
"learning_rate": 4.556337433987359e-05,
"loss": 1.2054,
"step": 640
},
{
"epoch": 0.7200669829751605,
"grad_norm": 0.08857193949478791,
"learning_rate": 4.393699821027716e-05,
"loss": 1.1988,
"step": 645
},
{
"epoch": 0.7256488975718671,
"grad_norm": 0.09608004999262602,
"learning_rate": 4.2331967788513295e-05,
"loss": 1.2226,
"step": 650
},
{
"epoch": 0.7312308121685738,
"grad_norm": 0.08235757922811432,
"learning_rate": 4.074889418189608e-05,
"loss": 1.2202,
"step": 655
},
{
"epoch": 0.7368127267652805,
"grad_norm": 0.08660069823512372,
"learning_rate": 3.9188380137791936e-05,
"loss": 1.215,
"step": 660
},
{
"epoch": 0.7423946413619872,
"grad_norm": 0.08090639704744831,
"learning_rate": 3.7651019814126654e-05,
"loss": 1.2255,
"step": 665
},
{
"epoch": 0.7479765559586938,
"grad_norm": 0.08082821477995833,
"learning_rate": 3.613739855316257e-05,
"loss": 1.2176,
"step": 670
},
{
"epoch": 0.7535584705554005,
"grad_norm": 0.08469395080984878,
"learning_rate": 3.46480926586325e-05,
"loss": 1.2275,
"step": 675
},
{
"epoch": 0.7591403851521071,
"grad_norm": 0.0871555466504494,
"learning_rate": 3.3183669176315045e-05,
"loss": 1.2351,
"step": 680
},
{
"epoch": 0.7647222997488139,
"grad_norm": 0.08170223557553191,
"learning_rate": 3.174468567813461e-05,
"loss": 1.2074,
"step": 685
},
{
"epoch": 0.7703042143455205,
"grad_norm": 0.0838318843856818,
"learning_rate": 3.033169004986873e-05,
"loss": 1.2396,
"step": 690
},
{
"epoch": 0.7758861289422272,
"grad_norm": 0.08831381148889993,
"learning_rate": 2.894522028254334e-05,
"loss": 1.1947,
"step": 695
},
{
"epoch": 0.7814680435389338,
"grad_norm": 0.08158536981215994,
"learning_rate": 2.7585804267595384e-05,
"loss": 1.208,
"step": 700
},
{
"epoch": 0.7870499581356405,
"grad_norm": 0.08116519613000232,
"learning_rate": 2.6253959595880673e-05,
"loss": 1.2191,
"step": 705
},
{
"epoch": 0.7926318727323471,
"grad_norm": 0.08294169676184929,
"learning_rate": 2.495019336060387e-05,
"loss": 1.195,
"step": 710
},
{
"epoch": 0.7982137873290539,
"grad_norm": 0.08406756837278591,
"learning_rate": 2.367500196424529e-05,
"loss": 1.2203,
"step": 715
},
{
"epoch": 0.8037957019257606,
"grad_norm": 0.08211403607563178,
"learning_rate": 2.242887092955801e-05,
"loss": 1.2041,
"step": 720
},
{
"epoch": 0.8093776165224672,
"grad_norm": 0.07980978787138238,
"learning_rate": 2.121227471470768e-05,
"loss": 1.2394,
"step": 725
},
{
"epoch": 0.8149595311191739,
"grad_norm": 0.08416184610807921,
"learning_rate": 2.002567653262479e-05,
"loss": 1.2228,
"step": 730
},
{
"epoch": 0.8205414457158805,
"grad_norm": 0.08256062792318115,
"learning_rate": 1.8869528174638752e-05,
"loss": 1.203,
"step": 735
},
{
"epoch": 0.8261233603125873,
"grad_norm": 0.09043351264554417,
"learning_rate": 1.774426983846058e-05,
"loss": 1.2275,
"step": 740
},
{
"epoch": 0.8317052749092939,
"grad_norm": 0.08486147964302236,
"learning_rate": 1.6650329960579792e-05,
"loss": 1.2208,
"step": 745
},
{
"epoch": 0.8372871895060006,
"grad_norm": 0.0935945466460169,
"learning_rate": 1.5588125053139468e-05,
"loss": 1.2131,
"step": 750
},
{
"epoch": 0.8428691041027072,
"grad_norm": 0.08282716353976063,
"learning_rate": 1.4558059545351143e-05,
"loss": 1.2284,
"step": 755
},
{
"epoch": 0.8484510186994139,
"grad_norm": 0.08286515378820142,
"learning_rate": 1.3560525629510568e-05,
"loss": 1.2086,
"step": 760
},
{
"epoch": 0.8540329332961206,
"grad_norm": 0.08295259360853054,
"learning_rate": 1.259590311167238e-05,
"loss": 1.2061,
"step": 765
},
{
"epoch": 0.8596148478928273,
"grad_norm": 0.08358389042910293,
"learning_rate": 1.166455926704082e-05,
"loss": 1.222,
"step": 770
},
{
"epoch": 0.8651967624895339,
"grad_norm": 0.08388863476839661,
"learning_rate": 1.0766848700131648e-05,
"loss": 1.2143,
"step": 775
},
{
"epoch": 0.8707786770862406,
"grad_norm": 0.08277339984932784,
"learning_rate": 9.903113209758096e-06,
"loss": 1.2192,
"step": 780
},
{
"epoch": 0.8763605916829472,
"grad_norm": 0.08938310164317657,
"learning_rate": 9.073681658892775e-06,
"loss": 1.2191,
"step": 785
},
{
"epoch": 0.8819425062796539,
"grad_norm": 0.07910593096708422,
"learning_rate": 8.278869849454718e-06,
"loss": 1.2269,
"step": 790
},
{
"epoch": 0.8875244208763606,
"grad_norm": 0.08295037453317607,
"learning_rate": 7.5189804020693536e-06,
"loss": 1.2021,
"step": 795
},
{
"epoch": 0.8931063354730673,
"grad_norm": 0.08199446080472911,
"learning_rate": 6.794302640847294e-06,
"loss": 1.1961,
"step": 800
},
{
"epoch": 0.8986882500697739,
"grad_norm": 0.08481342663212112,
"learning_rate": 6.1051124832254944e-06,
"loss": 1.2069,
"step": 805
},
{
"epoch": 0.9042701646664806,
"grad_norm": 0.08217551850800063,
"learning_rate": 5.451672334913216e-06,
"loss": 1.2055,
"step": 810
},
{
"epoch": 0.9098520792631872,
"grad_norm": 0.08322503504827561,
"learning_rate": 4.834230989982213e-06,
"loss": 1.2156,
"step": 815
},
{
"epoch": 0.915433993859894,
"grad_norm": 0.08125961805104615,
"learning_rate": 4.253023536139733e-06,
"loss": 1.2005,
"step": 820
},
{
"epoch": 0.9210159084566006,
"grad_norm": 0.09037682759604541,
"learning_rate": 3.7082712652200867e-06,
"loss": 1.2079,
"step": 825
},
{
"epoch": 0.9265978230533073,
"grad_norm": 0.08711894287392291,
"learning_rate": 3.2001815889286856e-06,
"loss": 1.232,
"step": 830
},
{
"epoch": 0.9321797376500139,
"grad_norm": 0.08367132801462379,
"learning_rate": 2.728947959871353e-06,
"loss": 1.1858,
"step": 835
},
{
"epoch": 0.9377616522467206,
"grad_norm": 0.0809801248589102,
"learning_rate": 2.294749797897955e-06,
"loss": 1.1871,
"step": 840
},
{
"epoch": 0.9433435668434274,
"grad_norm": 0.08412969109149288,
"learning_rate": 1.8977524217893783e-06,
"loss": 1.2248,
"step": 845
},
{
"epoch": 0.948925481440134,
"grad_norm": 0.08014128153610968,
"learning_rate": 1.5381069863131037e-06,
"loss": 1.2312,
"step": 850
},
{
"epoch": 0.9545073960368406,
"grad_norm": 0.08040835492341503,
"learning_rate": 1.2159504246718522e-06,
"loss": 1.2213,
"step": 855
},
{
"epoch": 0.9600893106335473,
"grad_norm": 0.08170226749481643,
"learning_rate": 9.314053963669245e-07,
"loss": 1.2114,
"step": 860
},
{
"epoch": 0.965671225230254,
"grad_norm": 0.08123838559159317,
"learning_rate": 6.845802404962243e-07,
"loss": 1.2455,
"step": 865
},
{
"epoch": 0.9712531398269606,
"grad_norm": 0.08532355248950987,
"learning_rate": 4.7556893450466653e-07,
"loss": 1.2017,
"step": 870
},
{
"epoch": 0.9768350544236674,
"grad_norm": 0.07935413274906811,
"learning_rate": 3.044510584027771e-07,
"loss": 1.203,
"step": 875
},
{
"epoch": 0.982416969020374,
"grad_norm": 0.07922680701516337,
"learning_rate": 1.7129176446692984e-07,
"loss": 1.1993,
"step": 880
},
{
"epoch": 0.9879988836170807,
"grad_norm": 0.08007277288266887,
"learning_rate": 7.614175243301213e-08,
"loss": 1.221,
"step": 885
},
{
"epoch": 0.9935807982137873,
"grad_norm": 0.08190648675567455,
"learning_rate": 1.9037250192732726e-08,
"loss": 1.2245,
"step": 890
},
{
"epoch": 0.999162712810494,
"grad_norm": 0.07884795604109555,
"learning_rate": 0.0,
"loss": 1.2359,
"step": 895
},
{
"epoch": 0.999162712810494,
"eval_loss": 1.1748292446136475,
"eval_runtime": 1569.4225,
"eval_samples_per_second": 8.524,
"eval_steps_per_second": 0.533,
"step": 895
},
{
"epoch": 0.999162712810494,
"step": 895,
"total_flos": 1.1254972268150784e+16,
"train_loss": 1.2433469767011078,
"train_runtime": 20318.3129,
"train_samples_per_second": 2.821,
"train_steps_per_second": 0.044
}
],
"logging_steps": 5,
"max_steps": 895,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1254972268150784e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}