{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1638,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01833180568285976,
"grad_norm": 0.6833510994911194,
"learning_rate": 0.00018,
"loss": 1.8247451782226562,
"step": 10
},
{
"epoch": 0.03666361136571952,
"grad_norm": 0.9012069702148438,
"learning_rate": 0.0001988943488943489,
"loss": 1.4739965438842773,
"step": 20
},
{
"epoch": 0.054995417048579284,
"grad_norm": 0.8267062306404114,
"learning_rate": 0.00019766584766584767,
"loss": 1.3358205795288085,
"step": 30
},
{
"epoch": 0.07332722273143905,
"grad_norm": 0.7565646767616272,
"learning_rate": 0.00019643734643734644,
"loss": 1.2644735336303712,
"step": 40
},
{
"epoch": 0.09165902841429881,
"grad_norm": 0.8591431975364685,
"learning_rate": 0.0001952088452088452,
"loss": 1.244968318939209,
"step": 50
},
{
"epoch": 0.10999083409715857,
"grad_norm": 0.8081830143928528,
"learning_rate": 0.000193980343980344,
"loss": 1.209956455230713,
"step": 60
},
{
"epoch": 0.12832263978001834,
"grad_norm": 0.8808525800704956,
"learning_rate": 0.00019275184275184277,
"loss": 1.1500988960266114,
"step": 70
},
{
"epoch": 0.1466544454628781,
"grad_norm": 0.82117760181427,
"learning_rate": 0.00019152334152334154,
"loss": 1.1469905853271485,
"step": 80
},
{
"epoch": 0.16498625114573787,
"grad_norm": 0.8583332896232605,
"learning_rate": 0.0001902948402948403,
"loss": 1.1948189735412598,
"step": 90
},
{
"epoch": 0.18331805682859761,
"grad_norm": 0.6712405681610107,
"learning_rate": 0.00018906633906633907,
"loss": 1.1391284942626954,
"step": 100
},
{
"epoch": 0.2016498625114574,
"grad_norm": 0.8613548874855042,
"learning_rate": 0.00018783783783783784,
"loss": 1.1040291786193848,
"step": 110
},
{
"epoch": 0.21998166819431714,
"grad_norm": 0.8608964085578918,
"learning_rate": 0.0001866093366093366,
"loss": 1.1578070640563964,
"step": 120
},
{
"epoch": 0.2383134738771769,
"grad_norm": 0.8638626337051392,
"learning_rate": 0.0001853808353808354,
"loss": 1.1500712394714356,
"step": 130
},
{
"epoch": 0.2566452795600367,
"grad_norm": 0.8131710886955261,
"learning_rate": 0.00018415233415233417,
"loss": 1.0960933685302734,
"step": 140
},
{
"epoch": 0.27497708524289644,
"grad_norm": 1.023303508758545,
"learning_rate": 0.00018292383292383292,
"loss": 1.106197452545166,
"step": 150
},
{
"epoch": 0.2933088909257562,
"grad_norm": 0.7958722114562988,
"learning_rate": 0.0001816953316953317,
"loss": 1.1257465362548829,
"step": 160
},
{
"epoch": 0.31164069660861593,
"grad_norm": 0.8529394865036011,
"learning_rate": 0.00018046683046683048,
"loss": 1.062838077545166,
"step": 170
},
{
"epoch": 0.32997250229147573,
"grad_norm": 0.8443934917449951,
"learning_rate": 0.00017923832923832925,
"loss": 1.023012638092041,
"step": 180
},
{
"epoch": 0.3483043079743355,
"grad_norm": 0.8035740852355957,
"learning_rate": 0.00017800982800982802,
"loss": 1.0435395240783691,
"step": 190
},
{
"epoch": 0.36663611365719523,
"grad_norm": 0.8863442540168762,
"learning_rate": 0.00017678132678132678,
"loss": 1.0918319702148438,
"step": 200
},
{
"epoch": 0.384967919340055,
"grad_norm": 0.8198781609535217,
"learning_rate": 0.00017555282555282555,
"loss": 1.0226572036743165,
"step": 210
},
{
"epoch": 0.4032997250229148,
"grad_norm": 0.9801501035690308,
"learning_rate": 0.00017432432432432432,
"loss": 1.0847922325134278,
"step": 220
},
{
"epoch": 0.4216315307057745,
"grad_norm": 0.6737959980964661,
"learning_rate": 0.00017309582309582312,
"loss": 1.0508249282836915,
"step": 230
},
{
"epoch": 0.4399633363886343,
"grad_norm": 0.7932195067405701,
"learning_rate": 0.00017186732186732188,
"loss": 1.042081069946289,
"step": 240
},
{
"epoch": 0.458295142071494,
"grad_norm": 0.864284873008728,
"learning_rate": 0.00017063882063882065,
"loss": 1.0485817909240722,
"step": 250
},
{
"epoch": 0.4766269477543538,
"grad_norm": 0.8705862164497375,
"learning_rate": 0.00016941031941031942,
"loss": 1.0782199859619142,
"step": 260
},
{
"epoch": 0.49495875343721357,
"grad_norm": 0.8180854916572571,
"learning_rate": 0.0001681818181818182,
"loss": 1.075644302368164,
"step": 270
},
{
"epoch": 0.5132905591200734,
"grad_norm": 0.8689812421798706,
"learning_rate": 0.00016695331695331696,
"loss": 1.0431486129760743,
"step": 280
},
{
"epoch": 0.5316223648029331,
"grad_norm": 0.7797616720199585,
"learning_rate": 0.00016572481572481573,
"loss": 1.019082736968994,
"step": 290
},
{
"epoch": 0.5499541704857929,
"grad_norm": 0.9403369426727295,
"learning_rate": 0.0001644963144963145,
"loss": 1.0104355812072754,
"step": 300
},
{
"epoch": 0.5682859761686526,
"grad_norm": 0.8061522245407104,
"learning_rate": 0.0001632678132678133,
"loss": 1.015712833404541,
"step": 310
},
{
"epoch": 0.5866177818515124,
"grad_norm": 1.0389378070831299,
"learning_rate": 0.00016203931203931203,
"loss": 0.983332347869873,
"step": 320
},
{
"epoch": 0.6049495875343721,
"grad_norm": 0.8335319757461548,
"learning_rate": 0.00016081081081081083,
"loss": 1.0035177230834962,
"step": 330
},
{
"epoch": 0.6232813932172319,
"grad_norm": 0.8039399981498718,
"learning_rate": 0.0001595823095823096,
"loss": 1.0274381637573242,
"step": 340
},
{
"epoch": 0.6416131989000916,
"grad_norm": 0.777919352054596,
"learning_rate": 0.00015835380835380836,
"loss": 1.0467321395874023,
"step": 350
},
{
"epoch": 0.6599450045829515,
"grad_norm": 0.8876609802246094,
"learning_rate": 0.00015712530712530713,
"loss": 0.9848588943481446,
"step": 360
},
{
"epoch": 0.6782768102658112,
"grad_norm": 0.8413349986076355,
"learning_rate": 0.0001558968058968059,
"loss": 1.0367840766906737,
"step": 370
},
{
"epoch": 0.696608615948671,
"grad_norm": 0.9921192526817322,
"learning_rate": 0.0001546683046683047,
"loss": 1.0018555641174316,
"step": 380
},
{
"epoch": 0.7149404216315307,
"grad_norm": 0.8272864818572998,
"learning_rate": 0.00015343980343980344,
"loss": 1.0093633651733398,
"step": 390
},
{
"epoch": 0.7332722273143905,
"grad_norm": 0.7949515581130981,
"learning_rate": 0.00015221130221130223,
"loss": 1.0220769882202148,
"step": 400
},
{
"epoch": 0.7516040329972502,
"grad_norm": 0.8337849378585815,
"learning_rate": 0.000150982800982801,
"loss": 0.9723053932189941,
"step": 410
},
{
"epoch": 0.76993583868011,
"grad_norm": 0.9521737694740295,
"learning_rate": 0.00014975429975429974,
"loss": 1.0630863189697266,
"step": 420
},
{
"epoch": 0.7882676443629697,
"grad_norm": 0.8320823311805725,
"learning_rate": 0.00014852579852579854,
"loss": 0.9916687965393066,
"step": 430
},
{
"epoch": 0.8065994500458296,
"grad_norm": 0.903413712978363,
"learning_rate": 0.0001472972972972973,
"loss": 0.9519875526428223,
"step": 440
},
{
"epoch": 0.8249312557286893,
"grad_norm": 0.8783673048019409,
"learning_rate": 0.00014606879606879607,
"loss": 1.0056891441345215,
"step": 450
},
{
"epoch": 0.843263061411549,
"grad_norm": 0.8581491708755493,
"learning_rate": 0.00014484029484029484,
"loss": 1.0211298942565918,
"step": 460
},
{
"epoch": 0.8615948670944088,
"grad_norm": 0.7977339029312134,
"learning_rate": 0.0001436117936117936,
"loss": 0.9528703689575195,
"step": 470
},
{
"epoch": 0.8799266727772685,
"grad_norm": 0.8142527937889099,
"learning_rate": 0.0001423832923832924,
"loss": 1.0111416816711425,
"step": 480
},
{
"epoch": 0.8982584784601283,
"grad_norm": 0.865929126739502,
"learning_rate": 0.00014115479115479115,
"loss": 0.978369140625,
"step": 490
},
{
"epoch": 0.916590284142988,
"grad_norm": 0.7955005764961243,
"learning_rate": 0.00013992628992628994,
"loss": 0.998965835571289,
"step": 500
},
{
"epoch": 0.9349220898258478,
"grad_norm": 0.7812423706054688,
"learning_rate": 0.0001386977886977887,
"loss": 0.9544276237487793,
"step": 510
},
{
"epoch": 0.9532538955087076,
"grad_norm": 0.817484974861145,
"learning_rate": 0.00013746928746928748,
"loss": 0.9699355125427246,
"step": 520
},
{
"epoch": 0.9715857011915674,
"grad_norm": 0.875234842300415,
"learning_rate": 0.00013624078624078625,
"loss": 0.9715826034545898,
"step": 530
},
{
"epoch": 0.9899175068744271,
"grad_norm": 0.7700145244598389,
"learning_rate": 0.00013501228501228501,
"loss": 0.9811925888061523,
"step": 540
},
{
"epoch": 1.0073327222731439,
"grad_norm": 0.7260869145393372,
"learning_rate": 0.0001337837837837838,
"loss": 0.882848072052002,
"step": 550
},
{
"epoch": 1.0256645279560037,
"grad_norm": 0.7263541221618652,
"learning_rate": 0.00013255528255528255,
"loss": 0.8015734672546386,
"step": 560
},
{
"epoch": 1.0439963336388633,
"grad_norm": 0.8721809387207031,
"learning_rate": 0.00013132678132678135,
"loss": 0.778080940246582,
"step": 570
},
{
"epoch": 1.0623281393217232,
"grad_norm": 0.8094732165336609,
"learning_rate": 0.00013009828009828011,
"loss": 0.7774394989013672,
"step": 580
},
{
"epoch": 1.0806599450045828,
"grad_norm": 0.8383634686470032,
"learning_rate": 0.00012886977886977886,
"loss": 0.7659544944763184,
"step": 590
},
{
"epoch": 1.0989917506874427,
"grad_norm": 0.9429551959037781,
"learning_rate": 0.00012764127764127765,
"loss": 0.7822256565093995,
"step": 600
},
{
"epoch": 1.1173235563703026,
"grad_norm": 0.8432884812355042,
"learning_rate": 0.00012641277641277642,
"loss": 0.7877971649169921,
"step": 610
},
{
"epoch": 1.1356553620531622,
"grad_norm": 0.9461238384246826,
"learning_rate": 0.0001251842751842752,
"loss": 0.8412753105163574,
"step": 620
},
{
"epoch": 1.153987167736022,
"grad_norm": 0.9495576620101929,
"learning_rate": 0.00012395577395577396,
"loss": 0.7389075279235839,
"step": 630
},
{
"epoch": 1.1723189734188817,
"grad_norm": 0.8137982487678528,
"learning_rate": 0.00012272727272727272,
"loss": 0.7511651992797852,
"step": 640
},
{
"epoch": 1.1906507791017416,
"grad_norm": 0.9718158841133118,
"learning_rate": 0.00012149877149877152,
"loss": 0.7982583999633789,
"step": 650
},
{
"epoch": 1.2089825847846012,
"grad_norm": 1.0837777853012085,
"learning_rate": 0.00012027027027027027,
"loss": 0.762714433670044,
"step": 660
},
{
"epoch": 1.227314390467461,
"grad_norm": 0.9882314801216125,
"learning_rate": 0.00011904176904176904,
"loss": 0.7749518871307373,
"step": 670
},
{
"epoch": 1.2456461961503207,
"grad_norm": 0.9463419914245605,
"learning_rate": 0.00011781326781326782,
"loss": 0.7641645908355713,
"step": 680
},
{
"epoch": 1.2639780018331805,
"grad_norm": 0.9794511198997498,
"learning_rate": 0.00011658476658476658,
"loss": 0.8050010681152344,
"step": 690
},
{
"epoch": 1.2823098075160404,
"grad_norm": 1.1002216339111328,
"learning_rate": 0.00011535626535626536,
"loss": 0.793759822845459,
"step": 700
},
{
"epoch": 1.3006416131989,
"grad_norm": 0.9648078083992004,
"learning_rate": 0.00011412776412776414,
"loss": 0.7668623447418212,
"step": 710
},
{
"epoch": 1.31897341888176,
"grad_norm": 1.0727074146270752,
"learning_rate": 0.00011289926289926291,
"loss": 0.8029165267944336,
"step": 720
},
{
"epoch": 1.3373052245646195,
"grad_norm": 0.9617047905921936,
"learning_rate": 0.00011167076167076167,
"loss": 0.7798116683959961,
"step": 730
},
{
"epoch": 1.3556370302474794,
"grad_norm": 0.8710028529167175,
"learning_rate": 0.00011044226044226045,
"loss": 0.7962553977966309,
"step": 740
},
{
"epoch": 1.3739688359303392,
"grad_norm": 0.8409777283668518,
"learning_rate": 0.00010921375921375923,
"loss": 0.7403414726257325,
"step": 750
},
{
"epoch": 1.3923006416131989,
"grad_norm": 1.029362440109253,
"learning_rate": 0.00010798525798525798,
"loss": 0.7815125465393067,
"step": 760
},
{
"epoch": 1.4106324472960587,
"grad_norm": 0.9566736221313477,
"learning_rate": 0.00010675675675675677,
"loss": 0.7839052200317382,
"step": 770
},
{
"epoch": 1.4289642529789184,
"grad_norm": 0.975339949131012,
"learning_rate": 0.00010552825552825553,
"loss": 0.749812650680542,
"step": 780
},
{
"epoch": 1.4472960586617782,
"grad_norm": 1.1521857976913452,
"learning_rate": 0.00010429975429975432,
"loss": 0.7797944068908691,
"step": 790
},
{
"epoch": 1.4656278643446379,
"grad_norm": 0.8301038146018982,
"learning_rate": 0.00010307125307125307,
"loss": 0.7280281543731689,
"step": 800
},
{
"epoch": 1.4839596700274977,
"grad_norm": 0.9730615615844727,
"learning_rate": 0.00010184275184275185,
"loss": 0.7437058448791504,
"step": 810
},
{
"epoch": 1.5022914757103574,
"grad_norm": 1.0270700454711914,
"learning_rate": 0.00010061425061425062,
"loss": 0.7848101615905761,
"step": 820
},
{
"epoch": 1.5206232813932172,
"grad_norm": 1.2335196733474731,
"learning_rate": 9.938574938574939e-05,
"loss": 0.7896716117858886,
"step": 830
},
{
"epoch": 1.538955087076077,
"grad_norm": 0.968611478805542,
"learning_rate": 9.815724815724816e-05,
"loss": 0.7918240070343018,
"step": 840
},
{
"epoch": 1.5572868927589367,
"grad_norm": 0.9463298320770264,
"learning_rate": 9.692874692874694e-05,
"loss": 0.7453035354614258,
"step": 850
},
{
"epoch": 1.5756186984417964,
"grad_norm": 1.004184603691101,
"learning_rate": 9.570024570024571e-05,
"loss": 0.8125950813293457,
"step": 860
},
{
"epoch": 1.5939505041246562,
"grad_norm": 1.1150691509246826,
"learning_rate": 9.447174447174448e-05,
"loss": 0.7995445251464843,
"step": 870
},
{
"epoch": 1.612282309807516,
"grad_norm": 1.060056447982788,
"learning_rate": 9.324324324324324e-05,
"loss": 0.7746751785278321,
"step": 880
},
{
"epoch": 1.630614115490376,
"grad_norm": 1.0525883436203003,
"learning_rate": 9.201474201474201e-05,
"loss": 0.7861367225646972,
"step": 890
},
{
"epoch": 1.6489459211732356,
"grad_norm": 0.9495214223861694,
"learning_rate": 9.07862407862408e-05,
"loss": 0.8055791854858398,
"step": 900
},
{
"epoch": 1.6672777268560952,
"grad_norm": 0.8876036405563354,
"learning_rate": 8.955773955773956e-05,
"loss": 0.736152982711792,
"step": 910
},
{
"epoch": 1.685609532538955,
"grad_norm": 1.0228347778320312,
"learning_rate": 8.832923832923833e-05,
"loss": 0.7859257698059082,
"step": 920
},
{
"epoch": 1.703941338221815,
"grad_norm": 1.2196885347366333,
"learning_rate": 8.710073710073711e-05,
"loss": 0.7442365169525147,
"step": 930
},
{
"epoch": 1.7222731439046746,
"grad_norm": 1.1201367378234863,
"learning_rate": 8.587223587223587e-05,
"loss": 0.7933924674987793,
"step": 940
},
{
"epoch": 1.7406049495875344,
"grad_norm": 1.0457044839859009,
"learning_rate": 8.464373464373465e-05,
"loss": 0.7594408512115478,
"step": 950
},
{
"epoch": 1.758936755270394,
"grad_norm": 1.2219468355178833,
"learning_rate": 8.341523341523342e-05,
"loss": 0.725389051437378,
"step": 960
},
{
"epoch": 1.777268560953254,
"grad_norm": 1.0098403692245483,
"learning_rate": 8.21867321867322e-05,
"loss": 0.7753002166748046,
"step": 970
},
{
"epoch": 1.7956003666361138,
"grad_norm": 1.020544409751892,
"learning_rate": 8.095823095823097e-05,
"loss": 0.7580110549926757,
"step": 980
},
{
"epoch": 1.8139321723189734,
"grad_norm": 0.9121679067611694,
"learning_rate": 7.972972972972974e-05,
"loss": 0.7329069614410401,
"step": 990
},
{
"epoch": 1.832263978001833,
"grad_norm": 1.1305643320083618,
"learning_rate": 7.85012285012285e-05,
"loss": 0.7436663150787354,
"step": 1000
},
{
"epoch": 1.850595783684693,
"grad_norm": 0.9970649480819702,
"learning_rate": 7.727272727272727e-05,
"loss": 0.7684538841247559,
"step": 1010
},
{
"epoch": 1.8689275893675528,
"grad_norm": 1.0161981582641602,
"learning_rate": 7.604422604422605e-05,
"loss": 0.7405171394348145,
"step": 1020
},
{
"epoch": 1.8872593950504126,
"grad_norm": 1.3399347066879272,
"learning_rate": 7.481572481572482e-05,
"loss": 0.780091667175293,
"step": 1030
},
{
"epoch": 1.9055912007332723,
"grad_norm": 1.2579443454742432,
"learning_rate": 7.358722358722359e-05,
"loss": 0.6868968486785889,
"step": 1040
},
{
"epoch": 1.923923006416132,
"grad_norm": 1.0092531442642212,
"learning_rate": 7.235872235872236e-05,
"loss": 0.742798137664795,
"step": 1050
},
{
"epoch": 1.9422548120989918,
"grad_norm": 1.121690273284912,
"learning_rate": 7.113022113022113e-05,
"loss": 0.8035343170166016,
"step": 1060
},
{
"epoch": 1.9605866177818516,
"grad_norm": 1.0780940055847168,
"learning_rate": 6.990171990171991e-05,
"loss": 0.7256640911102294,
"step": 1070
},
{
"epoch": 1.9789184234647113,
"grad_norm": 1.0335768461227417,
"learning_rate": 6.867321867321868e-05,
"loss": 0.751814889907837,
"step": 1080
},
{
"epoch": 1.9972502291475709,
"grad_norm": 1.0326813459396362,
"learning_rate": 6.744471744471746e-05,
"loss": 0.781493854522705,
"step": 1090
},
{
"epoch": 2.0146654445462877,
"grad_norm": 0.9365840554237366,
"learning_rate": 6.621621621621621e-05,
"loss": 0.6172435760498047,
"step": 1100
},
{
"epoch": 2.0329972502291476,
"grad_norm": 1.0775729417800903,
"learning_rate": 6.498771498771498e-05,
"loss": 0.5551021575927735,
"step": 1110
},
{
"epoch": 2.0513290559120074,
"grad_norm": 1.1233711242675781,
"learning_rate": 6.375921375921376e-05,
"loss": 0.530482006072998,
"step": 1120
},
{
"epoch": 2.0696608615948673,
"grad_norm": 0.9685810208320618,
"learning_rate": 6.253071253071253e-05,
"loss": 0.5284864902496338,
"step": 1130
},
{
"epoch": 2.0879926672777267,
"grad_norm": 1.3673559427261353,
"learning_rate": 6.130221130221131e-05,
"loss": 0.5300433158874511,
"step": 1140
},
{
"epoch": 2.1063244729605866,
"grad_norm": 1.291156530380249,
"learning_rate": 6.0073710073710075e-05,
"loss": 0.5388914585113526,
"step": 1150
},
{
"epoch": 2.1246562786434464,
"grad_norm": 1.0860686302185059,
"learning_rate": 5.8845208845208844e-05,
"loss": 0.5599504947662354,
"step": 1160
},
{
"epoch": 2.1429880843263063,
"grad_norm": 1.2297484874725342,
"learning_rate": 5.761670761670762e-05,
"loss": 0.5693662643432618,
"step": 1170
},
{
"epoch": 2.1613198900091657,
"grad_norm": 1.3128403425216675,
"learning_rate": 5.638820638820639e-05,
"loss": 0.5087790012359619,
"step": 1180
},
{
"epoch": 2.1796516956920255,
"grad_norm": 1.47864830493927,
"learning_rate": 5.515970515970517e-05,
"loss": 0.5520669460296631,
"step": 1190
},
{
"epoch": 2.1979835013748854,
"grad_norm": 1.3533881902694702,
"learning_rate": 5.393120393120393e-05,
"loss": 0.5194924354553223,
"step": 1200
},
{
"epoch": 2.2163153070577453,
"grad_norm": 1.0729988813400269,
"learning_rate": 5.27027027027027e-05,
"loss": 0.5386343955993652,
"step": 1210
},
{
"epoch": 2.234647112740605,
"grad_norm": 1.1851814985275269,
"learning_rate": 5.147420147420148e-05,
"loss": 0.5398934364318848,
"step": 1220
},
{
"epoch": 2.2529789184234645,
"grad_norm": 1.306754469871521,
"learning_rate": 5.024570024570024e-05,
"loss": 0.5412076473236084,
"step": 1230
},
{
"epoch": 2.2713107241063244,
"grad_norm": 1.2561992406845093,
"learning_rate": 4.901719901719902e-05,
"loss": 0.555613899230957,
"step": 1240
},
{
"epoch": 2.2896425297891843,
"grad_norm": 1.4935739040374756,
"learning_rate": 4.778869778869779e-05,
"loss": 0.560833215713501,
"step": 1250
},
{
"epoch": 2.307974335472044,
"grad_norm": 1.2064818143844604,
"learning_rate": 4.656019656019656e-05,
"loss": 0.5313505172729492,
"step": 1260
},
{
"epoch": 2.3263061411549035,
"grad_norm": 1.2250595092773438,
"learning_rate": 4.5331695331695335e-05,
"loss": 0.5530914306640625,
"step": 1270
},
{
"epoch": 2.3446379468377634,
"grad_norm": 1.249531865119934,
"learning_rate": 4.4103194103194104e-05,
"loss": 0.5281160831451416,
"step": 1280
},
{
"epoch": 2.3629697525206232,
"grad_norm": 1.1765642166137695,
"learning_rate": 4.287469287469288e-05,
"loss": 0.5416937351226807,
"step": 1290
},
{
"epoch": 2.381301558203483,
"grad_norm": 1.2973071336746216,
"learning_rate": 4.164619164619165e-05,
"loss": 0.5341888427734375,
"step": 1300
},
{
"epoch": 2.399633363886343,
"grad_norm": 1.3533828258514404,
"learning_rate": 4.0417690417690415e-05,
"loss": 0.5386404514312744,
"step": 1310
},
{
"epoch": 2.4179651695692024,
"grad_norm": 1.1323643922805786,
"learning_rate": 3.918918918918919e-05,
"loss": 0.5549521446228027,
"step": 1320
},
{
"epoch": 2.4362969752520622,
"grad_norm": 1.0967226028442383,
"learning_rate": 3.7960687960687965e-05,
"loss": 0.5668260097503662,
"step": 1330
},
{
"epoch": 2.454628780934922,
"grad_norm": 1.3874995708465576,
"learning_rate": 3.6732186732186734e-05,
"loss": 0.5530946254730225,
"step": 1340
},
{
"epoch": 2.472960586617782,
"grad_norm": 1.3139115571975708,
"learning_rate": 3.550368550368551e-05,
"loss": 0.544743013381958,
"step": 1350
},
{
"epoch": 2.4912923923006414,
"grad_norm": 1.3629847764968872,
"learning_rate": 3.427518427518428e-05,
"loss": 0.5550421714782715,
"step": 1360
},
{
"epoch": 2.5096241979835012,
"grad_norm": 1.3279292583465576,
"learning_rate": 3.3046683046683045e-05,
"loss": 0.5361227035522461,
"step": 1370
},
{
"epoch": 2.527956003666361,
"grad_norm": 1.3736717700958252,
"learning_rate": 3.181818181818182e-05,
"loss": 0.5256178855895997,
"step": 1380
},
{
"epoch": 2.546287809349221,
"grad_norm": 1.2405906915664673,
"learning_rate": 3.058968058968059e-05,
"loss": 0.5551014900207519,
"step": 1390
},
{
"epoch": 2.564619615032081,
"grad_norm": 1.2711869478225708,
"learning_rate": 2.9361179361179364e-05,
"loss": 0.549025011062622,
"step": 1400
},
{
"epoch": 2.5829514207149407,
"grad_norm": 1.1510083675384521,
"learning_rate": 2.8132678132678135e-05,
"loss": 0.5108777046203613,
"step": 1410
},
{
"epoch": 2.6012832263978,
"grad_norm": 1.2585129737854004,
"learning_rate": 2.6904176904176904e-05,
"loss": 0.545832633972168,
"step": 1420
},
{
"epoch": 2.61961503208066,
"grad_norm": 1.2926920652389526,
"learning_rate": 2.5675675675675675e-05,
"loss": 0.531532621383667,
"step": 1430
},
{
"epoch": 2.63794683776352,
"grad_norm": 1.1816222667694092,
"learning_rate": 2.4447174447174447e-05,
"loss": 0.4899789333343506,
"step": 1440
},
{
"epoch": 2.656278643446379,
"grad_norm": 1.3441561460494995,
"learning_rate": 2.3218673218673222e-05,
"loss": 0.5534125804901123,
"step": 1450
},
{
"epoch": 2.674610449129239,
"grad_norm": 1.5367056131362915,
"learning_rate": 2.199017199017199e-05,
"loss": 0.5469470500946045,
"step": 1460
},
{
"epoch": 2.692942254812099,
"grad_norm": 1.292490005493164,
"learning_rate": 2.0761670761670762e-05,
"loss": 0.5418555259704589,
"step": 1470
},
{
"epoch": 2.711274060494959,
"grad_norm": 1.2457395792007446,
"learning_rate": 1.9533169533169534e-05,
"loss": 0.5272214889526368,
"step": 1480
},
{
"epoch": 2.7296058661778186,
"grad_norm": 1.3204960823059082,
"learning_rate": 1.8304668304668305e-05,
"loss": 0.5287877559661865,
"step": 1490
},
{
"epoch": 2.7479376718606785,
"grad_norm": 1.0838243961334229,
"learning_rate": 1.7076167076167077e-05,
"loss": 0.49266462326049804,
"step": 1500
},
{
"epoch": 2.766269477543538,
"grad_norm": 1.3524028062820435,
"learning_rate": 1.584766584766585e-05,
"loss": 0.534552526473999,
"step": 1510
},
{
"epoch": 2.7846012832263978,
"grad_norm": 1.2434245347976685,
"learning_rate": 1.4619164619164619e-05,
"loss": 0.5303339004516602,
"step": 1520
},
{
"epoch": 2.8029330889092576,
"grad_norm": 1.4432783126831055,
"learning_rate": 1.339066339066339e-05,
"loss": 0.5432450771331787,
"step": 1530
},
{
"epoch": 2.8212648945921175,
"grad_norm": 1.372916340827942,
"learning_rate": 1.2162162162162164e-05,
"loss": 0.5449412345886231,
"step": 1540
},
{
"epoch": 2.839596700274977,
"grad_norm": 1.3015090227127075,
"learning_rate": 1.0933660933660935e-05,
"loss": 0.5642722129821778,
"step": 1550
},
{
"epoch": 2.8579285059578368,
"grad_norm": 1.434592366218567,
"learning_rate": 9.705159705159705e-06,
"loss": 0.5404855728149414,
"step": 1560
},
{
"epoch": 2.8762603116406966,
"grad_norm": 2.344008445739746,
"learning_rate": 8.476658476658477e-06,
"loss": 0.5066645622253418,
"step": 1570
},
{
"epoch": 2.8945921173235565,
"grad_norm": 1.0202550888061523,
"learning_rate": 7.2481572481572485e-06,
"loss": 0.5348256587982178,
"step": 1580
},
{
"epoch": 2.9129239230064163,
"grad_norm": 1.2149248123168945,
"learning_rate": 6.019656019656019e-06,
"loss": 0.4903108596801758,
"step": 1590
},
{
"epoch": 2.9312557286892758,
"grad_norm": 1.5240399837493896,
"learning_rate": 4.791154791154792e-06,
"loss": 0.5296618461608886,
"step": 1600
},
{
"epoch": 2.9495875343721356,
"grad_norm": 1.4258157014846802,
"learning_rate": 3.562653562653563e-06,
"loss": 0.49115524291992185,
"step": 1610
},
{
"epoch": 2.9679193400549955,
"grad_norm": 1.1255574226379395,
"learning_rate": 2.3341523341523343e-06,
"loss": 0.5161442279815673,
"step": 1620
},
{
"epoch": 2.9862511457378553,
"grad_norm": 1.365963101387024,
"learning_rate": 1.1056511056511056e-06,
"loss": 0.5332399368286133,
"step": 1630
}
],
"logging_steps": 10,
"max_steps": 1638,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.469692570628813e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}