{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9991487623003845,
"eval_steps": 500,
"global_step": 2751,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010895842555075079,
"grad_norm": 2.5097851753234863,
"learning_rate": 1.9927299163940386e-05,
"loss": 0.51,
"step": 10
},
{
"epoch": 0.021791685110150158,
"grad_norm": 1.4778149127960205,
"learning_rate": 1.985459832788077e-05,
"loss": 0.0818,
"step": 20
},
{
"epoch": 0.03268752766522524,
"grad_norm": 1.0869137048721313,
"learning_rate": 1.978189749182116e-05,
"loss": 0.1052,
"step": 30
},
{
"epoch": 0.043583370220300316,
"grad_norm": 1.0245683193206787,
"learning_rate": 1.970919665576154e-05,
"loss": 0.0508,
"step": 40
},
{
"epoch": 0.054479212775375395,
"grad_norm": 3.580275535583496,
"learning_rate": 1.963649581970193e-05,
"loss": 0.0336,
"step": 50
},
{
"epoch": 0.06537505533045047,
"grad_norm": 13.708664894104004,
"learning_rate": 1.9563794983642313e-05,
"loss": 0.0247,
"step": 60
},
{
"epoch": 0.07627089788552556,
"grad_norm": 2.38472843170166,
"learning_rate": 1.9491094147582698e-05,
"loss": 0.0474,
"step": 70
},
{
"epoch": 0.08716674044060063,
"grad_norm": 2.3008601665496826,
"learning_rate": 1.9418393311523086e-05,
"loss": 0.0505,
"step": 80
},
{
"epoch": 0.09806258299567572,
"grad_norm": 0.9727557301521301,
"learning_rate": 1.9345692475463468e-05,
"loss": 0.0291,
"step": 90
},
{
"epoch": 0.10895842555075079,
"grad_norm": 0.017909426242113113,
"learning_rate": 1.9272991639403856e-05,
"loss": 0.0232,
"step": 100
},
{
"epoch": 0.11985426810582588,
"grad_norm": 0.13610009849071503,
"learning_rate": 1.920029080334424e-05,
"loss": 0.0679,
"step": 110
},
{
"epoch": 0.13075011066090095,
"grad_norm": 0.09146017581224442,
"learning_rate": 1.9127589967284625e-05,
"loss": 0.062,
"step": 120
},
{
"epoch": 0.14164595321597603,
"grad_norm": 3.696361541748047,
"learning_rate": 1.9054889131225013e-05,
"loss": 0.0515,
"step": 130
},
{
"epoch": 0.15254179577105112,
"grad_norm": 0.07528296858072281,
"learning_rate": 1.8982188295165395e-05,
"loss": 0.008,
"step": 140
},
{
"epoch": 0.16343763832612618,
"grad_norm": 0.3899368345737457,
"learning_rate": 1.8909487459105783e-05,
"loss": 0.0309,
"step": 150
},
{
"epoch": 0.17433348088120126,
"grad_norm": 0.5960955619812012,
"learning_rate": 1.8836786623046168e-05,
"loss": 0.0415,
"step": 160
},
{
"epoch": 0.18522932343627635,
"grad_norm": 0.027237065136432648,
"learning_rate": 1.8764085786986552e-05,
"loss": 0.0257,
"step": 170
},
{
"epoch": 0.19612516599135144,
"grad_norm": 6.851381778717041,
"learning_rate": 1.8691384950926937e-05,
"loss": 0.0585,
"step": 180
},
{
"epoch": 0.2070210085464265,
"grad_norm": 1.518951416015625,
"learning_rate": 1.8618684114867322e-05,
"loss": 0.0173,
"step": 190
},
{
"epoch": 0.21791685110150158,
"grad_norm": 16.334980010986328,
"learning_rate": 1.854598327880771e-05,
"loss": 0.0892,
"step": 200
},
{
"epoch": 0.22881269365657667,
"grad_norm": 0.6327227354049683,
"learning_rate": 1.847328244274809e-05,
"loss": 0.0391,
"step": 210
},
{
"epoch": 0.23970853621165175,
"grad_norm": 0.026528311893343925,
"learning_rate": 1.840058160668848e-05,
"loss": 0.0316,
"step": 220
},
{
"epoch": 0.2506043787667268,
"grad_norm": 0.15849582850933075,
"learning_rate": 1.8327880770628864e-05,
"loss": 0.0306,
"step": 230
},
{
"epoch": 0.2615002213218019,
"grad_norm": 8.983612060546875,
"learning_rate": 1.825517993456925e-05,
"loss": 0.0252,
"step": 240
},
{
"epoch": 0.272396063876877,
"grad_norm": 1.4300966262817383,
"learning_rate": 1.8182479098509634e-05,
"loss": 0.0307,
"step": 250
},
{
"epoch": 0.28329190643195207,
"grad_norm": 0.19248631596565247,
"learning_rate": 1.810977826245002e-05,
"loss": 0.034,
"step": 260
},
{
"epoch": 0.29418774898702715,
"grad_norm": 0.0807420164346695,
"learning_rate": 1.8037077426390407e-05,
"loss": 0.0218,
"step": 270
},
{
"epoch": 0.30508359154210224,
"grad_norm": 0.04030030593276024,
"learning_rate": 1.796437659033079e-05,
"loss": 0.0164,
"step": 280
},
{
"epoch": 0.31597943409717727,
"grad_norm": 0.03919893503189087,
"learning_rate": 1.7891675754271176e-05,
"loss": 0.0207,
"step": 290
},
{
"epoch": 0.32687527665225236,
"grad_norm": 0.9118878245353699,
"learning_rate": 1.781897491821156e-05,
"loss": 0.0254,
"step": 300
},
{
"epoch": 0.33777111920732744,
"grad_norm": 0.09405702352523804,
"learning_rate": 1.7746274082151945e-05,
"loss": 0.0072,
"step": 310
},
{
"epoch": 0.3486669617624025,
"grad_norm": 1.061004638671875,
"learning_rate": 1.7673573246092334e-05,
"loss": 0.0178,
"step": 320
},
{
"epoch": 0.3595628043174776,
"grad_norm": 0.35136711597442627,
"learning_rate": 1.7600872410032715e-05,
"loss": 0.0268,
"step": 330
},
{
"epoch": 0.3704586468725527,
"grad_norm": 0.33769288659095764,
"learning_rate": 1.7528171573973103e-05,
"loss": 0.0383,
"step": 340
},
{
"epoch": 0.3813544894276278,
"grad_norm": 1.448626160621643,
"learning_rate": 1.7455470737913488e-05,
"loss": 0.0214,
"step": 350
},
{
"epoch": 0.39225033198270287,
"grad_norm": 1.096685767173767,
"learning_rate": 1.7382769901853873e-05,
"loss": 0.0442,
"step": 360
},
{
"epoch": 0.4031461745377779,
"grad_norm": 0.08582064509391785,
"learning_rate": 1.7310069065794257e-05,
"loss": 0.041,
"step": 370
},
{
"epoch": 0.414042017092853,
"grad_norm": 0.5726041793823242,
"learning_rate": 1.7237368229734642e-05,
"loss": 0.02,
"step": 380
},
{
"epoch": 0.4249378596479281,
"grad_norm": 0.27912572026252747,
"learning_rate": 1.716466739367503e-05,
"loss": 0.033,
"step": 390
},
{
"epoch": 0.43583370220300316,
"grad_norm": 0.40194639563560486,
"learning_rate": 1.7091966557615415e-05,
"loss": 0.0297,
"step": 400
},
{
"epoch": 0.44672954475807825,
"grad_norm": 0.4923015832901001,
"learning_rate": 1.70192657215558e-05,
"loss": 0.0473,
"step": 410
},
{
"epoch": 0.45762538731315333,
"grad_norm": 0.4864579439163208,
"learning_rate": 1.6946564885496184e-05,
"loss": 0.0335,
"step": 420
},
{
"epoch": 0.4685212298682284,
"grad_norm": 0.0577218122780323,
"learning_rate": 1.687386404943657e-05,
"loss": 0.0267,
"step": 430
},
{
"epoch": 0.4794170724233035,
"grad_norm": 0.026588434353470802,
"learning_rate": 1.6801163213376954e-05,
"loss": 0.0242,
"step": 440
},
{
"epoch": 0.4903129149783786,
"grad_norm": 1.106031060218811,
"learning_rate": 1.6728462377317342e-05,
"loss": 0.0412,
"step": 450
},
{
"epoch": 0.5012087575334536,
"grad_norm": 2.185438394546509,
"learning_rate": 1.6655761541257727e-05,
"loss": 0.0168,
"step": 460
},
{
"epoch": 0.5121046000885288,
"grad_norm": 0.2645202577114105,
"learning_rate": 1.658306070519811e-05,
"loss": 0.0225,
"step": 470
},
{
"epoch": 0.5230004426436038,
"grad_norm": 0.26281026005744934,
"learning_rate": 1.6510359869138496e-05,
"loss": 0.0225,
"step": 480
},
{
"epoch": 0.5338962851986789,
"grad_norm": 0.09611400961875916,
"learning_rate": 1.643765903307888e-05,
"loss": 0.0204,
"step": 490
},
{
"epoch": 0.544792127753754,
"grad_norm": 0.2964985966682434,
"learning_rate": 1.6364958197019266e-05,
"loss": 0.0192,
"step": 500
},
{
"epoch": 0.555687970308829,
"grad_norm": 3.2991862297058105,
"learning_rate": 1.629225736095965e-05,
"loss": 0.0395,
"step": 510
},
{
"epoch": 0.5665838128639041,
"grad_norm": 0.9299785494804382,
"learning_rate": 1.621955652490004e-05,
"loss": 0.0213,
"step": 520
},
{
"epoch": 0.5774796554189792,
"grad_norm": 1.7656854391098022,
"learning_rate": 1.6146855688840423e-05,
"loss": 0.0293,
"step": 530
},
{
"epoch": 0.5883754979740543,
"grad_norm": 0.052940454334020615,
"learning_rate": 1.6074154852780808e-05,
"loss": 0.0349,
"step": 540
},
{
"epoch": 0.5992713405291293,
"grad_norm": 0.6700181365013123,
"learning_rate": 1.6001454016721193e-05,
"loss": 0.0098,
"step": 550
},
{
"epoch": 0.6101671830842045,
"grad_norm": 1.4992352724075317,
"learning_rate": 1.5928753180661577e-05,
"loss": 0.0209,
"step": 560
},
{
"epoch": 0.6210630256392795,
"grad_norm": 0.6882705688476562,
"learning_rate": 1.5856052344601966e-05,
"loss": 0.0208,
"step": 570
},
{
"epoch": 0.6319588681943545,
"grad_norm": 0.35566991567611694,
"learning_rate": 1.578335150854235e-05,
"loss": 0.0157,
"step": 580
},
{
"epoch": 0.6428547107494297,
"grad_norm": 0.1365765929222107,
"learning_rate": 1.5710650672482735e-05,
"loss": 0.0207,
"step": 590
},
{
"epoch": 0.6537505533045047,
"grad_norm": 0.010805984027683735,
"learning_rate": 1.563794983642312e-05,
"loss": 0.0386,
"step": 600
},
{
"epoch": 0.6646463958595799,
"grad_norm": 0.33677366375923157,
"learning_rate": 1.5565249000363505e-05,
"loss": 0.0178,
"step": 610
},
{
"epoch": 0.6755422384146549,
"grad_norm": 0.023768046870827675,
"learning_rate": 1.5492548164303893e-05,
"loss": 0.0115,
"step": 620
},
{
"epoch": 0.68643808096973,
"grad_norm": 1.271041989326477,
"learning_rate": 1.5419847328244274e-05,
"loss": 0.0335,
"step": 630
},
{
"epoch": 0.697333923524805,
"grad_norm": 0.39303043484687805,
"learning_rate": 1.5347146492184662e-05,
"loss": 0.0456,
"step": 640
},
{
"epoch": 0.7082297660798802,
"grad_norm": 1.5450124740600586,
"learning_rate": 1.5274445656125047e-05,
"loss": 0.0206,
"step": 650
},
{
"epoch": 0.7191256086349552,
"grad_norm": 0.12599903345108032,
"learning_rate": 1.5201744820065432e-05,
"loss": 0.0125,
"step": 660
},
{
"epoch": 0.7300214511900303,
"grad_norm": 0.03158240765333176,
"learning_rate": 1.5129043984005818e-05,
"loss": 0.0019,
"step": 670
},
{
"epoch": 0.7409172937451054,
"grad_norm": 1.2820944786071777,
"learning_rate": 1.5056343147946201e-05,
"loss": 0.0132,
"step": 680
},
{
"epoch": 0.7518131363001804,
"grad_norm": 0.4018807113170624,
"learning_rate": 1.4983642311886588e-05,
"loss": 0.0274,
"step": 690
},
{
"epoch": 0.7627089788552556,
"grad_norm": 0.7147946953773499,
"learning_rate": 1.4910941475826972e-05,
"loss": 0.0207,
"step": 700
},
{
"epoch": 0.7736048214103306,
"grad_norm": 1.3514039516448975,
"learning_rate": 1.4838240639767359e-05,
"loss": 0.0088,
"step": 710
},
{
"epoch": 0.7845006639654057,
"grad_norm": 0.10958287864923477,
"learning_rate": 1.4765539803707745e-05,
"loss": 0.0054,
"step": 720
},
{
"epoch": 0.7953965065204808,
"grad_norm": 0.12291970103979111,
"learning_rate": 1.4692838967648128e-05,
"loss": 0.0154,
"step": 730
},
{
"epoch": 0.8062923490755558,
"grad_norm": 0.056142911314964294,
"learning_rate": 1.4620138131588515e-05,
"loss": 0.0214,
"step": 740
},
{
"epoch": 0.817188191630631,
"grad_norm": 0.08367596566677094,
"learning_rate": 1.45474372955289e-05,
"loss": 0.0074,
"step": 750
},
{
"epoch": 0.828084034185706,
"grad_norm": 0.8847033381462097,
"learning_rate": 1.4474736459469286e-05,
"loss": 0.052,
"step": 760
},
{
"epoch": 0.8389798767407811,
"grad_norm": 0.23346182703971863,
"learning_rate": 1.4402035623409672e-05,
"loss": 0.0238,
"step": 770
},
{
"epoch": 0.8498757192958561,
"grad_norm": 0.7445326447486877,
"learning_rate": 1.4329334787350055e-05,
"loss": 0.0179,
"step": 780
},
{
"epoch": 0.8607715618509313,
"grad_norm": 1.623715877532959,
"learning_rate": 1.4256633951290442e-05,
"loss": 0.0138,
"step": 790
},
{
"epoch": 0.8716674044060063,
"grad_norm": 0.12205464392900467,
"learning_rate": 1.4183933115230826e-05,
"loss": 0.0182,
"step": 800
},
{
"epoch": 0.8825632469610815,
"grad_norm": 0.015034107491374016,
"learning_rate": 1.4111232279171211e-05,
"loss": 0.0192,
"step": 810
},
{
"epoch": 0.8934590895161565,
"grad_norm": 1.1116948127746582,
"learning_rate": 1.4038531443111596e-05,
"loss": 0.0329,
"step": 820
},
{
"epoch": 0.9043549320712315,
"grad_norm": 0.35468608140945435,
"learning_rate": 1.3965830607051982e-05,
"loss": 0.0299,
"step": 830
},
{
"epoch": 0.9152507746263067,
"grad_norm": 1.3069281578063965,
"learning_rate": 1.3893129770992369e-05,
"loss": 0.028,
"step": 840
},
{
"epoch": 0.9261466171813817,
"grad_norm": 0.6548961997032166,
"learning_rate": 1.3820428934932752e-05,
"loss": 0.0125,
"step": 850
},
{
"epoch": 0.9370424597364568,
"grad_norm": 0.016538333147764206,
"learning_rate": 1.3747728098873138e-05,
"loss": 0.0097,
"step": 860
},
{
"epoch": 0.9479383022915319,
"grad_norm": 0.7220777273178101,
"learning_rate": 1.3675027262813523e-05,
"loss": 0.0281,
"step": 870
},
{
"epoch": 0.958834144846607,
"grad_norm": 7.228305339813232,
"learning_rate": 1.360232642675391e-05,
"loss": 0.0095,
"step": 880
},
{
"epoch": 0.969729987401682,
"grad_norm": 0.31951704621315,
"learning_rate": 1.3529625590694292e-05,
"loss": 0.0148,
"step": 890
},
{
"epoch": 0.9806258299567572,
"grad_norm": 0.009546870365738869,
"learning_rate": 1.3456924754634679e-05,
"loss": 0.0051,
"step": 900
},
{
"epoch": 0.9915216725118322,
"grad_norm": 2.050363063812256,
"learning_rate": 1.3384223918575065e-05,
"loss": 0.0306,
"step": 910
},
{
"epoch": 1.0032687527665225,
"grad_norm": 1.1950825452804565,
"learning_rate": 1.331152308251545e-05,
"loss": 0.0061,
"step": 920
},
{
"epoch": 1.0141645953215976,
"grad_norm": 0.02007538639008999,
"learning_rate": 1.3238822246455837e-05,
"loss": 0.005,
"step": 930
},
{
"epoch": 1.0250604378766728,
"grad_norm": 0.053643591701984406,
"learning_rate": 1.316612141039622e-05,
"loss": 0.0093,
"step": 940
},
{
"epoch": 1.0359562804317477,
"grad_norm": 0.13197128474712372,
"learning_rate": 1.3093420574336606e-05,
"loss": 0.0123,
"step": 950
},
{
"epoch": 1.0468521229868228,
"grad_norm": 0.20932506024837494,
"learning_rate": 1.3020719738276992e-05,
"loss": 0.0267,
"step": 960
},
{
"epoch": 1.057747965541898,
"grad_norm": 0.11939968913793564,
"learning_rate": 1.2948018902217377e-05,
"loss": 0.0042,
"step": 970
},
{
"epoch": 1.068643808096973,
"grad_norm": 0.08671363443136215,
"learning_rate": 1.2875318066157762e-05,
"loss": 0.009,
"step": 980
},
{
"epoch": 1.079539650652048,
"grad_norm": 0.025082537904381752,
"learning_rate": 1.2802617230098147e-05,
"loss": 0.0028,
"step": 990
},
{
"epoch": 1.0904354932071232,
"grad_norm": 0.005358474794775248,
"learning_rate": 1.2729916394038533e-05,
"loss": 0.0017,
"step": 1000
},
{
"epoch": 1.1013313357621983,
"grad_norm": 0.008662994019687176,
"learning_rate": 1.2657215557978916e-05,
"loss": 0.0013,
"step": 1010
},
{
"epoch": 1.1122271783172732,
"grad_norm": 2.0191564559936523,
"learning_rate": 1.2584514721919303e-05,
"loss": 0.0179,
"step": 1020
},
{
"epoch": 1.1231230208723484,
"grad_norm": 0.025384988635778427,
"learning_rate": 1.2511813885859689e-05,
"loss": 0.02,
"step": 1030
},
{
"epoch": 1.1340188634274235,
"grad_norm": 0.011868833564221859,
"learning_rate": 1.2439113049800074e-05,
"loss": 0.0024,
"step": 1040
},
{
"epoch": 1.1449147059824987,
"grad_norm": 0.010154581628739834,
"learning_rate": 1.236641221374046e-05,
"loss": 0.0053,
"step": 1050
},
{
"epoch": 1.1558105485375736,
"grad_norm": 0.09402716159820557,
"learning_rate": 1.2293711377680843e-05,
"loss": 0.005,
"step": 1060
},
{
"epoch": 1.1667063910926487,
"grad_norm": 0.3972262442111969,
"learning_rate": 1.222101054162123e-05,
"loss": 0.0065,
"step": 1070
},
{
"epoch": 1.1776022336477239,
"grad_norm": 0.02627560682594776,
"learning_rate": 1.2148309705561614e-05,
"loss": 0.0192,
"step": 1080
},
{
"epoch": 1.1884980762027988,
"grad_norm": 0.538215160369873,
"learning_rate": 1.2075608869502e-05,
"loss": 0.0073,
"step": 1090
},
{
"epoch": 1.199393918757874,
"grad_norm": 0.48226070404052734,
"learning_rate": 1.2002908033442387e-05,
"loss": 0.0009,
"step": 1100
},
{
"epoch": 1.210289761312949,
"grad_norm": 0.5596455335617065,
"learning_rate": 1.193020719738277e-05,
"loss": 0.0119,
"step": 1110
},
{
"epoch": 1.2211856038680242,
"grad_norm": 0.03299971669912338,
"learning_rate": 1.1857506361323157e-05,
"loss": 0.0025,
"step": 1120
},
{
"epoch": 1.2320814464230991,
"grad_norm": 0.03791365772485733,
"learning_rate": 1.1784805525263541e-05,
"loss": 0.0147,
"step": 1130
},
{
"epoch": 1.2429772889781743,
"grad_norm": 0.6537386178970337,
"learning_rate": 1.1712104689203926e-05,
"loss": 0.0026,
"step": 1140
},
{
"epoch": 1.2538731315332494,
"grad_norm": 0.02327698841691017,
"learning_rate": 1.1639403853144313e-05,
"loss": 0.0012,
"step": 1150
},
{
"epoch": 1.2647689740883243,
"grad_norm": 0.024980690330266953,
"learning_rate": 1.1566703017084697e-05,
"loss": 0.0053,
"step": 1160
},
{
"epoch": 1.2756648166433995,
"grad_norm": 0.01306835189461708,
"learning_rate": 1.1494002181025084e-05,
"loss": 0.0179,
"step": 1170
},
{
"epoch": 1.2865606591984746,
"grad_norm": 0.005500817205756903,
"learning_rate": 1.1421301344965467e-05,
"loss": 0.0117,
"step": 1180
},
{
"epoch": 1.2974565017535498,
"grad_norm": 2.294457197189331,
"learning_rate": 1.1348600508905853e-05,
"loss": 0.0065,
"step": 1190
},
{
"epoch": 1.3083523443086247,
"grad_norm": 3.2596099376678467,
"learning_rate": 1.1275899672846238e-05,
"loss": 0.0128,
"step": 1200
},
{
"epoch": 1.3192481868636998,
"grad_norm": 0.014325232245028019,
"learning_rate": 1.1203198836786624e-05,
"loss": 0.004,
"step": 1210
},
{
"epoch": 1.330144029418775,
"grad_norm": 0.08742561936378479,
"learning_rate": 1.1130498000727011e-05,
"loss": 0.005,
"step": 1220
},
{
"epoch": 1.3410398719738499,
"grad_norm": 0.06310788542032242,
"learning_rate": 1.1057797164667394e-05,
"loss": 0.0062,
"step": 1230
},
{
"epoch": 1.351935714528925,
"grad_norm": 0.02661961503326893,
"learning_rate": 1.098509632860778e-05,
"loss": 0.001,
"step": 1240
},
{
"epoch": 1.3628315570840002,
"grad_norm": 0.008728576824069023,
"learning_rate": 1.0912395492548165e-05,
"loss": 0.0065,
"step": 1250
},
{
"epoch": 1.3737273996390753,
"grad_norm": 0.40287479758262634,
"learning_rate": 1.0839694656488552e-05,
"loss": 0.0115,
"step": 1260
},
{
"epoch": 1.3846232421941502,
"grad_norm": 0.0008290009573101997,
"learning_rate": 1.0766993820428935e-05,
"loss": 0.0023,
"step": 1270
},
{
"epoch": 1.3955190847492254,
"grad_norm": 0.20154079794883728,
"learning_rate": 1.0694292984369321e-05,
"loss": 0.004,
"step": 1280
},
{
"epoch": 1.4064149273043005,
"grad_norm": 0.032378897070884705,
"learning_rate": 1.0621592148309707e-05,
"loss": 0.0103,
"step": 1290
},
{
"epoch": 1.4173107698593754,
"grad_norm": 0.037077393382787704,
"learning_rate": 1.0548891312250092e-05,
"loss": 0.0048,
"step": 1300
},
{
"epoch": 1.4282066124144506,
"grad_norm": 0.0009527279180474579,
"learning_rate": 1.0476190476190477e-05,
"loss": 0.0197,
"step": 1310
},
{
"epoch": 1.4391024549695257,
"grad_norm": 0.6460732221603394,
"learning_rate": 1.0403489640130862e-05,
"loss": 0.0085,
"step": 1320
},
{
"epoch": 1.4499982975246009,
"grad_norm": 0.18065184354782104,
"learning_rate": 1.0330788804071248e-05,
"loss": 0.0021,
"step": 1330
},
{
"epoch": 1.4608941400796758,
"grad_norm": 0.08325136452913284,
"learning_rate": 1.0258087968011631e-05,
"loss": 0.0079,
"step": 1340
},
{
"epoch": 1.471789982634751,
"grad_norm": 0.0035695817787200212,
"learning_rate": 1.0185387131952018e-05,
"loss": 0.0001,
"step": 1350
},
{
"epoch": 1.482685825189826,
"grad_norm": 0.00448552705347538,
"learning_rate": 1.0112686295892404e-05,
"loss": 0.0004,
"step": 1360
},
{
"epoch": 1.493581667744901,
"grad_norm": 0.027783585712313652,
"learning_rate": 1.0039985459832789e-05,
"loss": 0.011,
"step": 1370
},
{
"epoch": 1.5044775102999761,
"grad_norm": 2.4403154850006104,
"learning_rate": 9.967284623773175e-06,
"loss": 0.0162,
"step": 1380
},
{
"epoch": 1.5153733528550513,
"grad_norm": 0.031121332198381424,
"learning_rate": 9.89458378771356e-06,
"loss": 0.0019,
"step": 1390
},
{
"epoch": 1.5262691954101264,
"grad_norm": 0.01372817624360323,
"learning_rate": 9.821882951653945e-06,
"loss": 0.0107,
"step": 1400
},
{
"epoch": 1.5371650379652015,
"grad_norm": 0.015296364203095436,
"learning_rate": 9.74918211559433e-06,
"loss": 0.0107,
"step": 1410
},
{
"epoch": 1.5480608805202765,
"grad_norm": 0.022742554545402527,
"learning_rate": 9.676481279534716e-06,
"loss": 0.0055,
"step": 1420
},
{
"epoch": 1.5589567230753516,
"grad_norm": 0.005425534211099148,
"learning_rate": 9.6037804434751e-06,
"loss": 0.001,
"step": 1430
},
{
"epoch": 1.5698525656304265,
"grad_norm": 0.0004977713688276708,
"learning_rate": 9.531079607415487e-06,
"loss": 0.0015,
"step": 1440
},
{
"epoch": 1.5807484081855017,
"grad_norm": 0.016388392075896263,
"learning_rate": 9.458378771355872e-06,
"loss": 0.0213,
"step": 1450
},
{
"epoch": 1.5916442507405768,
"grad_norm": 0.029239172115921974,
"learning_rate": 9.385677935296256e-06,
"loss": 0.0032,
"step": 1460
},
{
"epoch": 1.602540093295652,
"grad_norm": 0.25184109807014465,
"learning_rate": 9.312977099236641e-06,
"loss": 0.0139,
"step": 1470
},
{
"epoch": 1.613435935850727,
"grad_norm": 0.5452978014945984,
"learning_rate": 9.240276263177028e-06,
"loss": 0.001,
"step": 1480
},
{
"epoch": 1.624331778405802,
"grad_norm": 0.00713045010343194,
"learning_rate": 9.167575427117412e-06,
"loss": 0.0068,
"step": 1490
},
{
"epoch": 1.6352276209608771,
"grad_norm": 0.04856117442250252,
"learning_rate": 9.094874591057799e-06,
"loss": 0.013,
"step": 1500
},
{
"epoch": 1.646123463515952,
"grad_norm": 0.6631866693496704,
"learning_rate": 9.022173754998184e-06,
"loss": 0.0118,
"step": 1510
},
{
"epoch": 1.6570193060710272,
"grad_norm": 0.34849047660827637,
"learning_rate": 8.949472918938568e-06,
"loss": 0.004,
"step": 1520
},
{
"epoch": 1.6679151486261024,
"grad_norm": 0.011874212883412838,
"learning_rate": 8.876772082878955e-06,
"loss": 0.002,
"step": 1530
},
{
"epoch": 1.6788109911811775,
"grad_norm": 0.05654163286089897,
"learning_rate": 8.80407124681934e-06,
"loss": 0.0033,
"step": 1540
},
{
"epoch": 1.6897068337362526,
"grad_norm": 0.05505364388227463,
"learning_rate": 8.731370410759724e-06,
"loss": 0.0016,
"step": 1550
},
{
"epoch": 1.7006026762913276,
"grad_norm": 0.8052054047584534,
"learning_rate": 8.658669574700109e-06,
"loss": 0.0033,
"step": 1560
},
{
"epoch": 1.7114985188464027,
"grad_norm": 0.001815033028833568,
"learning_rate": 8.585968738640495e-06,
"loss": 0.0026,
"step": 1570
},
{
"epoch": 1.7223943614014776,
"grad_norm": 0.17480531334877014,
"learning_rate": 8.51326790258088e-06,
"loss": 0.0064,
"step": 1580
},
{
"epoch": 1.7332902039565528,
"grad_norm": 0.005486777517944574,
"learning_rate": 8.440567066521266e-06,
"loss": 0.0208,
"step": 1590
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.10310015082359314,
"learning_rate": 8.367866230461651e-06,
"loss": 0.0005,
"step": 1600
},
{
"epoch": 1.755081889066703,
"grad_norm": 0.008104170672595501,
"learning_rate": 8.295165394402036e-06,
"loss": 0.0087,
"step": 1610
},
{
"epoch": 1.7659777316217782,
"grad_norm": 0.033456411212682724,
"learning_rate": 8.22246455834242e-06,
"loss": 0.0072,
"step": 1620
},
{
"epoch": 1.776873574176853,
"grad_norm": 0.007005383726209402,
"learning_rate": 8.149763722282807e-06,
"loss": 0.014,
"step": 1630
},
{
"epoch": 1.7877694167319282,
"grad_norm": 0.012260228395462036,
"learning_rate": 8.077062886223192e-06,
"loss": 0.0008,
"step": 1640
},
{
"epoch": 1.7986652592870032,
"grad_norm": 0.0009957356378436089,
"learning_rate": 8.004362050163578e-06,
"loss": 0.0014,
"step": 1650
},
{
"epoch": 1.8095611018420783,
"grad_norm": 0.005955096334218979,
"learning_rate": 7.931661214103963e-06,
"loss": 0.0005,
"step": 1660
},
{
"epoch": 1.8204569443971534,
"grad_norm": 0.0004700123390648514,
"learning_rate": 7.858960378044348e-06,
"loss": 0.0028,
"step": 1670
},
{
"epoch": 1.8313527869522286,
"grad_norm": 0.002416003029793501,
"learning_rate": 7.786259541984733e-06,
"loss": 0.0003,
"step": 1680
},
{
"epoch": 1.8422486295073037,
"grad_norm": 0.028112288564443588,
"learning_rate": 7.713558705925119e-06,
"loss": 0.0318,
"step": 1690
},
{
"epoch": 1.8531444720623786,
"grad_norm": 0.03914355859160423,
"learning_rate": 7.640857869865504e-06,
"loss": 0.0139,
"step": 1700
},
{
"epoch": 1.8640403146174538,
"grad_norm": 4.869634628295898,
"learning_rate": 7.568157033805889e-06,
"loss": 0.0098,
"step": 1710
},
{
"epoch": 1.8749361571725287,
"grad_norm": 1.1335488557815552,
"learning_rate": 7.495456197746275e-06,
"loss": 0.0174,
"step": 1720
},
{
"epoch": 1.8858319997276038,
"grad_norm": 0.6747786402702332,
"learning_rate": 7.42275536168666e-06,
"loss": 0.0044,
"step": 1730
},
{
"epoch": 1.896727842282679,
"grad_norm": 0.9970724582672119,
"learning_rate": 7.350054525627045e-06,
"loss": 0.0087,
"step": 1740
},
{
"epoch": 1.9076236848377541,
"grad_norm": 0.16893063485622406,
"learning_rate": 7.27735368956743e-06,
"loss": 0.0032,
"step": 1750
},
{
"epoch": 1.9185195273928293,
"grad_norm": 0.8119887709617615,
"learning_rate": 7.204652853507816e-06,
"loss": 0.0153,
"step": 1760
},
{
"epoch": 1.9294153699479044,
"grad_norm": 0.006383243482559919,
"learning_rate": 7.131952017448202e-06,
"loss": 0.0034,
"step": 1770
},
{
"epoch": 1.9403112125029793,
"grad_norm": 0.03637854754924774,
"learning_rate": 7.059251181388587e-06,
"loss": 0.0034,
"step": 1780
},
{
"epoch": 1.9512070550580543,
"grad_norm": 0.04712774232029915,
"learning_rate": 6.9865503453289714e-06,
"loss": 0.0234,
"step": 1790
},
{
"epoch": 1.9621028976131294,
"grad_norm": 6.268856525421143,
"learning_rate": 6.913849509269357e-06,
"loss": 0.0265,
"step": 1800
},
{
"epoch": 1.9729987401682045,
"grad_norm": 0.6448054313659668,
"learning_rate": 6.841148673209742e-06,
"loss": 0.0057,
"step": 1810
},
{
"epoch": 1.9838945827232797,
"grad_norm": 0.07000619918107986,
"learning_rate": 6.768447837150128e-06,
"loss": 0.0005,
"step": 1820
},
{
"epoch": 1.9947904252783548,
"grad_norm": 0.012424224987626076,
"learning_rate": 6.695747001090514e-06,
"loss": 0.0039,
"step": 1830
},
{
"epoch": 2.006537505533045,
"grad_norm": 0.08453727513551712,
"learning_rate": 6.6230461650308985e-06,
"loss": 0.0006,
"step": 1840
},
{
"epoch": 2.01743334808812,
"grad_norm": 0.0390053391456604,
"learning_rate": 6.550345328971284e-06,
"loss": 0.0006,
"step": 1850
},
{
"epoch": 2.0283291906431953,
"grad_norm": 0.013394408859312534,
"learning_rate": 6.477644492911669e-06,
"loss": 0.0049,
"step": 1860
},
{
"epoch": 2.0392250331982704,
"grad_norm": 0.0027593837585300207,
"learning_rate": 6.404943656852054e-06,
"loss": 0.0008,
"step": 1870
},
{
"epoch": 2.0501208757533456,
"grad_norm": 0.0010020197369158268,
"learning_rate": 6.332242820792439e-06,
"loss": 0.0023,
"step": 1880
},
{
"epoch": 2.0610167183084203,
"grad_norm": 0.0010899041080847383,
"learning_rate": 6.259541984732826e-06,
"loss": 0.0005,
"step": 1890
},
{
"epoch": 2.0719125608634954,
"grad_norm": 0.03333039954304695,
"learning_rate": 6.18684114867321e-06,
"loss": 0.0011,
"step": 1900
},
{
"epoch": 2.0828084034185705,
"grad_norm": 0.002606542780995369,
"learning_rate": 6.114140312613596e-06,
"loss": 0.0062,
"step": 1910
},
{
"epoch": 2.0937042459736457,
"grad_norm": 0.008523502387106419,
"learning_rate": 6.041439476553981e-06,
"loss": 0.0001,
"step": 1920
},
{
"epoch": 2.104600088528721,
"grad_norm": 0.005313311703503132,
"learning_rate": 5.968738640494366e-06,
"loss": 0.0095,
"step": 1930
},
{
"epoch": 2.115495931083796,
"grad_norm": 0.030115563422441483,
"learning_rate": 5.896037804434751e-06,
"loss": 0.0011,
"step": 1940
},
{
"epoch": 2.126391773638871,
"grad_norm": 0.001531143207103014,
"learning_rate": 5.823336968375137e-06,
"loss": 0.0047,
"step": 1950
},
{
"epoch": 2.137287616193946,
"grad_norm": 0.013100974261760712,
"learning_rate": 5.750636132315522e-06,
"loss": 0.0041,
"step": 1960
},
{
"epoch": 2.148183458749021,
"grad_norm": 0.010219580493867397,
"learning_rate": 5.677935296255908e-06,
"loss": 0.0012,
"step": 1970
},
{
"epoch": 2.159079301304096,
"grad_norm": 0.02304321527481079,
"learning_rate": 5.6052344601962925e-06,
"loss": 0.0006,
"step": 1980
},
{
"epoch": 2.1699751438591712,
"grad_norm": 0.32716256380081177,
"learning_rate": 5.532533624136678e-06,
"loss": 0.0005,
"step": 1990
},
{
"epoch": 2.1808709864142464,
"grad_norm": 0.003199178259819746,
"learning_rate": 5.459832788077063e-06,
"loss": 0.0002,
"step": 2000
},
{
"epoch": 2.1917668289693215,
"grad_norm": 0.10407451540231705,
"learning_rate": 5.387131952017448e-06,
"loss": 0.0026,
"step": 2010
},
{
"epoch": 2.2026626715243967,
"grad_norm": 0.0036433066707104445,
"learning_rate": 5.314431115957834e-06,
"loss": 0.0053,
"step": 2020
},
{
"epoch": 2.2135585140794714,
"grad_norm": 0.22139491140842438,
"learning_rate": 5.2417302798982195e-06,
"loss": 0.0013,
"step": 2030
},
{
"epoch": 2.2244543566345465,
"grad_norm": 0.00901265349239111,
"learning_rate": 5.169029443838604e-06,
"loss": 0.0004,
"step": 2040
},
{
"epoch": 2.2353501991896216,
"grad_norm": 0.007596256677061319,
"learning_rate": 5.09632860777899e-06,
"loss": 0.0002,
"step": 2050
},
{
"epoch": 2.2462460417446968,
"grad_norm": 0.05308268591761589,
"learning_rate": 5.023627771719375e-06,
"loss": 0.0001,
"step": 2060
},
{
"epoch": 2.257141884299772,
"grad_norm": 0.005023419391363859,
"learning_rate": 4.95092693565976e-06,
"loss": 0.0001,
"step": 2070
},
{
"epoch": 2.268037726854847,
"grad_norm": 0.09251435101032257,
"learning_rate": 4.878226099600146e-06,
"loss": 0.0008,
"step": 2080
},
{
"epoch": 2.278933569409922,
"grad_norm": 0.0035660325083881617,
"learning_rate": 4.8055252635405305e-06,
"loss": 0.0029,
"step": 2090
},
{
"epoch": 2.2898294119649973,
"grad_norm": 0.00022365724726114422,
"learning_rate": 4.732824427480917e-06,
"loss": 0.0,
"step": 2100
},
{
"epoch": 2.300725254520072,
"grad_norm": 0.28966161608695984,
"learning_rate": 4.660123591421302e-06,
"loss": 0.0004,
"step": 2110
},
{
"epoch": 2.311621097075147,
"grad_norm": 0.000494773150421679,
"learning_rate": 4.5874227553616864e-06,
"loss": 0.0003,
"step": 2120
},
{
"epoch": 2.3225169396302223,
"grad_norm": 0.2110077142715454,
"learning_rate": 4.514721919302073e-06,
"loss": 0.0007,
"step": 2130
},
{
"epoch": 2.3334127821852975,
"grad_norm": 0.0006416022079065442,
"learning_rate": 4.442021083242458e-06,
"loss": 0.0006,
"step": 2140
},
{
"epoch": 2.3443086247403726,
"grad_norm": 0.0005581114673987031,
"learning_rate": 4.369320247182842e-06,
"loss": 0.0004,
"step": 2150
},
{
"epoch": 2.3552044672954477,
"grad_norm": 0.0006430571665987372,
"learning_rate": 4.296619411123229e-06,
"loss": 0.0013,
"step": 2160
},
{
"epoch": 2.3661003098505224,
"grad_norm": 0.0002313524018973112,
"learning_rate": 4.2239185750636135e-06,
"loss": 0.0011,
"step": 2170
},
{
"epoch": 2.3769961524055976,
"grad_norm": 0.01299639604985714,
"learning_rate": 4.151217739003999e-06,
"loss": 0.0002,
"step": 2180
},
{
"epoch": 2.3878919949606727,
"grad_norm": 0.036279868334531784,
"learning_rate": 4.078516902944385e-06,
"loss": 0.0,
"step": 2190
},
{
"epoch": 2.398787837515748,
"grad_norm": 0.0004496763285715133,
"learning_rate": 4.005816066884769e-06,
"loss": 0.0,
"step": 2200
},
{
"epoch": 2.409683680070823,
"grad_norm": 0.010034661740064621,
"learning_rate": 3.933115230825155e-06,
"loss": 0.0,
"step": 2210
},
{
"epoch": 2.420579522625898,
"grad_norm": 0.0027114665135741234,
"learning_rate": 3.860414394765541e-06,
"loss": 0.0,
"step": 2220
},
{
"epoch": 2.4314753651809733,
"grad_norm": 0.00021306249254848808,
"learning_rate": 3.7877135587059253e-06,
"loss": 0.0,
"step": 2230
},
{
"epoch": 2.4423712077360484,
"grad_norm": 0.002327492693439126,
"learning_rate": 3.7150127226463105e-06,
"loss": 0.0,
"step": 2240
},
{
"epoch": 2.453267050291123,
"grad_norm": 0.0042752730660140514,
"learning_rate": 3.6423118865866965e-06,
"loss": 0.0001,
"step": 2250
},
{
"epoch": 2.4641628928461983,
"grad_norm": 0.5819891691207886,
"learning_rate": 3.5696110505270817e-06,
"loss": 0.0014,
"step": 2260
},
{
"epoch": 2.4750587354012734,
"grad_norm": 0.0002232871629530564,
"learning_rate": 3.4969102144674664e-06,
"loss": 0.0,
"step": 2270
},
{
"epoch": 2.4859545779563486,
"grad_norm": 0.0006547856028191745,
"learning_rate": 3.4242093784078516e-06,
"loss": 0.0,
"step": 2280
},
{
"epoch": 2.4968504205114237,
"grad_norm": 0.007096582092344761,
"learning_rate": 3.3515085423482376e-06,
"loss": 0.0,
"step": 2290
},
{
"epoch": 2.507746263066499,
"grad_norm": 0.007319641765207052,
"learning_rate": 3.2788077062886227e-06,
"loss": 0.0,
"step": 2300
},
{
"epoch": 2.5186421056215735,
"grad_norm": 0.00013177268556319177,
"learning_rate": 3.206106870229008e-06,
"loss": 0.0,
"step": 2310
},
{
"epoch": 2.5295379481766487,
"grad_norm": 0.001638653688132763,
"learning_rate": 3.1334060341693935e-06,
"loss": 0.0002,
"step": 2320
},
{
"epoch": 2.540433790731724,
"grad_norm": 0.00048312891158275306,
"learning_rate": 3.0607051981097786e-06,
"loss": 0.0,
"step": 2330
},
{
"epoch": 2.551329633286799,
"grad_norm": 0.001063148258253932,
"learning_rate": 2.988004362050164e-06,
"loss": 0.0001,
"step": 2340
},
{
"epoch": 2.562225475841874,
"grad_norm": 0.005976190324872732,
"learning_rate": 2.9153035259905494e-06,
"loss": 0.0,
"step": 2350
},
{
"epoch": 2.5731213183969492,
"grad_norm": 0.001030449871905148,
"learning_rate": 2.8426026899309345e-06,
"loss": 0.0001,
"step": 2360
},
{
"epoch": 2.5840171609520244,
"grad_norm": 0.000677391595672816,
"learning_rate": 2.7699018538713197e-06,
"loss": 0.0016,
"step": 2370
},
{
"epoch": 2.5949130035070995,
"grad_norm": 1.1224867105484009,
"learning_rate": 2.6972010178117053e-06,
"loss": 0.0036,
"step": 2380
},
{
"epoch": 2.6058088460621747,
"grad_norm": 0.0026874279137700796,
"learning_rate": 2.6245001817520905e-06,
"loss": 0.0,
"step": 2390
},
{
"epoch": 2.6167046886172494,
"grad_norm": 0.003862058976665139,
"learning_rate": 2.5517993456924756e-06,
"loss": 0.0001,
"step": 2400
},
{
"epoch": 2.6276005311723245,
"grad_norm": 0.0830313041806221,
"learning_rate": 2.4790985096328608e-06,
"loss": 0.0014,
"step": 2410
},
{
"epoch": 2.6384963737273996,
"grad_norm": 0.0019621718674898148,
"learning_rate": 2.4063976735732464e-06,
"loss": 0.0005,
"step": 2420
},
{
"epoch": 2.649392216282475,
"grad_norm": 0.28306806087493896,
"learning_rate": 2.3336968375136315e-06,
"loss": 0.0002,
"step": 2430
},
{
"epoch": 2.66028805883755,
"grad_norm": 0.004503046162426472,
"learning_rate": 2.260996001454017e-06,
"loss": 0.0,
"step": 2440
},
{
"epoch": 2.6711839013926246,
"grad_norm": 0.0008729721885174513,
"learning_rate": 2.1882951653944023e-06,
"loss": 0.0008,
"step": 2450
},
{
"epoch": 2.6820797439476998,
"grad_norm": 0.010283468291163445,
"learning_rate": 2.1155943293347874e-06,
"loss": 0.0,
"step": 2460
},
{
"epoch": 2.692975586502775,
"grad_norm": 1.8014414308709092e-05,
"learning_rate": 2.042893493275173e-06,
"loss": 0.0012,
"step": 2470
},
{
"epoch": 2.70387142905785,
"grad_norm": 0.0013227862073108554,
"learning_rate": 1.970192657215558e-06,
"loss": 0.0001,
"step": 2480
},
{
"epoch": 2.714767271612925,
"grad_norm": 9.750492608873174e-05,
"learning_rate": 1.8974918211559433e-06,
"loss": 0.0012,
"step": 2490
},
{
"epoch": 2.7256631141680003,
"grad_norm": 0.009569020941853523,
"learning_rate": 1.824790985096329e-06,
"loss": 0.0001,
"step": 2500
},
{
"epoch": 2.7365589567230755,
"grad_norm": 0.00015347945736721158,
"learning_rate": 1.752090149036714e-06,
"loss": 0.0001,
"step": 2510
},
{
"epoch": 2.7474547992781506,
"grad_norm": 0.0024864268489181995,
"learning_rate": 1.6793893129770995e-06,
"loss": 0.0002,
"step": 2520
},
{
"epoch": 2.7583506418332258,
"grad_norm": 0.0018065335461869836,
"learning_rate": 1.6066884769174848e-06,
"loss": 0.0,
"step": 2530
},
{
"epoch": 2.7692464843883005,
"grad_norm": 0.000252872530836612,
"learning_rate": 1.53398764085787e-06,
"loss": 0.0002,
"step": 2540
},
{
"epoch": 2.7801423269433756,
"grad_norm": 0.0006220173672772944,
"learning_rate": 1.4612868047982554e-06,
"loss": 0.0,
"step": 2550
},
{
"epoch": 2.7910381694984507,
"grad_norm": 0.00021657197794411331,
"learning_rate": 1.3885859687386405e-06,
"loss": 0.002,
"step": 2560
},
{
"epoch": 2.801934012053526,
"grad_norm": 0.062267255038022995,
"learning_rate": 1.315885132679026e-06,
"loss": 0.0001,
"step": 2570
},
{
"epoch": 2.812829854608601,
"grad_norm": 0.00383751024492085,
"learning_rate": 1.2431842966194113e-06,
"loss": 0.0002,
"step": 2580
},
{
"epoch": 2.8237256971636757,
"grad_norm": 9.788275929167867e-05,
"learning_rate": 1.1704834605597967e-06,
"loss": 0.0006,
"step": 2590
},
{
"epoch": 2.834621539718751,
"grad_norm": 0.0013275217497721314,
"learning_rate": 1.0977826245001818e-06,
"loss": 0.0002,
"step": 2600
},
{
"epoch": 2.845517382273826,
"grad_norm": 0.0015028759371489286,
"learning_rate": 1.0250817884405672e-06,
"loss": 0.0,
"step": 2610
},
{
"epoch": 2.856413224828901,
"grad_norm": 0.00014119225670583546,
"learning_rate": 9.523809523809525e-07,
"loss": 0.0,
"step": 2620
},
{
"epoch": 2.8673090673839763,
"grad_norm": 0.007295021787285805,
"learning_rate": 8.796801163213378e-07,
"loss": 0.0,
"step": 2630
},
{
"epoch": 2.8782049099390514,
"grad_norm": 2.5996017939178273e-05,
"learning_rate": 8.069792802617231e-07,
"loss": 0.0001,
"step": 2640
},
{
"epoch": 2.8891007524941266,
"grad_norm": 0.00027592500555329025,
"learning_rate": 7.342784442021084e-07,
"loss": 0.0001,
"step": 2650
},
{
"epoch": 2.8999965950492017,
"grad_norm": 0.0033551298547536135,
"learning_rate": 6.615776081424936e-07,
"loss": 0.0,
"step": 2660
},
{
"epoch": 2.910892437604277,
"grad_norm": 0.0005961539573036134,
"learning_rate": 5.88876772082879e-07,
"loss": 0.0,
"step": 2670
},
{
"epoch": 2.9217882801593515,
"grad_norm": 0.0015423846198245883,
"learning_rate": 5.161759360232643e-07,
"loss": 0.0003,
"step": 2680
},
{
"epoch": 2.9326841227144267,
"grad_norm": 0.000448063132353127,
"learning_rate": 4.434750999636496e-07,
"loss": 0.0031,
"step": 2690
},
{
"epoch": 2.943579965269502,
"grad_norm": 0.003001452423632145,
"learning_rate": 3.7077426390403497e-07,
"loss": 0.0,
"step": 2700
},
{
"epoch": 2.954475807824577,
"grad_norm": 4.6965491492301226e-05,
"learning_rate": 2.9807342784442023e-07,
"loss": 0.0001,
"step": 2710
},
{
"epoch": 2.965371650379652,
"grad_norm": 0.00013006600784137845,
"learning_rate": 2.2537259178480555e-07,
"loss": 0.001,
"step": 2720
},
{
"epoch": 2.9762674929347273,
"grad_norm": 0.006912072654813528,
"learning_rate": 1.5267175572519085e-07,
"loss": 0.0,
"step": 2730
},
{
"epoch": 2.987163335489802,
"grad_norm": 0.0006019837455824018,
"learning_rate": 7.997091966557616e-08,
"loss": 0.0,
"step": 2740
},
{
"epoch": 2.998059178044877,
"grad_norm": 0.006343195680528879,
"learning_rate": 7.2700836059614684e-09,
"loss": 0.0005,
"step": 2750
}
],
"logging_steps": 10,
"max_steps": 2751,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1303954889740124e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}