openthoughts2_math_100k / trainer_state.json
EtashGuha's picture
Upload model
9876318 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.954168967421314,
"eval_steps": 500,
"global_step": 280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0176697956929873,
"grad_norm": 5.875631617147988,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.8112,
"step": 1
},
{
"epoch": 0.0353395913859746,
"grad_norm": 5.961953835603671,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.8194,
"step": 2
},
{
"epoch": 0.0530093870789619,
"grad_norm": 5.5057251196001635,
"learning_rate": 8.571428571428571e-06,
"loss": 0.796,
"step": 3
},
{
"epoch": 0.0706791827719492,
"grad_norm": 2.504760468622455,
"learning_rate": 1.1428571428571429e-05,
"loss": 0.7065,
"step": 4
},
{
"epoch": 0.0883489784649365,
"grad_norm": 3.925627650334603,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.7078,
"step": 5
},
{
"epoch": 0.1060187741579238,
"grad_norm": 4.140394334047557,
"learning_rate": 1.7142857142857142e-05,
"loss": 0.6955,
"step": 6
},
{
"epoch": 0.1236885698509111,
"grad_norm": 4.544574066739854,
"learning_rate": 2e-05,
"loss": 0.6606,
"step": 7
},
{
"epoch": 0.1413583655438984,
"grad_norm": 2.8858399260590226,
"learning_rate": 2.2857142857142858e-05,
"loss": 0.6356,
"step": 8
},
{
"epoch": 0.1590281612368857,
"grad_norm": 2.810080725674096,
"learning_rate": 2.5714285714285718e-05,
"loss": 0.6119,
"step": 9
},
{
"epoch": 0.176697956929873,
"grad_norm": 2.3552991906071616,
"learning_rate": 2.8571428571428574e-05,
"loss": 0.6021,
"step": 10
},
{
"epoch": 0.1943677526228603,
"grad_norm": 1.2910105034140027,
"learning_rate": 3.142857142857143e-05,
"loss": 0.5694,
"step": 11
},
{
"epoch": 0.2120375483158476,
"grad_norm": 1.6332855505381296,
"learning_rate": 3.4285714285714284e-05,
"loss": 0.554,
"step": 12
},
{
"epoch": 0.22970734400883489,
"grad_norm": 1.2595348729805071,
"learning_rate": 3.714285714285715e-05,
"loss": 0.5432,
"step": 13
},
{
"epoch": 0.2473771397018222,
"grad_norm": 1.3472823656550243,
"learning_rate": 4e-05,
"loss": 0.536,
"step": 14
},
{
"epoch": 0.2650469353948095,
"grad_norm": 0.9873839365603844,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.5393,
"step": 15
},
{
"epoch": 0.2827167310877968,
"grad_norm": 1.1377505855165275,
"learning_rate": 4.5714285714285716e-05,
"loss": 0.5293,
"step": 16
},
{
"epoch": 0.3003865267807841,
"grad_norm": 1.0613233081748863,
"learning_rate": 4.857142857142857e-05,
"loss": 0.5129,
"step": 17
},
{
"epoch": 0.3180563224737714,
"grad_norm": 1.1184007657565689,
"learning_rate": 5.1428571428571436e-05,
"loss": 0.5091,
"step": 18
},
{
"epoch": 0.3357261181667587,
"grad_norm": 1.1596525894157543,
"learning_rate": 5.4285714285714295e-05,
"loss": 0.5141,
"step": 19
},
{
"epoch": 0.353395913859746,
"grad_norm": 1.1056524487969162,
"learning_rate": 5.714285714285715e-05,
"loss": 0.5065,
"step": 20
},
{
"epoch": 0.37106570955273327,
"grad_norm": 1.2246588779747853,
"learning_rate": 6.000000000000001e-05,
"loss": 0.5001,
"step": 21
},
{
"epoch": 0.3887355052457206,
"grad_norm": 0.9005160048442259,
"learning_rate": 6.285714285714286e-05,
"loss": 0.4948,
"step": 22
},
{
"epoch": 0.4064053009387079,
"grad_norm": 1.468175523379714,
"learning_rate": 6.571428571428571e-05,
"loss": 0.5029,
"step": 23
},
{
"epoch": 0.4240750966316952,
"grad_norm": 0.7100895840287704,
"learning_rate": 6.857142857142857e-05,
"loss": 0.4901,
"step": 24
},
{
"epoch": 0.4417448923246825,
"grad_norm": 1.4524384718122851,
"learning_rate": 7.142857142857143e-05,
"loss": 0.5015,
"step": 25
},
{
"epoch": 0.45941468801766977,
"grad_norm": 3.29851781812619,
"learning_rate": 7.42857142857143e-05,
"loss": 0.4866,
"step": 26
},
{
"epoch": 0.4770844837106571,
"grad_norm": 1.6338876773578732,
"learning_rate": 7.714285714285715e-05,
"loss": 0.5027,
"step": 27
},
{
"epoch": 0.4947542794036444,
"grad_norm": 1.1095768120544467,
"learning_rate": 8e-05,
"loss": 0.4881,
"step": 28
},
{
"epoch": 0.5124240750966317,
"grad_norm": 0.9195853861546358,
"learning_rate": 7.9996891699239e-05,
"loss": 0.4766,
"step": 29
},
{
"epoch": 0.530093870789619,
"grad_norm": 1.893948627853531,
"learning_rate": 7.998756728003266e-05,
"loss": 0.5003,
"step": 30
},
{
"epoch": 0.5477636664826063,
"grad_norm": 1.3966468358301396,
"learning_rate": 7.997202819153595e-05,
"loss": 0.4748,
"step": 31
},
{
"epoch": 0.5654334621755936,
"grad_norm": 1.173904127551551,
"learning_rate": 7.99502768487569e-05,
"loss": 0.4728,
"step": 32
},
{
"epoch": 0.5831032578685809,
"grad_norm": 1.6511214276120525,
"learning_rate": 7.992231663218129e-05,
"loss": 0.4727,
"step": 33
},
{
"epoch": 0.6007730535615682,
"grad_norm": 0.8130185719483476,
"learning_rate": 7.988815188724721e-05,
"loss": 0.4696,
"step": 34
},
{
"epoch": 0.6184428492545555,
"grad_norm": 1.2456661967993063,
"learning_rate": 7.984778792366983e-05,
"loss": 0.4711,
"step": 35
},
{
"epoch": 0.6361126449475428,
"grad_norm": 1.0570393538806437,
"learning_rate": 7.980123101461606e-05,
"loss": 0.463,
"step": 36
},
{
"epoch": 0.6537824406405301,
"grad_norm": 0.9901050412456828,
"learning_rate": 7.974848839572971e-05,
"loss": 0.4578,
"step": 37
},
{
"epoch": 0.6714522363335174,
"grad_norm": 0.8705213139045455,
"learning_rate": 7.96895682640069e-05,
"loss": 0.4645,
"step": 38
},
{
"epoch": 0.6891220320265047,
"grad_norm": 0.7854045575641022,
"learning_rate": 7.962447977652211e-05,
"loss": 0.4647,
"step": 39
},
{
"epoch": 0.706791827719492,
"grad_norm": 0.8451669852541023,
"learning_rate": 7.955323304900514e-05,
"loss": 0.4609,
"step": 40
},
{
"epoch": 0.7244616234124793,
"grad_norm": 1.0240484592422439,
"learning_rate": 7.947583915426885e-05,
"loss": 0.4583,
"step": 41
},
{
"epoch": 0.7421314191054665,
"grad_norm": 0.9325468461529605,
"learning_rate": 7.939231012048833e-05,
"loss": 0.4599,
"step": 42
},
{
"epoch": 0.7598012147984539,
"grad_norm": 0.8208173672541784,
"learning_rate": 7.930265892933154e-05,
"loss": 0.4488,
"step": 43
},
{
"epoch": 0.7774710104914412,
"grad_norm": 0.8066370895813,
"learning_rate": 7.920689951394175e-05,
"loss": 0.4599,
"step": 44
},
{
"epoch": 0.7951408061844285,
"grad_norm": 0.5967750986819955,
"learning_rate": 7.91050467567722e-05,
"loss": 0.4531,
"step": 45
},
{
"epoch": 0.8128106018774158,
"grad_norm": 0.6644148535772102,
"learning_rate": 7.899711648727294e-05,
"loss": 0.4503,
"step": 46
},
{
"epoch": 0.830480397570403,
"grad_norm": 0.7344655144114142,
"learning_rate": 7.888312547943099e-05,
"loss": 0.4509,
"step": 47
},
{
"epoch": 0.8481501932633904,
"grad_norm": 3.462692478412094,
"learning_rate": 7.876309144916312e-05,
"loss": 0.4933,
"step": 48
},
{
"epoch": 0.8658199889563777,
"grad_norm": 1.0900503309702443,
"learning_rate": 7.863703305156273e-05,
"loss": 0.4673,
"step": 49
},
{
"epoch": 0.883489784649365,
"grad_norm": 1.372735585333971,
"learning_rate": 7.850496987800048e-05,
"loss": 0.45,
"step": 50
},
{
"epoch": 0.9011595803423523,
"grad_norm": 0.9943280611059013,
"learning_rate": 7.836692245307951e-05,
"loss": 0.4619,
"step": 51
},
{
"epoch": 0.9188293760353395,
"grad_norm": 1.1230540437524632,
"learning_rate": 7.822291223144564e-05,
"loss": 0.4602,
"step": 52
},
{
"epoch": 0.9364991717283269,
"grad_norm": 0.9358182024156444,
"learning_rate": 7.80729615944529e-05,
"loss": 0.4627,
"step": 53
},
{
"epoch": 0.9541689674213142,
"grad_norm": 0.6965038938335383,
"learning_rate": 7.791709384668528e-05,
"loss": 0.4377,
"step": 54
},
{
"epoch": 0.9718387631143015,
"grad_norm": 0.8794762258340597,
"learning_rate": 7.775533321233471e-05,
"loss": 0.4416,
"step": 55
},
{
"epoch": 0.9895085588072888,
"grad_norm": 0.5702192945738279,
"learning_rate": 7.758770483143634e-05,
"loss": 0.4381,
"step": 56
},
{
"epoch": 1.0088348978464936,
"grad_norm": 1.0158837364149667,
"learning_rate": 7.741423475596136e-05,
"loss": 0.6602,
"step": 57
},
{
"epoch": 1.026504693539481,
"grad_norm": 0.7661664733655509,
"learning_rate": 7.723494994576818e-05,
"loss": 0.4224,
"step": 58
},
{
"epoch": 1.0441744892324683,
"grad_norm": 0.6984964000427009,
"learning_rate": 7.704987826441235e-05,
"loss": 0.423,
"step": 59
},
{
"epoch": 1.0618442849254555,
"grad_norm": 0.543414847614615,
"learning_rate": 7.685904847481631e-05,
"loss": 0.4214,
"step": 60
},
{
"epoch": 1.079514080618443,
"grad_norm": 0.6586728820833049,
"learning_rate": 7.666249023479905e-05,
"loss": 0.4232,
"step": 61
},
{
"epoch": 1.09718387631143,
"grad_norm": 0.48664555730901193,
"learning_rate": 7.646023409246694e-05,
"loss": 0.4184,
"step": 62
},
{
"epoch": 1.1148536720044175,
"grad_norm": 0.45691517952976585,
"learning_rate": 7.625231148146601e-05,
"loss": 0.4116,
"step": 63
},
{
"epoch": 1.1325234676974048,
"grad_norm": 0.44580349839113315,
"learning_rate": 7.603875471609677e-05,
"loss": 0.4148,
"step": 64
},
{
"epoch": 1.150193263390392,
"grad_norm": 0.38551843328636426,
"learning_rate": 7.581959698629204e-05,
"loss": 0.4081,
"step": 65
},
{
"epoch": 1.1678630590833794,
"grad_norm": 0.4359324964542397,
"learning_rate": 7.559487235245875e-05,
"loss": 0.4151,
"step": 66
},
{
"epoch": 1.1855328547763666,
"grad_norm": 0.41530730049997977,
"learning_rate": 7.536461574018439e-05,
"loss": 0.4116,
"step": 67
},
{
"epoch": 1.203202650469354,
"grad_norm": 0.38292190540156723,
"learning_rate": 7.512886293480914e-05,
"loss": 0.4099,
"step": 68
},
{
"epoch": 1.2208724461623413,
"grad_norm": 0.4050268821480764,
"learning_rate": 7.488765057586422e-05,
"loss": 0.4059,
"step": 69
},
{
"epoch": 1.2385422418553285,
"grad_norm": 0.573303636137278,
"learning_rate": 7.464101615137756e-05,
"loss": 0.4137,
"step": 70
},
{
"epoch": 1.256212037548316,
"grad_norm": 0.6806327106102632,
"learning_rate": 7.438899799204762e-05,
"loss": 0.412,
"step": 71
},
{
"epoch": 1.273881833241303,
"grad_norm": 0.5165313145422143,
"learning_rate": 7.413163526528623e-05,
"loss": 0.4078,
"step": 72
},
{
"epoch": 1.2915516289342905,
"grad_norm": 0.5532520513811989,
"learning_rate": 7.386896796913137e-05,
"loss": 0.4026,
"step": 73
},
{
"epoch": 1.3092214246272778,
"grad_norm": 0.6170140080440525,
"learning_rate": 7.360103692603087e-05,
"loss": 0.4025,
"step": 74
},
{
"epoch": 1.326891220320265,
"grad_norm": 0.42873518437379604,
"learning_rate": 7.332788377649796e-05,
"loss": 0.4052,
"step": 75
},
{
"epoch": 1.3445610160132524,
"grad_norm": 0.449652816843054,
"learning_rate": 7.30495509726398e-05,
"loss": 0.4089,
"step": 76
},
{
"epoch": 1.3622308117062396,
"grad_norm": 0.4609411200515682,
"learning_rate": 7.276608177155968e-05,
"loss": 0.409,
"step": 77
},
{
"epoch": 1.379900607399227,
"grad_norm": 0.37783779994964356,
"learning_rate": 7.247752022863428e-05,
"loss": 0.411,
"step": 78
},
{
"epoch": 1.3975704030922143,
"grad_norm": 0.47167800258784437,
"learning_rate": 7.218391119066674e-05,
"loss": 0.4006,
"step": 79
},
{
"epoch": 1.4152401987852015,
"grad_norm": 0.5209290267234993,
"learning_rate": 7.188530028891691e-05,
"loss": 0.3961,
"step": 80
},
{
"epoch": 1.432909994478189,
"grad_norm": 0.35540053272228744,
"learning_rate": 7.158173393200942e-05,
"loss": 0.3999,
"step": 81
},
{
"epoch": 1.450579790171176,
"grad_norm": 0.2822473247891192,
"learning_rate": 7.12732592987212e-05,
"loss": 0.4029,
"step": 82
},
{
"epoch": 1.4682495858641635,
"grad_norm": 0.44251535710813034,
"learning_rate": 7.09599243306491e-05,
"loss": 0.411,
"step": 83
},
{
"epoch": 1.4859193815571508,
"grad_norm": 0.46257597741002365,
"learning_rate": 7.064177772475912e-05,
"loss": 0.3997,
"step": 84
},
{
"epoch": 1.503589177250138,
"grad_norm": 0.3804354174312414,
"learning_rate": 7.031886892581813e-05,
"loss": 0.3984,
"step": 85
},
{
"epoch": 1.5212589729431254,
"grad_norm": 0.2667610882166938,
"learning_rate": 6.999124811870938e-05,
"loss": 0.3986,
"step": 86
},
{
"epoch": 1.5389287686361126,
"grad_norm": 0.30942594899982945,
"learning_rate": 6.965896622063307e-05,
"loss": 0.4055,
"step": 87
},
{
"epoch": 1.5565985643291,
"grad_norm": 0.3550745842897038,
"learning_rate": 6.932207487319305e-05,
"loss": 0.408,
"step": 88
},
{
"epoch": 1.5742683600220873,
"grad_norm": 0.3569377139350627,
"learning_rate": 6.898062643437091e-05,
"loss": 0.3961,
"step": 89
},
{
"epoch": 1.5919381557150745,
"grad_norm": 0.33698677021351336,
"learning_rate": 6.863467397038874e-05,
"loss": 0.3927,
"step": 90
},
{
"epoch": 1.609607951408062,
"grad_norm": 0.4283132299621792,
"learning_rate": 6.828427124746191e-05,
"loss": 0.3962,
"step": 91
},
{
"epoch": 1.627277747101049,
"grad_norm": 0.4800585612046361,
"learning_rate": 6.792947272344292e-05,
"loss": 0.4024,
"step": 92
},
{
"epoch": 1.6449475427940365,
"grad_norm": 0.4106952334802103,
"learning_rate": 6.757033353935788e-05,
"loss": 0.3983,
"step": 93
},
{
"epoch": 1.6626173384870238,
"grad_norm": 0.44863543828572716,
"learning_rate": 6.720690951083678e-05,
"loss": 0.3983,
"step": 94
},
{
"epoch": 1.680287134180011,
"grad_norm": 0.5972683621698959,
"learning_rate": 6.68392571194388e-05,
"loss": 0.3952,
"step": 95
},
{
"epoch": 1.6979569298729982,
"grad_norm": 0.6864704632849609,
"learning_rate": 6.646743350387438e-05,
"loss": 0.4052,
"step": 96
},
{
"epoch": 1.7156267255659856,
"grad_norm": 0.5997402895023315,
"learning_rate": 6.609149645112485e-05,
"loss": 0.3977,
"step": 97
},
{
"epoch": 1.733296521258973,
"grad_norm": 0.42336994036744263,
"learning_rate": 6.571150438746157e-05,
"loss": 0.3985,
"step": 98
},
{
"epoch": 1.7509663169519603,
"grad_norm": 0.416677515975007,
"learning_rate": 6.532751636936561e-05,
"loss": 0.4043,
"step": 99
},
{
"epoch": 1.7686361126449475,
"grad_norm": 0.5095146036807348,
"learning_rate": 6.493959207434934e-05,
"loss": 0.3931,
"step": 100
},
{
"epoch": 1.7863059083379347,
"grad_norm": 0.45946367582524933,
"learning_rate": 6.45477917916819e-05,
"loss": 0.3972,
"step": 101
},
{
"epoch": 1.8039757040309221,
"grad_norm": 0.3713407116922932,
"learning_rate": 6.41521764130191e-05,
"loss": 0.4044,
"step": 102
},
{
"epoch": 1.8216454997239095,
"grad_norm": 0.40017896468093417,
"learning_rate": 6.375280742294007e-05,
"loss": 0.398,
"step": 103
},
{
"epoch": 1.8393152954168968,
"grad_norm": 0.5022523013468211,
"learning_rate": 6.334974688939161e-05,
"loss": 0.3963,
"step": 104
},
{
"epoch": 1.856985091109884,
"grad_norm": 0.3934847620868829,
"learning_rate": 6.294305745404185e-05,
"loss": 0.3884,
"step": 105
},
{
"epoch": 1.8746548868028712,
"grad_norm": 0.3453314877586019,
"learning_rate": 6.253280232254489e-05,
"loss": 0.3899,
"step": 106
},
{
"epoch": 1.8923246824958586,
"grad_norm": 0.4479654044472846,
"learning_rate": 6.211904525471758e-05,
"loss": 0.3938,
"step": 107
},
{
"epoch": 1.909994478188846,
"grad_norm": 0.39352474590110836,
"learning_rate": 6.170185055463039e-05,
"loss": 0.3915,
"step": 108
},
{
"epoch": 1.9276642738818333,
"grad_norm": 0.2714956264200266,
"learning_rate": 6.128128306061347e-05,
"loss": 0.3899,
"step": 109
},
{
"epoch": 1.9453340695748205,
"grad_norm": 0.31091747939470954,
"learning_rate": 6.0857408135179926e-05,
"loss": 0.3893,
"step": 110
},
{
"epoch": 1.9630038652678077,
"grad_norm": 0.3575276621136432,
"learning_rate": 6.0430291654867435e-05,
"loss": 0.3913,
"step": 111
},
{
"epoch": 1.9806736609607951,
"grad_norm": 0.2508689498362062,
"learning_rate": 6.000000000000001e-05,
"loss": 0.3972,
"step": 112
},
{
"epoch": 1.9983434566537825,
"grad_norm": 0.5605963723377234,
"learning_rate": 5.9566600044371584e-05,
"loss": 0.5878,
"step": 113
},
{
"epoch": 2.017669795692987,
"grad_norm": 0.7102289660217768,
"learning_rate": 5.913015914485274e-05,
"loss": 0.3661,
"step": 114
},
{
"epoch": 2.0353395913859744,
"grad_norm": 0.3944445954033844,
"learning_rate": 5.869074513092249e-05,
"loss": 0.373,
"step": 115
},
{
"epoch": 2.053009387078962,
"grad_norm": 0.48382052666433606,
"learning_rate": 5.824842629412653e-05,
"loss": 0.3739,
"step": 116
},
{
"epoch": 2.0706791827719493,
"grad_norm": 0.5806563133800278,
"learning_rate": 5.7803271377463695e-05,
"loss": 0.3672,
"step": 117
},
{
"epoch": 2.0883489784649365,
"grad_norm": 0.6502257019855456,
"learning_rate": 5.735534956470233e-05,
"loss": 0.3644,
"step": 118
},
{
"epoch": 2.1060187741579237,
"grad_norm": 0.8472824581196564,
"learning_rate": 5.6904730469627985e-05,
"loss": 0.3709,
"step": 119
},
{
"epoch": 2.123688569850911,
"grad_norm": 0.6853246906791319,
"learning_rate": 5.645148412522447e-05,
"loss": 0.3645,
"step": 120
},
{
"epoch": 2.1413583655438986,
"grad_norm": 0.4107921656466385,
"learning_rate": 5.5995680972789634e-05,
"loss": 0.3662,
"step": 121
},
{
"epoch": 2.159028161236886,
"grad_norm": 0.37871427512746525,
"learning_rate": 5.5537391850987795e-05,
"loss": 0.3614,
"step": 122
},
{
"epoch": 2.176697956929873,
"grad_norm": 0.5083596297218169,
"learning_rate": 5.507668798484021e-05,
"loss": 0.3645,
"step": 123
},
{
"epoch": 2.19436775262286,
"grad_norm": 0.4631791629644736,
"learning_rate": 5.461364097465581e-05,
"loss": 0.3651,
"step": 124
},
{
"epoch": 2.2120375483158474,
"grad_norm": 0.3234371144780778,
"learning_rate": 5.414832278490326e-05,
"loss": 0.3604,
"step": 125
},
{
"epoch": 2.229707344008835,
"grad_norm": 0.352276909118395,
"learning_rate": 5.368080573302676e-05,
"loss": 0.3662,
"step": 126
},
{
"epoch": 2.2473771397018223,
"grad_norm": 0.41914234346119467,
"learning_rate": 5.321116247820669e-05,
"loss": 0.3603,
"step": 127
},
{
"epoch": 2.2650469353948095,
"grad_norm": 0.28573640352352386,
"learning_rate": 5.2739466010067385e-05,
"loss": 0.3556,
"step": 128
},
{
"epoch": 2.2827167310877967,
"grad_norm": 0.25492365093010394,
"learning_rate": 5.226578963733338e-05,
"loss": 0.363,
"step": 129
},
{
"epoch": 2.300386526780784,
"grad_norm": 0.2927182277484276,
"learning_rate": 5.179020697643618e-05,
"loss": 0.3636,
"step": 130
},
{
"epoch": 2.3180563224737716,
"grad_norm": 0.2580774273705453,
"learning_rate": 5.13127919400731e-05,
"loss": 0.3632,
"step": 131
},
{
"epoch": 2.335726118166759,
"grad_norm": 0.22474459932551596,
"learning_rate": 5.0833618725720214e-05,
"loss": 0.3614,
"step": 132
},
{
"epoch": 2.353395913859746,
"grad_norm": 0.25586289471702456,
"learning_rate": 5.0352761804100835e-05,
"loss": 0.36,
"step": 133
},
{
"epoch": 2.371065709552733,
"grad_norm": 0.24042001746342648,
"learning_rate": 4.987029590761174e-05,
"loss": 0.3667,
"step": 134
},
{
"epoch": 2.3887355052457204,
"grad_norm": 0.24039897426531276,
"learning_rate": 4.9386296018708614e-05,
"loss": 0.3673,
"step": 135
},
{
"epoch": 2.406405300938708,
"grad_norm": 0.27314874775426934,
"learning_rate": 4.890083735825258e-05,
"loss": 0.3619,
"step": 136
},
{
"epoch": 2.4240750966316953,
"grad_norm": 0.23246378924197766,
"learning_rate": 4.841399537381984e-05,
"loss": 0.3623,
"step": 137
},
{
"epoch": 2.4417448923246825,
"grad_norm": 0.191508019981753,
"learning_rate": 4.792584572797591e-05,
"loss": 0.3633,
"step": 138
},
{
"epoch": 2.4594146880176697,
"grad_norm": 0.22416894070833618,
"learning_rate": 4.743646428651659e-05,
"loss": 0.3584,
"step": 139
},
{
"epoch": 2.477084483710657,
"grad_norm": 0.20230146319655629,
"learning_rate": 4.694592710667723e-05,
"loss": 0.3615,
"step": 140
},
{
"epoch": 2.4947542794036446,
"grad_norm": 0.1822985257854695,
"learning_rate": 4.645431042531227e-05,
"loss": 0.363,
"step": 141
},
{
"epoch": 2.512424075096632,
"grad_norm": 0.20736102263198034,
"learning_rate": 4.5961690647046974e-05,
"loss": 0.3586,
"step": 142
},
{
"epoch": 2.530093870789619,
"grad_norm": 0.17577697599602832,
"learning_rate": 4.546814433240294e-05,
"loss": 0.3598,
"step": 143
},
{
"epoch": 2.547763666482606,
"grad_norm": 0.20267553551988982,
"learning_rate": 4.4973748185899416e-05,
"loss": 0.3595,
"step": 144
},
{
"epoch": 2.5654334621755934,
"grad_norm": 0.16672091257518545,
"learning_rate": 4.4478579044132314e-05,
"loss": 0.3591,
"step": 145
},
{
"epoch": 2.583103257868581,
"grad_norm": 0.18713978691301508,
"learning_rate": 4.398271386383267e-05,
"loss": 0.3588,
"step": 146
},
{
"epoch": 2.6007730535615683,
"grad_norm": 0.1544729610927051,
"learning_rate": 4.348622970990634e-05,
"loss": 0.3535,
"step": 147
},
{
"epoch": 2.6184428492545555,
"grad_norm": 0.16023976528470063,
"learning_rate": 4.298920374345698e-05,
"loss": 0.3624,
"step": 148
},
{
"epoch": 2.6361126449475427,
"grad_norm": 0.1952298963833661,
"learning_rate": 4.249171320979409e-05,
"loss": 0.3592,
"step": 149
},
{
"epoch": 2.65378244064053,
"grad_norm": 0.17353667541371376,
"learning_rate": 4.199383542642789e-05,
"loss": 0.3655,
"step": 150
},
{
"epoch": 2.6714522363335176,
"grad_norm": 0.194516804675962,
"learning_rate": 4.149564777105304e-05,
"loss": 0.3565,
"step": 151
},
{
"epoch": 2.689122032026505,
"grad_norm": 0.1758217981986949,
"learning_rate": 4.0997227669522924e-05,
"loss": 0.3666,
"step": 152
},
{
"epoch": 2.706791827719492,
"grad_norm": 0.16067174619876137,
"learning_rate": 4.0498652583816606e-05,
"loss": 0.3592,
"step": 153
},
{
"epoch": 2.724461623412479,
"grad_norm": 0.14593999093364027,
"learning_rate": 4e-05,
"loss": 0.3561,
"step": 154
},
{
"epoch": 2.7421314191054664,
"grad_norm": 0.1603549305082767,
"learning_rate": 3.95013474161834e-05,
"loss": 0.3588,
"step": 155
},
{
"epoch": 2.759801214798454,
"grad_norm": 0.11543165975506586,
"learning_rate": 3.9002772330477096e-05,
"loss": 0.3613,
"step": 156
},
{
"epoch": 2.7774710104914413,
"grad_norm": 0.16445493416381396,
"learning_rate": 3.850435222894698e-05,
"loss": 0.3607,
"step": 157
},
{
"epoch": 2.7951408061844285,
"grad_norm": 0.1414422759760679,
"learning_rate": 3.800616457357211e-05,
"loss": 0.36,
"step": 158
},
{
"epoch": 2.8128106018774157,
"grad_norm": 0.13046409193282807,
"learning_rate": 3.7508286790205916e-05,
"loss": 0.35,
"step": 159
},
{
"epoch": 2.830480397570403,
"grad_norm": 0.1596088175716807,
"learning_rate": 3.7010796256543034e-05,
"loss": 0.3639,
"step": 160
},
{
"epoch": 2.8481501932633906,
"grad_norm": 0.1263702590709236,
"learning_rate": 3.6513770290093674e-05,
"loss": 0.3592,
"step": 161
},
{
"epoch": 2.865819988956378,
"grad_norm": 0.14761623625344572,
"learning_rate": 3.601728613616734e-05,
"loss": 0.3609,
"step": 162
},
{
"epoch": 2.883489784649365,
"grad_norm": 0.1457836294562342,
"learning_rate": 3.552142095586769e-05,
"loss": 0.3515,
"step": 163
},
{
"epoch": 2.901159580342352,
"grad_norm": 0.11727253956988348,
"learning_rate": 3.5026251814100604e-05,
"loss": 0.3611,
"step": 164
},
{
"epoch": 2.9188293760353394,
"grad_norm": 0.13457982400211851,
"learning_rate": 3.453185566759707e-05,
"loss": 0.3536,
"step": 165
},
{
"epoch": 2.936499171728327,
"grad_norm": 0.13998361556088473,
"learning_rate": 3.403830935295302e-05,
"loss": 0.3608,
"step": 166
},
{
"epoch": 2.9541689674213143,
"grad_norm": 0.14067124850466778,
"learning_rate": 3.3545689574687734e-05,
"loss": 0.3706,
"step": 167
},
{
"epoch": 2.9718387631143015,
"grad_norm": 0.1318011050837946,
"learning_rate": 3.305407289332279e-05,
"loss": 0.3544,
"step": 168
},
{
"epoch": 2.9895085588072887,
"grad_norm": 0.14546432839266002,
"learning_rate": 3.256353571348342e-05,
"loss": 0.3709,
"step": 169
},
{
"epoch": 3.008834897846494,
"grad_norm": 0.19583526414298022,
"learning_rate": 3.207415427202411e-05,
"loss": 0.527,
"step": 170
},
{
"epoch": 3.026504693539481,
"grad_norm": 0.1877498962594719,
"learning_rate": 3.1586004626180175e-05,
"loss": 0.3322,
"step": 171
},
{
"epoch": 3.0441744892324683,
"grad_norm": 0.18186432632351485,
"learning_rate": 3.109916264174743e-05,
"loss": 0.3366,
"step": 172
},
{
"epoch": 3.0618442849254555,
"grad_norm": 0.19885800612643872,
"learning_rate": 3.0613703981291406e-05,
"loss": 0.3324,
"step": 173
},
{
"epoch": 3.0795140806184427,
"grad_norm": 0.2089734413211629,
"learning_rate": 3.0129704092388253e-05,
"loss": 0.3339,
"step": 174
},
{
"epoch": 3.0971838763114303,
"grad_norm": 0.17173171928937206,
"learning_rate": 2.9647238195899168e-05,
"loss": 0.3335,
"step": 175
},
{
"epoch": 3.1148536720044175,
"grad_norm": 0.2123048979948462,
"learning_rate": 2.9166381274279803e-05,
"loss": 0.3333,
"step": 176
},
{
"epoch": 3.1325234676974048,
"grad_norm": 0.16806112420590733,
"learning_rate": 2.8687208059926904e-05,
"loss": 0.3353,
"step": 177
},
{
"epoch": 3.150193263390392,
"grad_norm": 0.17559998407873006,
"learning_rate": 2.8209793023563833e-05,
"loss": 0.3304,
"step": 178
},
{
"epoch": 3.167863059083379,
"grad_norm": 0.1478401374423511,
"learning_rate": 2.7734210362666637e-05,
"loss": 0.3301,
"step": 179
},
{
"epoch": 3.185532854776367,
"grad_norm": 0.15148759846687243,
"learning_rate": 2.7260533989932628e-05,
"loss": 0.3332,
"step": 180
},
{
"epoch": 3.203202650469354,
"grad_norm": 0.13080567876743235,
"learning_rate": 2.678883752179333e-05,
"loss": 0.3296,
"step": 181
},
{
"epoch": 3.2208724461623413,
"grad_norm": 0.14032258509798645,
"learning_rate": 2.6319194266973256e-05,
"loss": 0.3272,
"step": 182
},
{
"epoch": 3.2385422418553285,
"grad_norm": 0.1162785431430188,
"learning_rate": 2.5851677215096745e-05,
"loss": 0.3316,
"step": 183
},
{
"epoch": 3.2562120375483157,
"grad_norm": 0.13212366326804822,
"learning_rate": 2.53863590253442e-05,
"loss": 0.3357,
"step": 184
},
{
"epoch": 3.2738818332413033,
"grad_norm": 0.11293978492936926,
"learning_rate": 2.4923312015159794e-05,
"loss": 0.3301,
"step": 185
},
{
"epoch": 3.2915516289342905,
"grad_norm": 0.11627210040478692,
"learning_rate": 2.4462608149012215e-05,
"loss": 0.3372,
"step": 186
},
{
"epoch": 3.3092214246272778,
"grad_norm": 0.1116155121257842,
"learning_rate": 2.400431902721037e-05,
"loss": 0.332,
"step": 187
},
{
"epoch": 3.326891220320265,
"grad_norm": 0.10499190887903717,
"learning_rate": 2.3548515874775547e-05,
"loss": 0.3258,
"step": 188
},
{
"epoch": 3.344561016013252,
"grad_norm": 0.11985499625363151,
"learning_rate": 2.3095269530372032e-05,
"loss": 0.3356,
"step": 189
},
{
"epoch": 3.36223081170624,
"grad_norm": 0.10946277088364416,
"learning_rate": 2.264465043529768e-05,
"loss": 0.3339,
"step": 190
},
{
"epoch": 3.379900607399227,
"grad_norm": 0.10946281010542962,
"learning_rate": 2.2196728622536304e-05,
"loss": 0.3324,
"step": 191
},
{
"epoch": 3.3975704030922143,
"grad_norm": 0.10470866039213844,
"learning_rate": 2.175157370587348e-05,
"loss": 0.3333,
"step": 192
},
{
"epoch": 3.4152401987852015,
"grad_norm": 0.11496566123138528,
"learning_rate": 2.130925486907752e-05,
"loss": 0.3299,
"step": 193
},
{
"epoch": 3.4329099944781887,
"grad_norm": 0.10141893666492717,
"learning_rate": 2.0869840855147286e-05,
"loss": 0.3415,
"step": 194
},
{
"epoch": 3.4505797901711763,
"grad_norm": 0.11362446781863374,
"learning_rate": 2.0433399955628443e-05,
"loss": 0.3325,
"step": 195
},
{
"epoch": 3.4682495858641635,
"grad_norm": 0.09706953644786295,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.3385,
"step": 196
},
{
"epoch": 3.4859193815571508,
"grad_norm": 0.10343474750084429,
"learning_rate": 1.956970834513259e-05,
"loss": 0.3324,
"step": 197
},
{
"epoch": 3.503589177250138,
"grad_norm": 0.1115997799981753,
"learning_rate": 1.914259186482008e-05,
"loss": 0.3304,
"step": 198
},
{
"epoch": 3.5212589729431256,
"grad_norm": 0.100665947226005,
"learning_rate": 1.8718716939386543e-05,
"loss": 0.341,
"step": 199
},
{
"epoch": 3.5389287686361124,
"grad_norm": 0.1112389145071717,
"learning_rate": 1.829814944536963e-05,
"loss": 0.3311,
"step": 200
},
{
"epoch": 3.5565985643291,
"grad_norm": 0.10508913738753005,
"learning_rate": 1.7880954745282425e-05,
"loss": 0.3262,
"step": 201
},
{
"epoch": 3.5742683600220873,
"grad_norm": 0.10715189168423515,
"learning_rate": 1.7467197677455118e-05,
"loss": 0.3387,
"step": 202
},
{
"epoch": 3.5919381557150745,
"grad_norm": 0.12309528662432599,
"learning_rate": 1.7056942545958167e-05,
"loss": 0.3272,
"step": 203
},
{
"epoch": 3.609607951408062,
"grad_norm": 0.09503224614910906,
"learning_rate": 1.6650253110608415e-05,
"loss": 0.3361,
"step": 204
},
{
"epoch": 3.627277747101049,
"grad_norm": 0.10336176601010666,
"learning_rate": 1.6247192577059943e-05,
"loss": 0.3394,
"step": 205
},
{
"epoch": 3.6449475427940365,
"grad_norm": 0.10325570712107657,
"learning_rate": 1.5847823586980897e-05,
"loss": 0.3329,
"step": 206
},
{
"epoch": 3.6626173384870238,
"grad_norm": 0.09392755935325038,
"learning_rate": 1.545220820831811e-05,
"loss": 0.3273,
"step": 207
},
{
"epoch": 3.680287134180011,
"grad_norm": 0.09365769242973894,
"learning_rate": 1.5060407925650662e-05,
"loss": 0.3366,
"step": 208
},
{
"epoch": 3.697956929872998,
"grad_norm": 0.11814878497030305,
"learning_rate": 1.4672483630634414e-05,
"loss": 0.3365,
"step": 209
},
{
"epoch": 3.7156267255659854,
"grad_norm": 0.08867405697122034,
"learning_rate": 1.4288495612538427e-05,
"loss": 0.3344,
"step": 210
},
{
"epoch": 3.733296521258973,
"grad_norm": 0.09623647972155307,
"learning_rate": 1.3908503548875167e-05,
"loss": 0.334,
"step": 211
},
{
"epoch": 3.7509663169519603,
"grad_norm": 0.10214752499383144,
"learning_rate": 1.3532566496125634e-05,
"loss": 0.3319,
"step": 212
},
{
"epoch": 3.7686361126449475,
"grad_norm": 0.09305403874654943,
"learning_rate": 1.3160742880561204e-05,
"loss": 0.3327,
"step": 213
},
{
"epoch": 3.7863059083379347,
"grad_norm": 0.09492686150258188,
"learning_rate": 1.2793090489163218e-05,
"loss": 0.3276,
"step": 214
},
{
"epoch": 3.803975704030922,
"grad_norm": 0.08712346665776856,
"learning_rate": 1.242966646064212e-05,
"loss": 0.3378,
"step": 215
},
{
"epoch": 3.8216454997239095,
"grad_norm": 0.09302902241662848,
"learning_rate": 1.2070527276557092e-05,
"loss": 0.3276,
"step": 216
},
{
"epoch": 3.8393152954168968,
"grad_norm": 0.09984664869036139,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.335,
"step": 217
},
{
"epoch": 3.856985091109884,
"grad_norm": 0.08136111333958188,
"learning_rate": 1.1365326029611263e-05,
"loss": 0.325,
"step": 218
},
{
"epoch": 3.874654886802871,
"grad_norm": 0.09067392795608281,
"learning_rate": 1.1019373565629094e-05,
"loss": 0.3326,
"step": 219
},
{
"epoch": 3.8923246824958584,
"grad_norm": 0.1151467415131904,
"learning_rate": 1.0677925126806956e-05,
"loss": 0.3338,
"step": 220
},
{
"epoch": 3.909994478188846,
"grad_norm": 0.08540355002710472,
"learning_rate": 1.0341033779366931e-05,
"loss": 0.3281,
"step": 221
},
{
"epoch": 3.9276642738818333,
"grad_norm": 0.08890862244357016,
"learning_rate": 1.0008751881290628e-05,
"loss": 0.3279,
"step": 222
},
{
"epoch": 3.9453340695748205,
"grad_norm": 0.08886465646079035,
"learning_rate": 9.681131074181876e-06,
"loss": 0.3331,
"step": 223
},
{
"epoch": 3.9630038652678077,
"grad_norm": 0.08382860075616222,
"learning_rate": 9.358222275240884e-06,
"loss": 0.3301,
"step": 224
},
{
"epoch": 3.980673660960795,
"grad_norm": 0.08493360832822018,
"learning_rate": 9.040075669350905e-06,
"loss": 0.3321,
"step": 225
},
{
"epoch": 3.9983434566537825,
"grad_norm": 0.12681148128353786,
"learning_rate": 8.72674070127881e-06,
"loss": 0.4956,
"step": 226
},
{
"epoch": 4.017669795692988,
"grad_norm": 0.11963433162616376,
"learning_rate": 8.418266067990588e-06,
"loss": 0.3171,
"step": 227
},
{
"epoch": 4.035339591385974,
"grad_norm": 0.11337795603837922,
"learning_rate": 8.114699711083113e-06,
"loss": 0.3207,
"step": 228
},
{
"epoch": 4.053009387078962,
"grad_norm": 0.08872533010955261,
"learning_rate": 7.816088809333266e-06,
"loss": 0.3165,
"step": 229
},
{
"epoch": 4.070679182771949,
"grad_norm": 0.09241216784587171,
"learning_rate": 7.52247977136574e-06,
"loss": 0.328,
"step": 230
},
{
"epoch": 4.0883489784649365,
"grad_norm": 0.09282854796647831,
"learning_rate": 7.233918228440324e-06,
"loss": 0.3119,
"step": 231
},
{
"epoch": 4.106018774157924,
"grad_norm": 0.10157632018918776,
"learning_rate": 6.950449027360213e-06,
"loss": 0.3182,
"step": 232
},
{
"epoch": 4.123688569850911,
"grad_norm": 0.10277617383975274,
"learning_rate": 6.6721162235020476e-06,
"loss": 0.319,
"step": 233
},
{
"epoch": 4.141358365543899,
"grad_norm": 0.10056534251093886,
"learning_rate": 6.398963073969144e-06,
"loss": 0.3171,
"step": 234
},
{
"epoch": 4.159028161236885,
"grad_norm": 0.09505805141388292,
"learning_rate": 6.1310320308686354e-06,
"loss": 0.3147,
"step": 235
},
{
"epoch": 4.176697956929873,
"grad_norm": 0.09268654061240998,
"learning_rate": 5.868364734713776e-06,
"loss": 0.3191,
"step": 236
},
{
"epoch": 4.194367752622861,
"grad_norm": 0.08706624460622792,
"learning_rate": 5.611002007952389e-06,
"loss": 0.3208,
"step": 237
},
{
"epoch": 4.212037548315847,
"grad_norm": 0.08951679698719879,
"learning_rate": 5.358983848622452e-06,
"loss": 0.3172,
"step": 238
},
{
"epoch": 4.229707344008835,
"grad_norm": 0.09131830865477111,
"learning_rate": 5.112349424135788e-06,
"loss": 0.3164,
"step": 239
},
{
"epoch": 4.247377139701822,
"grad_norm": 0.08750009357915652,
"learning_rate": 4.871137065190854e-06,
"loss": 0.3106,
"step": 240
},
{
"epoch": 4.2650469353948095,
"grad_norm": 0.0850783896741063,
"learning_rate": 4.635384259815614e-06,
"loss": 0.3169,
"step": 241
},
{
"epoch": 4.282716731087797,
"grad_norm": 0.08475820034880008,
"learning_rate": 4.405127647541259e-06,
"loss": 0.3196,
"step": 242
},
{
"epoch": 4.300386526780784,
"grad_norm": 0.08682717964250186,
"learning_rate": 4.180403013707963e-06,
"loss": 0.3109,
"step": 243
},
{
"epoch": 4.318056322473772,
"grad_norm": 0.08602559336503131,
"learning_rate": 3.961245283903239e-06,
"loss": 0.3118,
"step": 244
},
{
"epoch": 4.335726118166758,
"grad_norm": 0.08629377402093397,
"learning_rate": 3.747688518534003e-06,
"loss": 0.3153,
"step": 245
},
{
"epoch": 4.353395913859746,
"grad_norm": 0.08163055098829672,
"learning_rate": 3.5397659075330748e-06,
"loss": 0.3139,
"step": 246
},
{
"epoch": 4.371065709552734,
"grad_norm": 0.07572785159509317,
"learning_rate": 3.3375097652009526e-06,
"loss": 0.313,
"step": 247
},
{
"epoch": 4.38873550524572,
"grad_norm": 0.08030300979420142,
"learning_rate": 3.140951525183691e-06,
"loss": 0.3154,
"step": 248
},
{
"epoch": 4.406405300938708,
"grad_norm": 0.07708287661350902,
"learning_rate": 2.950121735587654e-06,
"loss": 0.3168,
"step": 249
},
{
"epoch": 4.424075096631695,
"grad_norm": 0.09203966729547491,
"learning_rate": 2.765050054231835e-06,
"loss": 0.314,
"step": 250
},
{
"epoch": 4.4417448923246825,
"grad_norm": 0.08041597052637767,
"learning_rate": 2.5857652440386404e-06,
"loss": 0.3197,
"step": 251
},
{
"epoch": 4.45941468801767,
"grad_norm": 0.08037954632735851,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.3185,
"step": 252
},
{
"epoch": 4.477084483710657,
"grad_norm": 0.07294469295727943,
"learning_rate": 2.244666787665297e-06,
"loss": 0.3198,
"step": 253
},
{
"epoch": 4.494754279403645,
"grad_norm": 0.07372400441204935,
"learning_rate": 2.0829061533147322e-06,
"loss": 0.3125,
"step": 254
},
{
"epoch": 4.512424075096631,
"grad_norm": 0.07696683693078588,
"learning_rate": 1.927038405547106e-06,
"loss": 0.3153,
"step": 255
},
{
"epoch": 4.530093870789619,
"grad_norm": 0.07761898190749474,
"learning_rate": 1.7770877685543687e-06,
"loss": 0.3164,
"step": 256
},
{
"epoch": 4.547763666482607,
"grad_norm": 0.07405297885815232,
"learning_rate": 1.6330775469204895e-06,
"loss": 0.3165,
"step": 257
},
{
"epoch": 4.565433462175593,
"grad_norm": 0.07194679469449752,
"learning_rate": 1.495030121999519e-06,
"loss": 0.3174,
"step": 258
},
{
"epoch": 4.583103257868581,
"grad_norm": 0.07372588870054934,
"learning_rate": 1.3629669484372722e-06,
"loss": 0.3125,
"step": 259
},
{
"epoch": 4.600773053561568,
"grad_norm": 0.0736484855706964,
"learning_rate": 1.2369085508368862e-06,
"loss": 0.3117,
"step": 260
},
{
"epoch": 4.6184428492545555,
"grad_norm": 0.06929860862646713,
"learning_rate": 1.1168745205690202e-06,
"loss": 0.3188,
"step": 261
},
{
"epoch": 4.636112644947543,
"grad_norm": 0.06998294296555901,
"learning_rate": 1.0028835127270553e-06,
"loss": 0.3111,
"step": 262
},
{
"epoch": 4.65378244064053,
"grad_norm": 0.07010415717148563,
"learning_rate": 8.949532432278185e-07,
"loss": 0.3157,
"step": 263
},
{
"epoch": 4.671452236333518,
"grad_norm": 0.07068983473178043,
"learning_rate": 7.93100486058247e-07,
"loss": 0.322,
"step": 264
},
{
"epoch": 4.689122032026504,
"grad_norm": 0.07415661278084447,
"learning_rate": 6.973410706684691e-07,
"loss": 0.3122,
"step": 265
},
{
"epoch": 4.706791827719492,
"grad_norm": 0.07147977962991126,
"learning_rate": 6.076898795116792e-07,
"loss": 0.3162,
"step": 266
},
{
"epoch": 4.72446162341248,
"grad_norm": 0.07009164551218153,
"learning_rate": 5.241608457311565e-07,
"loss": 0.3174,
"step": 267
},
{
"epoch": 4.742131419105466,
"grad_norm": 0.07130206559582739,
"learning_rate": 4.467669509948591e-07,
"loss": 0.3114,
"step": 268
},
{
"epoch": 4.759801214798454,
"grad_norm": 0.06774782729285217,
"learning_rate": 3.7552022347788766e-07,
"loss": 0.3138,
"step": 269
},
{
"epoch": 4.777471010491441,
"grad_norm": 0.07138164733421887,
"learning_rate": 3.104317359931175e-07,
"loss": 0.3209,
"step": 270
},
{
"epoch": 4.7951408061844285,
"grad_norm": 0.06931798887191189,
"learning_rate": 2.5151160427029584e-07,
"loss": 0.3171,
"step": 271
},
{
"epoch": 4.812810601877416,
"grad_norm": 0.07008995274999451,
"learning_rate": 1.9876898538394362e-07,
"loss": 0.311,
"step": 272
},
{
"epoch": 4.830480397570403,
"grad_norm": 0.06835397987857686,
"learning_rate": 1.522120763301782e-07,
"loss": 0.323,
"step": 273
},
{
"epoch": 4.848150193263391,
"grad_norm": 0.06982710466662823,
"learning_rate": 1.1184811275279483e-07,
"loss": 0.3209,
"step": 274
},
{
"epoch": 4.865819988956377,
"grad_norm": 0.06719415454501948,
"learning_rate": 7.76833678187261e-08,
"loss": 0.3125,
"step": 275
},
{
"epoch": 4.883489784649365,
"grad_norm": 0.06985964768247895,
"learning_rate": 4.9723151243106225e-08,
"loss": 0.3192,
"step": 276
},
{
"epoch": 4.901159580342353,
"grad_norm": 0.06760152861997361,
"learning_rate": 2.797180846405567e-08,
"loss": 0.3176,
"step": 277
},
{
"epoch": 4.918829376035339,
"grad_norm": 0.07336231991253046,
"learning_rate": 1.2432719967350182e-08,
"loss": 0.3201,
"step": 278
},
{
"epoch": 4.936499171728327,
"grad_norm": 0.06901399516812774,
"learning_rate": 3.108300761005545e-09,
"loss": 0.3193,
"step": 279
},
{
"epoch": 4.954168967421314,
"grad_norm": 0.06958592985440362,
"learning_rate": 0.0,
"loss": 0.3178,
"step": 280
},
{
"epoch": 4.954168967421314,
"step": 280,
"total_flos": 7.445251410192499e+18,
"train_loss": 0.3906520079289164,
"train_runtime": 65492.582,
"train_samples_per_second": 2.212,
"train_steps_per_second": 0.004
}
],
"logging_steps": 1,
"max_steps": 280,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.445251410192499e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}