{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.997289972899729,
"eval_steps": 500,
"global_step": 276,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0036133694670280035,
"grad_norm": 0.3959366977214813,
"learning_rate": 6.25e-06,
"loss": 0.9323,
"step": 1
},
{
"epoch": 0.007226738934056007,
"grad_norm": 0.45551198720932007,
"learning_rate": 1.25e-05,
"loss": 1.0507,
"step": 2
},
{
"epoch": 0.01084010840108401,
"grad_norm": 0.2823091745376587,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.8491,
"step": 3
},
{
"epoch": 0.014453477868112014,
"grad_norm": 0.46047303080558777,
"learning_rate": 2.5e-05,
"loss": 1.0142,
"step": 4
},
{
"epoch": 0.018066847335140017,
"grad_norm": 0.4086349606513977,
"learning_rate": 3.125e-05,
"loss": 0.947,
"step": 5
},
{
"epoch": 0.02168021680216802,
"grad_norm": 0.457003116607666,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.9485,
"step": 6
},
{
"epoch": 0.025293586269196026,
"grad_norm": 0.35562458634376526,
"learning_rate": 4.375e-05,
"loss": 0.8449,
"step": 7
},
{
"epoch": 0.028906955736224028,
"grad_norm": 0.33805516362190247,
"learning_rate": 5e-05,
"loss": 0.7379,
"step": 8
},
{
"epoch": 0.032520325203252036,
"grad_norm": 0.3412623703479767,
"learning_rate": 4.9998282347929784e-05,
"loss": 0.6282,
"step": 9
},
{
"epoch": 0.036133694670280034,
"grad_norm": 0.2843680679798126,
"learning_rate": 4.99931296277454e-05,
"loss": 0.5503,
"step": 10
},
{
"epoch": 0.03974706413730804,
"grad_norm": 0.17628777027130127,
"learning_rate": 4.998454254749331e-05,
"loss": 0.512,
"step": 11
},
{
"epoch": 0.04336043360433604,
"grad_norm": 0.19055013358592987,
"learning_rate": 4.997252228714279e-05,
"loss": 0.5397,
"step": 12
},
{
"epoch": 0.04697380307136405,
"grad_norm": 0.08906977623701096,
"learning_rate": 4.9957070498423854e-05,
"loss": 0.5458,
"step": 13
},
{
"epoch": 0.05058717253839205,
"grad_norm": 0.0917251780629158,
"learning_rate": 4.993818930460026e-05,
"loss": 0.5269,
"step": 14
},
{
"epoch": 0.05420054200542006,
"grad_norm": 0.0985497236251831,
"learning_rate": 4.9915881300177725e-05,
"loss": 0.4135,
"step": 15
},
{
"epoch": 0.057813911472448055,
"grad_norm": 0.1111132949590683,
"learning_rate": 4.9890149550547454e-05,
"loss": 0.5064,
"step": 16
},
{
"epoch": 0.06142728093947606,
"grad_norm": 0.0649256557226181,
"learning_rate": 4.98609975915649e-05,
"loss": 0.4804,
"step": 17
},
{
"epoch": 0.06504065040650407,
"grad_norm": 0.09687516838312149,
"learning_rate": 4.982842942906386e-05,
"loss": 0.3706,
"step": 18
},
{
"epoch": 0.06865401987353206,
"grad_norm": 0.14679567515850067,
"learning_rate": 4.979244953830608e-05,
"loss": 0.4105,
"step": 19
},
{
"epoch": 0.07226738934056007,
"grad_norm": 0.14155593514442444,
"learning_rate": 4.9753062863366276e-05,
"loss": 0.4886,
"step": 20
},
{
"epoch": 0.07588075880758807,
"grad_norm": 0.14684930443763733,
"learning_rate": 4.971027481645274e-05,
"loss": 0.4044,
"step": 21
},
{
"epoch": 0.07949412827461608,
"grad_norm": 0.11222010105848312,
"learning_rate": 4.966409127716367e-05,
"loss": 0.4361,
"step": 22
},
{
"epoch": 0.08310749774164408,
"grad_norm": 0.058118775486946106,
"learning_rate": 4.96145185916792e-05,
"loss": 0.4176,
"step": 23
},
{
"epoch": 0.08672086720867209,
"grad_norm": 0.06764644384384155,
"learning_rate": 4.95615635718894e-05,
"loss": 0.4683,
"step": 24
},
{
"epoch": 0.09033423667570009,
"grad_norm": 0.06886276602745056,
"learning_rate": 4.950523349445824e-05,
"loss": 0.418,
"step": 25
},
{
"epoch": 0.0939476061427281,
"grad_norm": 0.0706636980175972,
"learning_rate": 4.944553609982363e-05,
"loss": 0.3967,
"step": 26
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.04914792627096176,
"learning_rate": 4.938247959113386e-05,
"loss": 0.4623,
"step": 27
},
{
"epoch": 0.1011743450767841,
"grad_norm": 0.05717244744300842,
"learning_rate": 4.931607263312032e-05,
"loss": 0.4047,
"step": 28
},
{
"epoch": 0.10478771454381211,
"grad_norm": 0.05677526444196701,
"learning_rate": 4.924632435090696e-05,
"loss": 0.4251,
"step": 29
},
{
"epoch": 0.10840108401084012,
"grad_norm": 0.051282044500112534,
"learning_rate": 4.917324432875627e-05,
"loss": 0.4101,
"step": 30
},
{
"epoch": 0.1120144534778681,
"grad_norm": 0.05558260530233383,
"learning_rate": 4.909684260875235e-05,
"loss": 0.4425,
"step": 31
},
{
"epoch": 0.11562782294489611,
"grad_norm": 0.05362090840935707,
"learning_rate": 4.9017129689421e-05,
"loss": 0.383,
"step": 32
},
{
"epoch": 0.11924119241192412,
"grad_norm": 0.050591859966516495,
"learning_rate": 4.893411652428712e-05,
"loss": 0.3988,
"step": 33
},
{
"epoch": 0.12285456187895212,
"grad_norm": 0.07354583591222763,
"learning_rate": 4.8847814520369475e-05,
"loss": 0.473,
"step": 34
},
{
"epoch": 0.12646793134598014,
"grad_norm": 0.07448670268058777,
"learning_rate": 4.875823553661334e-05,
"loss": 0.3609,
"step": 35
},
{
"epoch": 0.13008130081300814,
"grad_norm": 0.09399361908435822,
"learning_rate": 4.8665391882260856e-05,
"loss": 0.3927,
"step": 36
},
{
"epoch": 0.13369467028003612,
"grad_norm": 0.061091382056474686,
"learning_rate": 4.856929631515964e-05,
"loss": 0.4512,
"step": 37
},
{
"epoch": 0.13730803974706413,
"grad_norm": 0.06277038156986237,
"learning_rate": 4.846996204000967e-05,
"loss": 0.3961,
"step": 38
},
{
"epoch": 0.14092140921409213,
"grad_norm": 0.05277445912361145,
"learning_rate": 4.8367402706548805e-05,
"loss": 0.3885,
"step": 39
},
{
"epoch": 0.14453477868112014,
"grad_norm": 0.06335710734128952,
"learning_rate": 4.8261632407677174e-05,
"loss": 0.4663,
"step": 40
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.05149435997009277,
"learning_rate": 4.815266567752059e-05,
"loss": 0.4012,
"step": 41
},
{
"epoch": 0.15176151761517614,
"grad_norm": 0.052154790610075,
"learning_rate": 4.804051748943343e-05,
"loss": 0.377,
"step": 42
},
{
"epoch": 0.15537488708220415,
"grad_norm": 0.06229854002594948,
"learning_rate": 4.792520325394111e-05,
"loss": 0.4677,
"step": 43
},
{
"epoch": 0.15898825654923215,
"grad_norm": 0.050992563366889954,
"learning_rate": 4.780673881662242e-05,
"loss": 0.4271,
"step": 44
},
{
"epoch": 0.16260162601626016,
"grad_norm": 0.057579364627599716,
"learning_rate": 4.7685140455932267e-05,
"loss": 0.4096,
"step": 45
},
{
"epoch": 0.16621499548328816,
"grad_norm": 0.05966678634285927,
"learning_rate": 4.756042488096471e-05,
"loss": 0.4075,
"step": 46
},
{
"epoch": 0.16982836495031617,
"grad_norm": 0.055218473076820374,
"learning_rate": 4.743260922915701e-05,
"loss": 0.459,
"step": 47
},
{
"epoch": 0.17344173441734417,
"grad_norm": 0.05127694830298424,
"learning_rate": 4.730171106393466e-05,
"loss": 0.4086,
"step": 48
},
{
"epoch": 0.17705510388437218,
"grad_norm": 0.06519781798124313,
"learning_rate": 4.716774837229804e-05,
"loss": 0.4418,
"step": 49
},
{
"epoch": 0.18066847335140018,
"grad_norm": 0.05895975977182388,
"learning_rate": 4.7030739562350713e-05,
"loss": 0.4013,
"step": 50
},
{
"epoch": 0.1842818428184282,
"grad_norm": 0.061492159962654114,
"learning_rate": 4.6890703460769955e-05,
"loss": 0.3726,
"step": 51
},
{
"epoch": 0.1878952122854562,
"grad_norm": 0.05051853135228157,
"learning_rate": 4.674765931021976e-05,
"loss": 0.4354,
"step": 52
},
{
"epoch": 0.1915085817524842,
"grad_norm": 0.05664265528321266,
"learning_rate": 4.6601626766706626e-05,
"loss": 0.4137,
"step": 53
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.06020362302660942,
"learning_rate": 4.645262589687861e-05,
"loss": 0.4171,
"step": 54
},
{
"epoch": 0.1987353206865402,
"grad_norm": 0.06303560733795166,
"learning_rate": 4.6300677175267914e-05,
"loss": 0.3724,
"step": 55
},
{
"epoch": 0.2023486901535682,
"grad_norm": 0.06793845444917679,
"learning_rate": 4.614580148147744e-05,
"loss": 0.3711,
"step": 56
},
{
"epoch": 0.20596205962059622,
"grad_norm": 0.07107391953468323,
"learning_rate": 4.598802009731167e-05,
"loss": 0.4428,
"step": 57
},
{
"epoch": 0.20957542908762422,
"grad_norm": 0.06567548215389252,
"learning_rate": 4.582735470385229e-05,
"loss": 0.3774,
"step": 58
},
{
"epoch": 0.21318879855465223,
"grad_norm": 0.05056913569569588,
"learning_rate": 4.5663827378478975e-05,
"loss": 0.3584,
"step": 59
},
{
"epoch": 0.21680216802168023,
"grad_norm": 0.08128344267606735,
"learning_rate": 4.5497460591835615e-05,
"loss": 0.3983,
"step": 60
},
{
"epoch": 0.2204155374887082,
"grad_norm": 0.05856931954622269,
"learning_rate": 4.532827720474268e-05,
"loss": 0.3486,
"step": 61
},
{
"epoch": 0.2240289069557362,
"grad_norm": 0.05503028631210327,
"learning_rate": 4.515630046505575e-05,
"loss": 0.3896,
"step": 62
},
{
"epoch": 0.22764227642276422,
"grad_norm": 0.047534190118312836,
"learning_rate": 4.498155400447107e-05,
"loss": 0.4463,
"step": 63
},
{
"epoch": 0.23125564588979222,
"grad_norm": 0.0638430267572403,
"learning_rate": 4.480406183527823e-05,
"loss": 0.3977,
"step": 64
},
{
"epoch": 0.23486901535682023,
"grad_norm": 0.04974055290222168,
"learning_rate": 4.462384834706058e-05,
"loss": 0.3999,
"step": 65
},
{
"epoch": 0.23848238482384823,
"grad_norm": 0.06309591233730316,
"learning_rate": 4.4440938303343804e-05,
"loss": 0.4275,
"step": 66
},
{
"epoch": 0.24209575429087624,
"grad_norm": 0.05192544683814049,
"learning_rate": 4.425535683819312e-05,
"loss": 0.4096,
"step": 67
},
{
"epoch": 0.24570912375790424,
"grad_norm": 0.057684604078531265,
"learning_rate": 4.406712945275955e-05,
"loss": 0.41,
"step": 68
},
{
"epoch": 0.24932249322493225,
"grad_norm": 0.0514802448451519,
"learning_rate": 4.387628201177577e-05,
"loss": 0.3372,
"step": 69
},
{
"epoch": 0.2529358626919603,
"grad_norm": 0.056559968739748,
"learning_rate": 4.368284074000193e-05,
"loss": 0.3929,
"step": 70
},
{
"epoch": 0.2565492321589883,
"grad_norm": 0.0645717978477478,
"learning_rate": 4.348683221862212e-05,
"loss": 0.4353,
"step": 71
},
{
"epoch": 0.2601626016260163,
"grad_norm": 0.08638172596693039,
"learning_rate": 4.328828338159173e-05,
"loss": 0.3978,
"step": 72
},
{
"epoch": 0.26377597109304424,
"grad_norm": 0.05915065109729767,
"learning_rate": 4.3087221511936434e-05,
"loss": 0.393,
"step": 73
},
{
"epoch": 0.26738934056007224,
"grad_norm": 0.061671093106269836,
"learning_rate": 4.288367423800319e-05,
"loss": 0.4187,
"step": 74
},
{
"epoch": 0.27100271002710025,
"grad_norm": 0.07420554012060165,
"learning_rate": 4.267766952966369e-05,
"loss": 0.3939,
"step": 75
},
{
"epoch": 0.27461607949412825,
"grad_norm": 0.07052630186080933,
"learning_rate": 4.2469235694471043e-05,
"loss": 0.3435,
"step": 76
},
{
"epoch": 0.27822944896115626,
"grad_norm": 0.06885933130979538,
"learning_rate": 4.225840137376993e-05,
"loss": 0.4363,
"step": 77
},
{
"epoch": 0.28184281842818426,
"grad_norm": 0.05735473707318306,
"learning_rate": 4.204519553876095e-05,
"loss": 0.3509,
"step": 78
},
{
"epoch": 0.28545618789521227,
"grad_norm": 0.06102309376001358,
"learning_rate": 4.1829647486519596e-05,
"loss": 0.3369,
"step": 79
},
{
"epoch": 0.28906955736224027,
"grad_norm": 0.06527422368526459,
"learning_rate": 4.161178683597054e-05,
"loss": 0.4052,
"step": 80
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.06578138470649719,
"learning_rate": 4.139164352381758e-05,
"loss": 0.3586,
"step": 81
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.05465536564588547,
"learning_rate": 4.116924780042997e-05,
"loss": 0.3759,
"step": 82
},
{
"epoch": 0.2999096657633243,
"grad_norm": 0.08491545915603638,
"learning_rate": 4.094463022568569e-05,
"loss": 0.3611,
"step": 83
},
{
"epoch": 0.3035230352303523,
"grad_norm": 0.06035340949892998,
"learning_rate": 4.071782166477213e-05,
"loss": 0.3537,
"step": 84
},
{
"epoch": 0.3071364046973803,
"grad_norm": 0.06220124289393425,
"learning_rate": 4.0488853283944806e-05,
"loss": 0.3878,
"step": 85
},
{
"epoch": 0.3107497741644083,
"grad_norm": 0.05434149503707886,
"learning_rate": 4.0257756546244804e-05,
"loss": 0.3765,
"step": 86
},
{
"epoch": 0.3143631436314363,
"grad_norm": 0.06244641914963722,
"learning_rate": 4.0024563207175316e-05,
"loss": 0.3668,
"step": 87
},
{
"epoch": 0.3179765130984643,
"grad_norm": 0.08008646965026855,
"learning_rate": 3.978930531033807e-05,
"loss": 0.3883,
"step": 88
},
{
"epoch": 0.3215898825654923,
"grad_norm": 0.06990881264209747,
"learning_rate": 3.9552015183030136e-05,
"loss": 0.4611,
"step": 89
},
{
"epoch": 0.3252032520325203,
"grad_norm": 0.05660560727119446,
"learning_rate": 3.93127254318018e-05,
"loss": 0.3865,
"step": 90
},
{
"epoch": 0.3288166214995483,
"grad_norm": 0.05711934715509415,
"learning_rate": 3.907146893797599e-05,
"loss": 0.4223,
"step": 91
},
{
"epoch": 0.3324299909665763,
"grad_norm": 0.06767363101243973,
"learning_rate": 3.882827885312999e-05,
"loss": 0.3481,
"step": 92
},
{
"epoch": 0.33604336043360433,
"grad_norm": 0.05866090953350067,
"learning_rate": 3.858318859454001e-05,
"loss": 0.4195,
"step": 93
},
{
"epoch": 0.33965672990063234,
"grad_norm": 0.05316139757633209,
"learning_rate": 3.833623184058926e-05,
"loss": 0.4042,
"step": 94
},
{
"epoch": 0.34327009936766034,
"grad_norm": 0.06730002164840698,
"learning_rate": 3.808744252614012e-05,
"loss": 0.3717,
"step": 95
},
{
"epoch": 0.34688346883468835,
"grad_norm": 0.07342930138111115,
"learning_rate": 3.783685483787105e-05,
"loss": 0.4075,
"step": 96
},
{
"epoch": 0.35049683830171635,
"grad_norm": 0.07083098590373993,
"learning_rate": 3.758450320957899e-05,
"loss": 0.3864,
"step": 97
},
{
"epoch": 0.35411020776874436,
"grad_norm": 0.07677371054887772,
"learning_rate": 3.7330422317447685e-05,
"loss": 0.393,
"step": 98
},
{
"epoch": 0.35772357723577236,
"grad_norm": 0.0808129534125328,
"learning_rate": 3.707464707528275e-05,
"loss": 0.3801,
"step": 99
},
{
"epoch": 0.36133694670280037,
"grad_norm": 0.06672363728284836,
"learning_rate": 3.681721262971413e-05,
"loss": 0.4472,
"step": 100
},
{
"epoch": 0.36495031616982837,
"grad_norm": 0.05534950643777847,
"learning_rate": 3.6558154355366506e-05,
"loss": 0.3683,
"step": 101
},
{
"epoch": 0.3685636856368564,
"grad_norm": 0.06686428934335709,
"learning_rate": 3.6297507849998344e-05,
"loss": 0.3455,
"step": 102
},
{
"epoch": 0.3721770551038844,
"grad_norm": 0.07248938828706741,
"learning_rate": 3.6035308929610446e-05,
"loss": 0.4083,
"step": 103
},
{
"epoch": 0.3757904245709124,
"grad_norm": 0.06316327303647995,
"learning_rate": 3.5771593623524265e-05,
"loss": 0.3661,
"step": 104
},
{
"epoch": 0.3794037940379404,
"grad_norm": 0.08561142534017563,
"learning_rate": 3.550639816943111e-05,
"loss": 0.3693,
"step": 105
},
{
"epoch": 0.3830171635049684,
"grad_norm": 0.05884739011526108,
"learning_rate": 3.5239759008412666e-05,
"loss": 0.4326,
"step": 106
},
{
"epoch": 0.3866305329719964,
"grad_norm": 0.06861259788274765,
"learning_rate": 3.497171277993346e-05,
"loss": 0.3423,
"step": 107
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.06908590346574783,
"learning_rate": 3.4702296316806244e-05,
"loss": 0.4494,
"step": 108
},
{
"epoch": 0.3938572719060524,
"grad_norm": 0.07454199343919754,
"learning_rate": 3.443154664013067e-05,
"loss": 0.4488,
"step": 109
},
{
"epoch": 0.3974706413730804,
"grad_norm": 0.07938794046640396,
"learning_rate": 3.415950095420616e-05,
"loss": 0.3938,
"step": 110
},
{
"epoch": 0.4010840108401084,
"grad_norm": 0.08505871146917343,
"learning_rate": 3.3886196641419545e-05,
"loss": 0.4004,
"step": 111
},
{
"epoch": 0.4046973803071364,
"grad_norm": 0.0625777617096901,
"learning_rate": 3.361167125710832e-05,
"loss": 0.3863,
"step": 112
},
{
"epoch": 0.4083107497741644,
"grad_norm": 0.07772816717624664,
"learning_rate": 3.333596252440008e-05,
"loss": 0.3981,
"step": 113
},
{
"epoch": 0.41192411924119243,
"grad_norm": 0.06656523048877716,
"learning_rate": 3.305910832902884e-05,
"loss": 0.3705,
"step": 114
},
{
"epoch": 0.41553748870822044,
"grad_norm": 0.07238256186246872,
"learning_rate": 3.278114671412917e-05,
"loss": 0.412,
"step": 115
},
{
"epoch": 0.41915085817524844,
"grad_norm": 0.06601731479167938,
"learning_rate": 3.2502115875008524e-05,
"loss": 0.3716,
"step": 116
},
{
"epoch": 0.42276422764227645,
"grad_norm": 0.0684824138879776,
"learning_rate": 3.222205415389877e-05,
"loss": 0.4183,
"step": 117
},
{
"epoch": 0.42637759710930445,
"grad_norm": 0.0698830783367157,
"learning_rate": 3.1941000034687515e-05,
"loss": 0.3517,
"step": 118
},
{
"epoch": 0.42999096657633246,
"grad_norm": 0.05978047475218773,
"learning_rate": 3.165899213762995e-05,
"loss": 0.3852,
"step": 119
},
{
"epoch": 0.43360433604336046,
"grad_norm": 0.07572682201862335,
"learning_rate": 3.1376069214041913e-05,
"loss": 0.4022,
"step": 120
},
{
"epoch": 0.4372177055103884,
"grad_norm": 0.07104960829019547,
"learning_rate": 3.109227014097505e-05,
"loss": 0.4185,
"step": 121
},
{
"epoch": 0.4408310749774164,
"grad_norm": 0.06828156113624573,
"learning_rate": 3.0807633915874584e-05,
"loss": 0.4239,
"step": 122
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.057690802961587906,
"learning_rate": 3.052219965122062e-05,
"loss": 0.4109,
"step": 123
},
{
"epoch": 0.4480578139114724,
"grad_norm": 0.06580954045057297,
"learning_rate": 3.0236006569153617e-05,
"loss": 0.359,
"step": 124
},
{
"epoch": 0.45167118337850043,
"grad_norm": 0.060349613428115845,
"learning_rate": 2.9949093996084747e-05,
"loss": 0.3775,
"step": 125
},
{
"epoch": 0.45528455284552843,
"grad_norm": 0.07335729151964188,
"learning_rate": 2.9661501357292033e-05,
"loss": 0.4043,
"step": 126
},
{
"epoch": 0.45889792231255644,
"grad_norm": 0.04954389110207558,
"learning_rate": 2.9373268171502777e-05,
"loss": 0.3537,
"step": 127
},
{
"epoch": 0.46251129177958444,
"grad_norm": 0.07528957724571228,
"learning_rate": 2.9084434045463255e-05,
"loss": 0.467,
"step": 128
},
{
"epoch": 0.46612466124661245,
"grad_norm": 0.06106121093034744,
"learning_rate": 2.8795038668496222e-05,
"loss": 0.4323,
"step": 129
},
{
"epoch": 0.46973803071364045,
"grad_norm": 0.08181653916835785,
"learning_rate": 2.850512180704715e-05,
"loss": 0.4208,
"step": 130
},
{
"epoch": 0.47335140018066846,
"grad_norm": 0.07354505360126495,
"learning_rate": 2.821472329921981e-05,
"loss": 0.3909,
"step": 131
},
{
"epoch": 0.47696476964769646,
"grad_norm": 0.09099866449832916,
"learning_rate": 2.792388304930207e-05,
"loss": 0.4296,
"step": 132
},
{
"epoch": 0.48057813911472447,
"grad_norm": 0.08062151074409485,
"learning_rate": 2.7632641022282502e-05,
"loss": 0.4106,
"step": 133
},
{
"epoch": 0.48419150858175247,
"grad_norm": 0.09198120981454849,
"learning_rate": 2.7341037238358774e-05,
"loss": 0.4064,
"step": 134
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.05343058705329895,
"learning_rate": 2.704911176743833e-05,
"loss": 0.404,
"step": 135
},
{
"epoch": 0.4914182475158085,
"grad_norm": 0.0657978504896164,
"learning_rate": 2.6756904723632324e-05,
"loss": 0.3993,
"step": 136
},
{
"epoch": 0.4950316169828365,
"grad_norm": 0.057678401470184326,
"learning_rate": 2.646445625974347e-05,
"loss": 0.3804,
"step": 137
},
{
"epoch": 0.4986449864498645,
"grad_norm": 0.06898088753223419,
"learning_rate": 2.6171806561748502e-05,
"loss": 0.4452,
"step": 138
},
{
"epoch": 0.5022583559168925,
"grad_norm": 0.09333262592554092,
"learning_rate": 2.5878995843276204e-05,
"loss": 0.3304,
"step": 139
},
{
"epoch": 0.5058717253839206,
"grad_norm": 0.06717183440923691,
"learning_rate": 2.5586064340081516e-05,
"loss": 0.326,
"step": 140
},
{
"epoch": 0.5094850948509485,
"grad_norm": 0.06729979068040848,
"learning_rate": 2.529305230451666e-05,
"loss": 0.3934,
"step": 141
},
{
"epoch": 0.5130984643179766,
"grad_norm": 0.09550358355045319,
"learning_rate": 2.5e-05,
"loss": 0.4733,
"step": 142
},
{
"epoch": 0.5167118337850045,
"grad_norm": 0.07080523669719696,
"learning_rate": 2.4706947695483348e-05,
"loss": 0.4039,
"step": 143
},
{
"epoch": 0.5203252032520326,
"grad_norm": 0.055423106998205185,
"learning_rate": 2.441393565991849e-05,
"loss": 0.3275,
"step": 144
},
{
"epoch": 0.5239385727190605,
"grad_norm": 0.06483904272317886,
"learning_rate": 2.4121004156723802e-05,
"loss": 0.4377,
"step": 145
},
{
"epoch": 0.5275519421860885,
"grad_norm": 0.06614437699317932,
"learning_rate": 2.3828193438251497e-05,
"loss": 0.3935,
"step": 146
},
{
"epoch": 0.5311653116531165,
"grad_norm": 0.08745498955249786,
"learning_rate": 2.3535543740256536e-05,
"loss": 0.4348,
"step": 147
},
{
"epoch": 0.5347786811201445,
"grad_norm": 0.07158234715461731,
"learning_rate": 2.3243095276367685e-05,
"loss": 0.3286,
"step": 148
},
{
"epoch": 0.5383920505871725,
"grad_norm": 0.06448652595281601,
"learning_rate": 2.2950888232561672e-05,
"loss": 0.4108,
"step": 149
},
{
"epoch": 0.5420054200542005,
"grad_norm": 0.07621192187070847,
"learning_rate": 2.2658962761641232e-05,
"loss": 0.4317,
"step": 150
},
{
"epoch": 0.5456187895212286,
"grad_norm": 0.07459475100040436,
"learning_rate": 2.23673589777175e-05,
"loss": 0.3876,
"step": 151
},
{
"epoch": 0.5492321589882565,
"grad_norm": 0.07355853170156479,
"learning_rate": 2.207611695069794e-05,
"loss": 0.3506,
"step": 152
},
{
"epoch": 0.5528455284552846,
"grad_norm": 0.07565652579069138,
"learning_rate": 2.17852767007802e-05,
"loss": 0.4221,
"step": 153
},
{
"epoch": 0.5564588979223125,
"grad_norm": 0.07433846592903137,
"learning_rate": 2.1494878192952855e-05,
"loss": 0.3913,
"step": 154
},
{
"epoch": 0.5600722673893406,
"grad_norm": 0.07123446464538574,
"learning_rate": 2.1204961331503787e-05,
"loss": 0.4106,
"step": 155
},
{
"epoch": 0.5636856368563685,
"grad_norm": 0.0848294198513031,
"learning_rate": 2.0915565954536744e-05,
"loss": 0.3171,
"step": 156
},
{
"epoch": 0.5672990063233966,
"grad_norm": 0.06394634395837784,
"learning_rate": 2.0626731828497225e-05,
"loss": 0.4106,
"step": 157
},
{
"epoch": 0.5709123757904245,
"grad_norm": 0.06601906567811966,
"learning_rate": 2.0338498642707977e-05,
"loss": 0.3651,
"step": 158
},
{
"epoch": 0.5745257452574526,
"grad_norm": 0.0734376311302185,
"learning_rate": 2.005090600391526e-05,
"loss": 0.3906,
"step": 159
},
{
"epoch": 0.5781391147244805,
"grad_norm": 0.07122786343097687,
"learning_rate": 1.9763993430846395e-05,
"loss": 0.4157,
"step": 160
},
{
"epoch": 0.5817524841915086,
"grad_norm": 0.06590158492326736,
"learning_rate": 1.947780034877938e-05,
"loss": 0.4267,
"step": 161
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.07380690425634384,
"learning_rate": 1.9192366084125425e-05,
"loss": 0.3748,
"step": 162
},
{
"epoch": 0.5889792231255646,
"grad_norm": 0.054361093789339066,
"learning_rate": 1.890772985902496e-05,
"loss": 0.3637,
"step": 163
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.06896340101957321,
"learning_rate": 1.8623930785958092e-05,
"loss": 0.4319,
"step": 164
},
{
"epoch": 0.5962059620596206,
"grad_norm": 0.08140537887811661,
"learning_rate": 1.8341007862370056e-05,
"loss": 0.3942,
"step": 165
},
{
"epoch": 0.5998193315266486,
"grad_norm": 0.07021729648113251,
"learning_rate": 1.8058999965312484e-05,
"loss": 0.3917,
"step": 166
},
{
"epoch": 0.6034327009936766,
"grad_norm": 0.06319273263216019,
"learning_rate": 1.777794584610124e-05,
"loss": 0.3833,
"step": 167
},
{
"epoch": 0.6070460704607046,
"grad_norm": 0.07088933885097504,
"learning_rate": 1.749788412499149e-05,
"loss": 0.3326,
"step": 168
},
{
"epoch": 0.6106594399277326,
"grad_norm": 0.06848324090242386,
"learning_rate": 1.721885328587083e-05,
"loss": 0.5018,
"step": 169
},
{
"epoch": 0.6142728093947606,
"grad_norm": 0.07163573056459427,
"learning_rate": 1.694089167097116e-05,
"loss": 0.3624,
"step": 170
},
{
"epoch": 0.6178861788617886,
"grad_norm": 0.06683260202407837,
"learning_rate": 1.6664037475599923e-05,
"loss": 0.4198,
"step": 171
},
{
"epoch": 0.6214995483288166,
"grad_norm": 0.06273495405912399,
"learning_rate": 1.638832874289168e-05,
"loss": 0.3388,
"step": 172
},
{
"epoch": 0.6251129177958447,
"grad_norm": 0.06024303659796715,
"learning_rate": 1.611380335858047e-05,
"loss": 0.4156,
"step": 173
},
{
"epoch": 0.6287262872628726,
"grad_norm": 0.08732262253761292,
"learning_rate": 1.5840499045793843e-05,
"loss": 0.3883,
"step": 174
},
{
"epoch": 0.6323396567299007,
"grad_norm": 0.06800790876150131,
"learning_rate": 1.5568453359869334e-05,
"loss": 0.3636,
"step": 175
},
{
"epoch": 0.6359530261969286,
"grad_norm": 0.08514184504747391,
"learning_rate": 1.5297703683193752e-05,
"loss": 0.3664,
"step": 176
},
{
"epoch": 0.6395663956639567,
"grad_norm": 0.0805889442563057,
"learning_rate": 1.502828722006655e-05,
"loss": 0.3912,
"step": 177
},
{
"epoch": 0.6431797651309846,
"grad_norm": 0.07321416586637497,
"learning_rate": 1.4760240991587337e-05,
"loss": 0.4077,
"step": 178
},
{
"epoch": 0.6467931345980127,
"grad_norm": 0.06993624567985535,
"learning_rate": 1.4493601830568887e-05,
"loss": 0.3728,
"step": 179
},
{
"epoch": 0.6504065040650406,
"grad_norm": 0.07736963033676147,
"learning_rate": 1.4228406376475742e-05,
"loss": 0.3644,
"step": 180
},
{
"epoch": 0.6540198735320687,
"grad_norm": 0.06840698421001434,
"learning_rate": 1.396469107038956e-05,
"loss": 0.3936,
"step": 181
},
{
"epoch": 0.6576332429990966,
"grad_norm": 0.07498890906572342,
"learning_rate": 1.3702492150001659e-05,
"loss": 0.3948,
"step": 182
},
{
"epoch": 0.6612466124661247,
"grad_norm": 0.06307978183031082,
"learning_rate": 1.34418456446335e-05,
"loss": 0.398,
"step": 183
},
{
"epoch": 0.6648599819331527,
"grad_norm": 0.0843866616487503,
"learning_rate": 1.3182787370285865e-05,
"loss": 0.3891,
"step": 184
},
{
"epoch": 0.6684733514001807,
"grad_norm": 0.07880077511072159,
"learning_rate": 1.292535292471726e-05,
"loss": 0.3812,
"step": 185
},
{
"epoch": 0.6720867208672087,
"grad_norm": 0.06986968219280243,
"learning_rate": 1.2669577682552319e-05,
"loss": 0.3851,
"step": 186
},
{
"epoch": 0.6757000903342367,
"grad_norm": 0.07602784037590027,
"learning_rate": 1.2415496790421011e-05,
"loss": 0.3956,
"step": 187
},
{
"epoch": 0.6793134598012647,
"grad_norm": 0.06611546874046326,
"learning_rate": 1.2163145162128947e-05,
"loss": 0.3629,
"step": 188
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.07958898693323135,
"learning_rate": 1.1912557473859895e-05,
"loss": 0.3647,
"step": 189
},
{
"epoch": 0.6865401987353207,
"grad_norm": 0.06264237314462662,
"learning_rate": 1.1663768159410748e-05,
"loss": 0.3797,
"step": 190
},
{
"epoch": 0.6901535682023487,
"grad_norm": 0.08303744345903397,
"learning_rate": 1.1416811405459993e-05,
"loss": 0.3754,
"step": 191
},
{
"epoch": 0.6937669376693767,
"grad_norm": 0.07206673175096512,
"learning_rate": 1.1171721146870015e-05,
"loss": 0.327,
"step": 192
},
{
"epoch": 0.6973803071364046,
"grad_norm": 0.06349314749240875,
"learning_rate": 1.0928531062024017e-05,
"loss": 0.3902,
"step": 193
},
{
"epoch": 0.7009936766034327,
"grad_norm": 0.07241489738225937,
"learning_rate": 1.0687274568198208e-05,
"loss": 0.3845,
"step": 194
},
{
"epoch": 0.7046070460704607,
"grad_norm": 0.06357239931821823,
"learning_rate": 1.0447984816969874e-05,
"loss": 0.3881,
"step": 195
},
{
"epoch": 0.7082204155374887,
"grad_norm": 0.06316613405942917,
"learning_rate": 1.021069468966194e-05,
"loss": 0.4735,
"step": 196
},
{
"epoch": 0.7118337850045167,
"grad_norm": 0.08076903223991394,
"learning_rate": 9.975436792824691e-06,
"loss": 0.43,
"step": 197
},
{
"epoch": 0.7154471544715447,
"grad_norm": 0.0836021676659584,
"learning_rate": 9.742243453755202e-06,
"loss": 0.3818,
"step": 198
},
{
"epoch": 0.7190605239385727,
"grad_norm": 0.0713673084974289,
"learning_rate": 9.5111467160552e-06,
"loss": 0.3846,
"step": 199
},
{
"epoch": 0.7226738934056007,
"grad_norm": 0.08711904287338257,
"learning_rate": 9.282178335227884e-06,
"loss": 0.4817,
"step": 200
},
{
"epoch": 0.7262872628726287,
"grad_norm": 0.05264454334974289,
"learning_rate": 9.05536977431431e-06,
"loss": 0.3995,
"step": 201
},
{
"epoch": 0.7299006323396567,
"grad_norm": 0.07466941326856613,
"learning_rate": 8.830752199570033e-06,
"loss": 0.3718,
"step": 202
},
{
"epoch": 0.7335140018066847,
"grad_norm": 0.07776648551225662,
"learning_rate": 8.608356476182424e-06,
"loss": 0.4786,
"step": 203
},
{
"epoch": 0.7371273712737128,
"grad_norm": 0.06611160188913345,
"learning_rate": 8.38821316402946e-06,
"loss": 0.3668,
"step": 204
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.07174837589263916,
"learning_rate": 8.170352513480408e-06,
"loss": 0.4016,
"step": 205
},
{
"epoch": 0.7443541102077688,
"grad_norm": 0.0830477848649025,
"learning_rate": 7.954804461239053e-06,
"loss": 0.4162,
"step": 206
},
{
"epoch": 0.7479674796747967,
"grad_norm": 0.08300362527370453,
"learning_rate": 7.741598626230079e-06,
"loss": 0.3738,
"step": 207
},
{
"epoch": 0.7515808491418248,
"grad_norm": 0.07526036351919174,
"learning_rate": 7.530764305528959e-06,
"loss": 0.3576,
"step": 208
},
{
"epoch": 0.7551942186088527,
"grad_norm": 0.06786955147981644,
"learning_rate": 7.3223304703363135e-06,
"loss": 0.4152,
"step": 209
},
{
"epoch": 0.7588075880758808,
"grad_norm": 0.08544765412807465,
"learning_rate": 7.116325761996817e-06,
"loss": 0.3735,
"step": 210
},
{
"epoch": 0.7624209575429087,
"grad_norm": 0.06077965721487999,
"learning_rate": 6.91277848806356e-06,
"loss": 0.3486,
"step": 211
},
{
"epoch": 0.7660343270099368,
"grad_norm": 0.07332652807235718,
"learning_rate": 6.711716618408281e-06,
"loss": 0.3734,
"step": 212
},
{
"epoch": 0.7696476964769647,
"grad_norm": 0.07848729193210602,
"learning_rate": 6.513167781377885e-06,
"loss": 0.4231,
"step": 213
},
{
"epoch": 0.7732610659439928,
"grad_norm": 0.07897993177175522,
"learning_rate": 6.317159259998073e-06,
"loss": 0.3513,
"step": 214
},
{
"epoch": 0.7768744354110207,
"grad_norm": 0.07235241681337357,
"learning_rate": 6.123717988224237e-06,
"loss": 0.4069,
"step": 215
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.09085345268249512,
"learning_rate": 5.932870547240454e-06,
"loss": 0.3849,
"step": 216
},
{
"epoch": 0.7841011743450768,
"grad_norm": 0.07704368233680725,
"learning_rate": 5.74464316180689e-06,
"loss": 0.4261,
"step": 217
},
{
"epoch": 0.7877145438121048,
"grad_norm": 0.057720448821783066,
"learning_rate": 5.559061696656198e-06,
"loss": 0.3711,
"step": 218
},
{
"epoch": 0.7913279132791328,
"grad_norm": 0.06448069959878922,
"learning_rate": 5.37615165293942e-06,
"loss": 0.4027,
"step": 219
},
{
"epoch": 0.7949412827461608,
"grad_norm": 0.08539154380559921,
"learning_rate": 5.1959381647217666e-06,
"loss": 0.388,
"step": 220
},
{
"epoch": 0.7985546522131888,
"grad_norm": 0.07000590115785599,
"learning_rate": 5.018445995528931e-06,
"loss": 0.4122,
"step": 221
},
{
"epoch": 0.8021680216802168,
"grad_norm": 0.07643178850412369,
"learning_rate": 4.843699534944257e-06,
"loss": 0.3749,
"step": 222
},
{
"epoch": 0.8057813911472448,
"grad_norm": 0.06629081815481186,
"learning_rate": 4.671722795257327e-06,
"loss": 0.3817,
"step": 223
},
{
"epoch": 0.8093947606142728,
"grad_norm": 0.06171542406082153,
"learning_rate": 4.502539408164386e-06,
"loss": 0.3474,
"step": 224
},
{
"epoch": 0.8130081300813008,
"grad_norm": 0.06734922528266907,
"learning_rate": 4.336172621521034e-06,
"loss": 0.3328,
"step": 225
},
{
"epoch": 0.8166214995483289,
"grad_norm": 0.09524697810411453,
"learning_rate": 4.1726452961477146e-06,
"loss": 0.3433,
"step": 226
},
{
"epoch": 0.8202348690153568,
"grad_norm": 0.06357850879430771,
"learning_rate": 4.01197990268834e-06,
"loss": 0.3992,
"step": 227
},
{
"epoch": 0.8238482384823849,
"grad_norm": 0.07560393214225769,
"learning_rate": 3.8541985185225645e-06,
"loss": 0.3575,
"step": 228
},
{
"epoch": 0.8274616079494128,
"grad_norm": 0.06906560808420181,
"learning_rate": 3.6993228247320877e-06,
"loss": 0.3287,
"step": 229
},
{
"epoch": 0.8310749774164409,
"grad_norm": 0.08411566913127899,
"learning_rate": 3.547374103121398e-06,
"loss": 0.4115,
"step": 230
},
{
"epoch": 0.8346883468834688,
"grad_norm": 0.08515972644090652,
"learning_rate": 3.398373233293378e-06,
"loss": 0.3709,
"step": 231
},
{
"epoch": 0.8383017163504969,
"grad_norm": 0.06780155003070831,
"learning_rate": 3.252340689780245e-06,
"loss": 0.3599,
"step": 232
},
{
"epoch": 0.8419150858175248,
"grad_norm": 0.08019706606864929,
"learning_rate": 3.1092965392300417e-06,
"loss": 0.3869,
"step": 233
},
{
"epoch": 0.8455284552845529,
"grad_norm": 0.0702086016535759,
"learning_rate": 2.969260437649293e-06,
"loss": 0.3846,
"step": 234
},
{
"epoch": 0.8491418247515808,
"grad_norm": 0.0851154550909996,
"learning_rate": 2.8322516277019624e-06,
"loss": 0.3434,
"step": 235
},
{
"epoch": 0.8527551942186089,
"grad_norm": 0.06722518056631088,
"learning_rate": 2.6982889360653377e-06,
"loss": 0.3349,
"step": 236
},
{
"epoch": 0.8563685636856369,
"grad_norm": 0.06803542375564575,
"learning_rate": 2.5673907708429976e-06,
"loss": 0.3526,
"step": 237
},
{
"epoch": 0.8599819331526649,
"grad_norm": 0.08029063045978546,
"learning_rate": 2.4395751190352924e-06,
"loss": 0.4286,
"step": 238
},
{
"epoch": 0.8635953026196929,
"grad_norm": 0.08042778819799423,
"learning_rate": 2.3148595440677405e-06,
"loss": 0.3739,
"step": 239
},
{
"epoch": 0.8672086720867209,
"grad_norm": 0.07175204902887344,
"learning_rate": 2.1932611833775846e-06,
"loss": 0.4156,
"step": 240
},
{
"epoch": 0.8708220415537489,
"grad_norm": 0.058878783136606216,
"learning_rate": 2.074796746058896e-06,
"loss": 0.3636,
"step": 241
},
{
"epoch": 0.8744354110207768,
"grad_norm": 0.08569607883691788,
"learning_rate": 1.9594825105665654e-06,
"loss": 0.3889,
"step": 242
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.07353324443101883,
"learning_rate": 1.847334322479413e-06,
"loss": 0.4352,
"step": 243
},
{
"epoch": 0.8816621499548328,
"grad_norm": 0.07135035842657089,
"learning_rate": 1.738367592322837e-06,
"loss": 0.4265,
"step": 244
},
{
"epoch": 0.8852755194218609,
"grad_norm": 0.06918162852525711,
"learning_rate": 1.6325972934512018e-06,
"loss": 0.4295,
"step": 245
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.07300789654254913,
"learning_rate": 1.5300379599903409e-06,
"loss": 0.4226,
"step": 246
},
{
"epoch": 0.8925022583559169,
"grad_norm": 0.06973148882389069,
"learning_rate": 1.4307036848403648e-06,
"loss": 0.3368,
"step": 247
},
{
"epoch": 0.8961156278229448,
"grad_norm": 0.07200148701667786,
"learning_rate": 1.3346081177391472e-06,
"loss": 0.3924,
"step": 248
},
{
"epoch": 0.8997289972899729,
"grad_norm": 0.07833510637283325,
"learning_rate": 1.2417644633866632e-06,
"loss": 0.3274,
"step": 249
},
{
"epoch": 0.9033423667570009,
"grad_norm": 0.061651114374399185,
"learning_rate": 1.1521854796305242e-06,
"loss": 0.3705,
"step": 250
},
{
"epoch": 0.9069557362240289,
"grad_norm": 0.07440148293972015,
"learning_rate": 1.0658834757128838e-06,
"loss": 0.3715,
"step": 251
},
{
"epoch": 0.9105691056910569,
"grad_norm": 0.0720466673374176,
"learning_rate": 9.828703105789983e-07,
"loss": 0.3361,
"step": 252
},
{
"epoch": 0.9141824751580849,
"grad_norm": 0.08179104328155518,
"learning_rate": 9.031573912476554e-07,
"loss": 0.3393,
"step": 253
},
{
"epoch": 0.9177958446251129,
"grad_norm": 0.058865226805210114,
"learning_rate": 8.267556712437341e-07,
"loss": 0.4249,
"step": 254
},
{
"epoch": 0.9214092140921409,
"grad_norm": 0.07929901778697968,
"learning_rate": 7.536756490930358e-07,
"loss": 0.4341,
"step": 255
},
{
"epoch": 0.9250225835591689,
"grad_norm": 0.07914505153894424,
"learning_rate": 6.839273668796747e-07,
"loss": 0.3942,
"step": 256
},
{
"epoch": 0.928635953026197,
"grad_norm": 0.08146975934505463,
"learning_rate": 6.175204088661485e-07,
"loss": 0.3562,
"step": 257
},
{
"epoch": 0.9322493224932249,
"grad_norm": 0.08726157248020172,
"learning_rate": 5.544639001763718e-07,
"loss": 0.4314,
"step": 258
},
{
"epoch": 0.935862691960253,
"grad_norm": 0.09031800180673599,
"learning_rate": 4.947665055417605e-07,
"loss": 0.3842,
"step": 259
},
{
"epoch": 0.9394760614272809,
"grad_norm": 0.0922897681593895,
"learning_rate": 4.3843642811059737e-07,
"loss": 0.3285,
"step": 260
},
{
"epoch": 0.943089430894309,
"grad_norm": 0.07188927382230759,
"learning_rate": 3.854814083208064e-07,
"loss": 0.3839,
"step": 261
},
{
"epoch": 0.9467028003613369,
"grad_norm": 0.08181816339492798,
"learning_rate": 3.3590872283633944e-07,
"loss": 0.3651,
"step": 262
},
{
"epoch": 0.950316169828365,
"grad_norm": 0.0699373111128807,
"learning_rate": 2.8972518354725977e-07,
"loss": 0.457,
"step": 263
},
{
"epoch": 0.9539295392953929,
"grad_norm": 0.08292391151189804,
"learning_rate": 2.4693713663372644e-07,
"loss": 0.4105,
"step": 264
},
{
"epoch": 0.957542908762421,
"grad_norm": 0.07387669384479523,
"learning_rate": 2.0755046169392e-07,
"loss": 0.3846,
"step": 265
},
{
"epoch": 0.9611562782294489,
"grad_norm": 0.08278100937604904,
"learning_rate": 1.7157057093614703e-07,
"loss": 0.4334,
"step": 266
},
{
"epoch": 0.964769647696477,
"grad_norm": 0.06216645613312721,
"learning_rate": 1.3900240843510993e-07,
"loss": 0.4007,
"step": 267
},
{
"epoch": 0.9683830171635049,
"grad_norm": 0.07292906939983368,
"learning_rate": 1.0985044945254764e-07,
"loss": 0.4152,
"step": 268
},
{
"epoch": 0.971996386630533,
"grad_norm": 0.07897216826677322,
"learning_rate": 8.411869982228038e-08,
"loss": 0.3954,
"step": 269
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.0776594951748848,
"learning_rate": 6.181069539974716e-08,
"loss": 0.3449,
"step": 270
},
{
"epoch": 0.979223125564589,
"grad_norm": 0.07104814052581787,
"learning_rate": 4.292950157614717e-08,
"loss": 0.3476,
"step": 271
},
{
"epoch": 0.982836495031617,
"grad_norm": 0.07420724630355835,
"learning_rate": 2.7477712857215677e-08,
"loss": 0.4095,
"step": 272
},
{
"epoch": 0.986449864498645,
"grad_norm": 0.06806948781013489,
"learning_rate": 1.5457452506698056e-08,
"loss": 0.3879,
"step": 273
},
{
"epoch": 0.990063233965673,
"grad_norm": 0.08909036219120026,
"learning_rate": 6.870372254602631e-09,
"loss": 0.3327,
"step": 274
},
{
"epoch": 0.993676603432701,
"grad_norm": 0.07509468495845795,
"learning_rate": 1.7176520702238964e-09,
"loss": 0.4033,
"step": 275
},
{
"epoch": 0.997289972899729,
"grad_norm": 0.06269805878400803,
"learning_rate": 0.0,
"loss": 0.4076,
"step": 276
},
{
"epoch": 0.997289972899729,
"eval_loss": 0.35787180066108704,
"eval_runtime": 515.6409,
"eval_samples_per_second": 1.422,
"eval_steps_per_second": 0.357,
"step": 276
}
],
"logging_steps": 1,
"max_steps": 276,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.247726843172225e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}