{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1524,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019704433497536946,
"grad_norm": 3.0279820123546215,
"learning_rate": 5.882352941176471e-07,
"loss": 0.5649,
"step": 10
},
{
"epoch": 0.03940886699507389,
"grad_norm": 1.4615125615819697,
"learning_rate": 1.2418300653594772e-06,
"loss": 0.5603,
"step": 20
},
{
"epoch": 0.059113300492610835,
"grad_norm": 0.8068217101006925,
"learning_rate": 1.8954248366013072e-06,
"loss": 0.5105,
"step": 30
},
{
"epoch": 0.07881773399014778,
"grad_norm": 0.5614754695808303,
"learning_rate": 2.549019607843137e-06,
"loss": 0.4774,
"step": 40
},
{
"epoch": 0.09852216748768473,
"grad_norm": 0.43718231534842905,
"learning_rate": 3.2026143790849674e-06,
"loss": 0.4583,
"step": 50
},
{
"epoch": 0.11822660098522167,
"grad_norm": 0.39346793627188986,
"learning_rate": 3.856209150326798e-06,
"loss": 0.436,
"step": 60
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.30749780652350023,
"learning_rate": 4.509803921568628e-06,
"loss": 0.4206,
"step": 70
},
{
"epoch": 0.15763546798029557,
"grad_norm": 0.30868892828152295,
"learning_rate": 5.163398692810458e-06,
"loss": 0.41,
"step": 80
},
{
"epoch": 0.17733990147783252,
"grad_norm": 0.32830592651082535,
"learning_rate": 5.816993464052289e-06,
"loss": 0.4113,
"step": 90
},
{
"epoch": 0.19704433497536947,
"grad_norm": 0.32758990112743774,
"learning_rate": 6.470588235294119e-06,
"loss": 0.3994,
"step": 100
},
{
"epoch": 0.21674876847290642,
"grad_norm": 0.2790374663664497,
"learning_rate": 7.124183006535948e-06,
"loss": 0.4015,
"step": 110
},
{
"epoch": 0.23645320197044334,
"grad_norm": 0.30110721148157965,
"learning_rate": 7.77777777777778e-06,
"loss": 0.3969,
"step": 120
},
{
"epoch": 0.2561576354679803,
"grad_norm": 0.30423786969760397,
"learning_rate": 8.43137254901961e-06,
"loss": 0.3987,
"step": 130
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.31196936765334116,
"learning_rate": 9.084967320261438e-06,
"loss": 0.3913,
"step": 140
},
{
"epoch": 0.2955665024630542,
"grad_norm": 0.37040976407183024,
"learning_rate": 9.738562091503268e-06,
"loss": 0.386,
"step": 150
},
{
"epoch": 0.31527093596059114,
"grad_norm": 0.3549433136700143,
"learning_rate": 9.999527436141312e-06,
"loss": 0.3816,
"step": 160
},
{
"epoch": 0.33497536945812806,
"grad_norm": 0.39718414457173384,
"learning_rate": 9.996639869374844e-06,
"loss": 0.3905,
"step": 170
},
{
"epoch": 0.35467980295566504,
"grad_norm": 0.38713568972177087,
"learning_rate": 9.991128785615903e-06,
"loss": 0.374,
"step": 180
},
{
"epoch": 0.37438423645320196,
"grad_norm": 0.30293306376256685,
"learning_rate": 9.982997078493457e-06,
"loss": 0.3802,
"step": 190
},
{
"epoch": 0.39408866995073893,
"grad_norm": 0.30182611299554746,
"learning_rate": 9.972249017611153e-06,
"loss": 0.3824,
"step": 200
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.3037149817652602,
"learning_rate": 9.958890246305534e-06,
"loss": 0.3698,
"step": 210
},
{
"epoch": 0.43349753694581283,
"grad_norm": 0.36304856528027657,
"learning_rate": 9.942927778682968e-06,
"loss": 0.3764,
"step": 220
},
{
"epoch": 0.45320197044334976,
"grad_norm": 0.3347217336710472,
"learning_rate": 9.924369995936846e-06,
"loss": 0.3718,
"step": 230
},
{
"epoch": 0.4729064039408867,
"grad_norm": 0.2971293315400481,
"learning_rate": 9.903226641946982e-06,
"loss": 0.3754,
"step": 240
},
{
"epoch": 0.49261083743842365,
"grad_norm": 0.3287026933137672,
"learning_rate": 9.879508818163536e-06,
"loss": 0.3702,
"step": 250
},
{
"epoch": 0.5123152709359606,
"grad_norm": 0.3156479952472752,
"learning_rate": 9.853228977778125e-06,
"loss": 0.3728,
"step": 260
},
{
"epoch": 0.5320197044334976,
"grad_norm": 0.32319185406715967,
"learning_rate": 9.82440091918519e-06,
"loss": 0.3697,
"step": 270
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.3264752725365435,
"learning_rate": 9.79303977873707e-06,
"loss": 0.3758,
"step": 280
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.2681401549401077,
"learning_rate": 9.759162022796566e-06,
"loss": 0.3698,
"step": 290
},
{
"epoch": 0.5911330049261084,
"grad_norm": 0.2906543480857407,
"learning_rate": 9.722785439091172e-06,
"loss": 0.3696,
"step": 300
},
{
"epoch": 0.6108374384236454,
"grad_norm": 0.38568495214922205,
"learning_rate": 9.683929127373514e-06,
"loss": 0.3689,
"step": 310
},
{
"epoch": 0.6305418719211823,
"grad_norm": 0.38892801923451475,
"learning_rate": 9.642613489392916e-06,
"loss": 0.3556,
"step": 320
},
{
"epoch": 0.6502463054187192,
"grad_norm": 0.2965648673892553,
"learning_rate": 9.598860218183318e-06,
"loss": 0.3619,
"step": 330
},
{
"epoch": 0.6699507389162561,
"grad_norm": 0.31678244496188085,
"learning_rate": 9.552692286673231e-06,
"loss": 0.3663,
"step": 340
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.3483919487665228,
"learning_rate": 9.504133935623643e-06,
"loss": 0.3581,
"step": 350
},
{
"epoch": 0.7093596059113301,
"grad_norm": 0.3404931966180023,
"learning_rate": 9.453210660900264e-06,
"loss": 0.3563,
"step": 360
},
{
"epoch": 0.729064039408867,
"grad_norm": 0.38124885036911094,
"learning_rate": 9.399949200086757e-06,
"loss": 0.3588,
"step": 370
},
{
"epoch": 0.7487684729064039,
"grad_norm": 0.3118666499933069,
"learning_rate": 9.344377518446006e-06,
"loss": 0.3628,
"step": 380
},
{
"epoch": 0.7684729064039408,
"grad_norm": 0.38361684678555974,
"learning_rate": 9.286524794236783e-06,
"loss": 0.361,
"step": 390
},
{
"epoch": 0.7881773399014779,
"grad_norm": 0.32779346459943115,
"learning_rate": 9.226421403393513e-06,
"loss": 0.3642,
"step": 400
},
{
"epoch": 0.8078817733990148,
"grad_norm": 0.2992967856385497,
"learning_rate": 9.164098903577203e-06,
"loss": 0.3573,
"step": 410
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.3205591263261017,
"learning_rate": 9.099590017605903e-06,
"loss": 0.3549,
"step": 420
},
{
"epoch": 0.8472906403940886,
"grad_norm": 0.2678093580785744,
"learning_rate": 9.032928616273369e-06,
"loss": 0.3594,
"step": 430
},
{
"epoch": 0.8669950738916257,
"grad_norm": 0.32355043253019766,
"learning_rate": 8.964149700565006e-06,
"loss": 0.3602,
"step": 440
},
{
"epoch": 0.8866995073891626,
"grad_norm": 0.2866543725885009,
"learning_rate": 8.893289383280379e-06,
"loss": 0.3524,
"step": 450
},
{
"epoch": 0.9064039408866995,
"grad_norm": 0.33569301955376707,
"learning_rate": 8.820384870071951e-06,
"loss": 0.3484,
"step": 460
},
{
"epoch": 0.9261083743842364,
"grad_norm": 0.3325781399033863,
"learning_rate": 8.745474439910043e-06,
"loss": 0.3549,
"step": 470
},
{
"epoch": 0.9458128078817734,
"grad_norm": 0.3390519215844686,
"learning_rate": 8.668597424984196e-06,
"loss": 0.3533,
"step": 480
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.30939662294936154,
"learning_rate": 8.589794190051582e-06,
"loss": 0.3549,
"step": 490
},
{
"epoch": 0.9852216748768473,
"grad_norm": 0.32579852375721036,
"learning_rate": 8.509106111243223e-06,
"loss": 0.3521,
"step": 500
},
{
"epoch": 1.0039408866995074,
"grad_norm": 0.31743200314693687,
"learning_rate": 8.4265755543392e-06,
"loss": 0.3597,
"step": 510
},
{
"epoch": 1.0236453201970444,
"grad_norm": 0.3517094413633957,
"learning_rate": 8.342245852524229e-06,
"loss": 0.3329,
"step": 520
},
{
"epoch": 1.0433497536945813,
"grad_norm": 0.3386212184889377,
"learning_rate": 8.256161283635315e-06,
"loss": 0.3348,
"step": 530
},
{
"epoch": 1.0630541871921182,
"grad_norm": 0.3461407262910111,
"learning_rate": 8.16836704691338e-06,
"loss": 0.3284,
"step": 540
},
{
"epoch": 1.0827586206896551,
"grad_norm": 0.3175184396382611,
"learning_rate": 8.078909239271127e-06,
"loss": 0.3341,
"step": 550
},
{
"epoch": 1.102463054187192,
"grad_norm": 0.27454177241292593,
"learning_rate": 7.987834831089576e-06,
"loss": 0.3392,
"step": 560
},
{
"epoch": 1.1221674876847292,
"grad_norm": 0.2955431213720922,
"learning_rate": 7.895191641555957e-06,
"loss": 0.3319,
"step": 570
},
{
"epoch": 1.141871921182266,
"grad_norm": 0.3260291195524149,
"learning_rate": 7.801028313555954e-06,
"loss": 0.3364,
"step": 580
},
{
"epoch": 1.161576354679803,
"grad_norm": 0.36803259706135694,
"learning_rate": 7.705394288133459e-06,
"loss": 0.329,
"step": 590
},
{
"epoch": 1.18128078817734,
"grad_norm": 0.31736745368430674,
"learning_rate": 7.60833977853123e-06,
"loss": 0.3332,
"step": 600
},
{
"epoch": 1.2009852216748769,
"grad_norm": 0.2814842646318462,
"learning_rate": 7.509915743826128e-06,
"loss": 0.3278,
"step": 610
},
{
"epoch": 1.2206896551724138,
"grad_norm": 0.28225717787284943,
"learning_rate": 7.4101738621727245e-06,
"loss": 0.3296,
"step": 620
},
{
"epoch": 1.2403940886699507,
"grad_norm": 0.3073951939120494,
"learning_rate": 7.3091665036693716e-06,
"loss": 0.3253,
"step": 630
},
{
"epoch": 1.2600985221674876,
"grad_norm": 0.2909750317016804,
"learning_rate": 7.206946702860948e-06,
"loss": 0.3329,
"step": 640
},
{
"epoch": 1.2798029556650246,
"grad_norm": 0.3195502698394791,
"learning_rate": 7.103568130892742e-06,
"loss": 0.3358,
"step": 650
},
{
"epoch": 1.2995073891625615,
"grad_norm": 0.2955230796655043,
"learning_rate": 6.999085067330085e-06,
"loss": 0.331,
"step": 660
},
{
"epoch": 1.3192118226600984,
"grad_norm": 0.30204789259744014,
"learning_rate": 6.8935523716585195e-06,
"loss": 0.3313,
"step": 670
},
{
"epoch": 1.3389162561576355,
"grad_norm": 0.2683292800793258,
"learning_rate": 6.787025454479489e-06,
"loss": 0.3229,
"step": 680
},
{
"epoch": 1.3586206896551725,
"grad_norm": 0.26418537588888813,
"learning_rate": 6.679560248416652e-06,
"loss": 0.3297,
"step": 690
},
{
"epoch": 1.3783251231527094,
"grad_norm": 0.2909070001901888,
"learning_rate": 6.571213178748112e-06,
"loss": 0.3319,
"step": 700
},
{
"epoch": 1.3980295566502463,
"grad_norm": 0.28638818406094363,
"learning_rate": 6.462041133779969e-06,
"loss": 0.3309,
"step": 710
},
{
"epoch": 1.4177339901477832,
"grad_norm": 0.27868432298930257,
"learning_rate": 6.352101434976761e-06,
"loss": 0.3322,
"step": 720
},
{
"epoch": 1.4374384236453202,
"grad_norm": 0.29588673430919354,
"learning_rate": 6.241451806864465e-06,
"loss": 0.336,
"step": 730
},
{
"epoch": 1.457142857142857,
"grad_norm": 0.30596222012886065,
"learning_rate": 6.130150346721888e-06,
"loss": 0.3229,
"step": 740
},
{
"epoch": 1.4768472906403942,
"grad_norm": 0.2648591334136923,
"learning_rate": 6.018255494076309e-06,
"loss": 0.3219,
"step": 750
},
{
"epoch": 1.4965517241379311,
"grad_norm": 0.28293115551723064,
"learning_rate": 5.905826000019458e-06,
"loss": 0.3294,
"step": 760
},
{
"epoch": 1.516256157635468,
"grad_norm": 0.27582313122373875,
"learning_rate": 5.79292089635987e-06,
"loss": 0.3296,
"step": 770
},
{
"epoch": 1.535960591133005,
"grad_norm": 0.2812038009084268,
"learning_rate": 5.679599464627885e-06,
"loss": 0.3316,
"step": 780
},
{
"epoch": 1.555665024630542,
"grad_norm": 0.2944019123260006,
"learning_rate": 5.5659212049494915e-06,
"loss": 0.3328,
"step": 790
},
{
"epoch": 1.5753694581280788,
"grad_norm": 0.29904709651401157,
"learning_rate": 5.451945804805425e-06,
"loss": 0.324,
"step": 800
},
{
"epoch": 1.5950738916256157,
"grad_norm": 0.2784023488610782,
"learning_rate": 5.337733107691879e-06,
"loss": 0.3234,
"step": 810
},
{
"epoch": 1.6147783251231527,
"grad_norm": 0.4116398811741189,
"learning_rate": 5.223343081699302e-06,
"loss": 0.3311,
"step": 820
},
{
"epoch": 1.6344827586206896,
"grad_norm": 0.30015526206975013,
"learning_rate": 5.108835788025782e-06,
"loss": 0.3259,
"step": 830
},
{
"epoch": 1.6541871921182265,
"grad_norm": 0.2929610632606337,
"learning_rate": 4.994271349441534e-06,
"loss": 0.3231,
"step": 840
},
{
"epoch": 1.6738916256157634,
"grad_norm": 0.28794646555718384,
"learning_rate": 4.879709918721067e-06,
"loss": 0.3216,
"step": 850
},
{
"epoch": 1.6935960591133004,
"grad_norm": 0.3039465901272877,
"learning_rate": 4.76521164705959e-06,
"loss": 0.3255,
"step": 860
},
{
"epoch": 1.7133004926108373,
"grad_norm": 0.2908141512608896,
"learning_rate": 4.6508366524902525e-06,
"loss": 0.3232,
"step": 870
},
{
"epoch": 1.7330049261083744,
"grad_norm": 0.2856067008508303,
"learning_rate": 4.536644988318802e-06,
"loss": 0.3259,
"step": 880
},
{
"epoch": 1.7527093596059113,
"grad_norm": 0.2736902206570373,
"learning_rate": 4.4226966115922096e-06,
"loss": 0.3324,
"step": 890
},
{
"epoch": 1.7724137931034483,
"grad_norm": 0.3559568362235315,
"learning_rate": 4.3090513516178514e-06,
"loss": 0.3242,
"step": 900
},
{
"epoch": 1.7921182266009852,
"grad_norm": 0.2538152352841044,
"learning_rate": 4.195768878549766e-06,
"loss": 0.3201,
"step": 910
},
{
"epoch": 1.8118226600985223,
"grad_norm": 0.26598303066421064,
"learning_rate": 4.082908672058453e-06,
"loss": 0.3259,
"step": 920
},
{
"epoch": 1.8315270935960593,
"grad_norm": 0.28186912011428483,
"learning_rate": 3.970529990100706e-06,
"loss": 0.3257,
"step": 930
},
{
"epoch": 1.8512315270935962,
"grad_norm": 0.275845674431498,
"learning_rate": 3.8586918378058595e-06,
"loss": 0.3296,
"step": 940
},
{
"epoch": 1.870935960591133,
"grad_norm": 0.2634790774287366,
"learning_rate": 3.747452936494761e-06,
"loss": 0.328,
"step": 950
},
{
"epoch": 1.89064039408867,
"grad_norm": 0.3021503026215729,
"learning_rate": 3.636871692847791e-06,
"loss": 0.3224,
"step": 960
},
{
"epoch": 1.910344827586207,
"grad_norm": 0.271143161250696,
"learning_rate": 3.527006168238061e-06,
"loss": 0.3269,
"step": 970
},
{
"epoch": 1.9300492610837439,
"grad_norm": 0.2651897275909597,
"learning_rate": 3.417914048245927e-06,
"loss": 0.3247,
"step": 980
},
{
"epoch": 1.9497536945812808,
"grad_norm": 0.2608280493875229,
"learning_rate": 3.309652612370816e-06,
"loss": 0.3223,
"step": 990
},
{
"epoch": 1.9694581280788177,
"grad_norm": 0.25658236440730964,
"learning_rate": 3.2022787039562745e-06,
"loss": 0.3284,
"step": 1000
},
{
"epoch": 1.9891625615763546,
"grad_norm": 0.39452708935161934,
"learning_rate": 3.095848700344001e-06,
"loss": 0.3281,
"step": 1010
},
{
"epoch": 2.007881773399015,
"grad_norm": 0.27426147250441796,
"learning_rate": 2.990418483272579e-06,
"loss": 0.3169,
"step": 1020
},
{
"epoch": 2.027586206896552,
"grad_norm": 0.33623291768317054,
"learning_rate": 2.8860434095364266e-06,
"loss": 0.3069,
"step": 1030
},
{
"epoch": 2.0472906403940887,
"grad_norm": 0.4101482918037455,
"learning_rate": 2.7827782819203497e-06,
"loss": 0.3058,
"step": 1040
},
{
"epoch": 2.0669950738916256,
"grad_norm": 0.24962463377649954,
"learning_rate": 2.6806773204250148e-06,
"loss": 0.3065,
"step": 1050
},
{
"epoch": 2.0866995073891625,
"grad_norm": 0.2530338383655545,
"learning_rate": 2.579794133798388e-06,
"loss": 0.3051,
"step": 1060
},
{
"epoch": 2.1064039408866995,
"grad_norm": 0.26733489560435986,
"learning_rate": 2.4801816913881242e-06,
"loss": 0.3071,
"step": 1070
},
{
"epoch": 2.1261083743842364,
"grad_norm": 0.2672830744223254,
"learning_rate": 2.3818922953296937e-06,
"loss": 0.3056,
"step": 1080
},
{
"epoch": 2.1458128078817733,
"grad_norm": 0.2563106205327875,
"learning_rate": 2.2849775530848057e-06,
"loss": 0.2992,
"step": 1090
},
{
"epoch": 2.1655172413793102,
"grad_norm": 0.2606021808754117,
"learning_rate": 2.189488350344596e-06,
"loss": 0.3139,
"step": 1100
},
{
"epoch": 2.185221674876847,
"grad_norm": 0.26508188540176497,
"learning_rate": 2.095474824311769e-06,
"loss": 0.3031,
"step": 1110
},
{
"epoch": 2.204926108374384,
"grad_norm": 0.2997008301548407,
"learning_rate": 2.0029863373757553e-06,
"loss": 0.3066,
"step": 1120
},
{
"epoch": 2.224630541871921,
"grad_norm": 0.24564665419705128,
"learning_rate": 1.9120714511946746e-06,
"loss": 0.3027,
"step": 1130
},
{
"epoch": 2.2443349753694584,
"grad_norm": 0.24016732234530286,
"learning_rate": 1.822777901197738e-06,
"loss": 0.3047,
"step": 1140
},
{
"epoch": 2.264039408866995,
"grad_norm": 0.24884374496439782,
"learning_rate": 1.7351525715214512e-06,
"loss": 0.3012,
"step": 1150
},
{
"epoch": 2.283743842364532,
"grad_norm": 0.29955124615888423,
"learning_rate": 1.6492414703928277e-06,
"loss": 0.3024,
"step": 1160
},
{
"epoch": 2.303448275862069,
"grad_norm": 0.24995732946807675,
"learning_rate": 1.5650897059724545e-06,
"loss": 0.3072,
"step": 1170
},
{
"epoch": 2.323152709359606,
"grad_norm": 0.27721697299099485,
"learning_rate": 1.482741462670193e-06,
"loss": 0.3086,
"step": 1180
},
{
"epoch": 2.342857142857143,
"grad_norm": 0.2680931201078952,
"learning_rate": 1.4022399779458656e-06,
"loss": 0.3076,
"step": 1190
},
{
"epoch": 2.36256157635468,
"grad_norm": 0.25884126877718644,
"learning_rate": 1.3236275196071641e-06,
"loss": 0.3063,
"step": 1200
},
{
"epoch": 2.382266009852217,
"grad_norm": 0.24416102518796548,
"learning_rate": 1.2469453636166645e-06,
"loss": 0.3066,
"step": 1210
},
{
"epoch": 2.4019704433497537,
"grad_norm": 0.26922577960580724,
"learning_rate": 1.1722337724196365e-06,
"loss": 0.3119,
"step": 1220
},
{
"epoch": 2.4216748768472907,
"grad_norm": 0.25037887747041054,
"learning_rate": 1.0995319738039855e-06,
"loss": 0.313,
"step": 1230
},
{
"epoch": 2.4413793103448276,
"grad_norm": 0.2651461475029688,
"learning_rate": 1.028878140303462e-06,
"loss": 0.2988,
"step": 1240
},
{
"epoch": 2.4610837438423645,
"grad_norm": 0.2564753685149257,
"learning_rate": 9.603093691549348e-07,
"loss": 0.303,
"step": 1250
},
{
"epoch": 2.4807881773399014,
"grad_norm": 0.26344914469001607,
"learning_rate": 8.938616628202478e-07,
"loss": 0.3027,
"step": 1260
},
{
"epoch": 2.5004926108374383,
"grad_norm": 0.25207552584955384,
"learning_rate": 8.295699100829124e-07,
"loss": 0.3004,
"step": 1270
},
{
"epoch": 2.5201970443349753,
"grad_norm": 0.28458206050624457,
"learning_rate": 7.674678677295277e-07,
"loss": 0.3043,
"step": 1280
},
{
"epoch": 2.539901477832512,
"grad_norm": 0.23571286194786062,
"learning_rate": 7.07588142825571e-07,
"loss": 0.31,
"step": 1290
},
{
"epoch": 2.559605911330049,
"grad_norm": 0.24227892952770294,
"learning_rate": 6.499621755948487e-07,
"loss": 0.3014,
"step": 1300
},
{
"epoch": 2.5793103448275865,
"grad_norm": 0.23495263014727036,
"learning_rate": 5.946202229116227e-07,
"loss": 0.3078,
"step": 1310
},
{
"epoch": 2.599014778325123,
"grad_norm": 0.27243587575264466,
"learning_rate": 5.41591342414034e-07,
"loss": 0.3052,
"step": 1320
},
{
"epoch": 2.6187192118226603,
"grad_norm": 0.24767917319433275,
"learning_rate": 4.909033772472204e-07,
"loss": 0.3086,
"step": 1330
},
{
"epoch": 2.638423645320197,
"grad_norm": 0.3082430748983811,
"learning_rate": 4.42582941444093e-07,
"loss": 0.3008,
"step": 1340
},
{
"epoch": 2.658128078817734,
"grad_norm": 0.23995454448763587,
"learning_rate": 3.9665540595147376e-07,
"loss": 0.312,
"step": 1350
},
{
"epoch": 2.677832512315271,
"grad_norm": 0.26981926134895196,
"learning_rate": 3.531448853089192e-07,
"loss": 0.3052,
"step": 1360
},
{
"epoch": 2.697536945812808,
"grad_norm": 0.24323117710183706,
"learning_rate": 3.1207422498723663e-07,
"loss": 0.3065,
"step": 1370
},
{
"epoch": 2.717241379310345,
"grad_norm": 0.4168883045884567,
"learning_rate": 2.734649893933178e-07,
"loss": 0.306,
"step": 1380
},
{
"epoch": 2.736945812807882,
"grad_norm": 0.23949822475954996,
"learning_rate": 2.3733745054762059e-07,
"loss": 0.3052,
"step": 1390
},
{
"epoch": 2.7566502463054188,
"grad_norm": 0.22489392767199434,
"learning_rate": 2.0371057744021315e-07,
"loss": 0.3036,
"step": 1400
},
{
"epoch": 2.7763546798029557,
"grad_norm": 0.24001654591150326,
"learning_rate": 1.7260202607098985e-07,
"loss": 0.308,
"step": 1410
},
{
"epoch": 2.7960591133004926,
"grad_norm": 0.22628265532283418,
"learning_rate": 1.4402813017927396e-07,
"loss": 0.3014,
"step": 1420
},
{
"epoch": 2.8157635467980295,
"grad_norm": 0.29783439102497933,
"learning_rate": 1.1800389266769242e-07,
"loss": 0.3072,
"step": 1430
},
{
"epoch": 2.8354679802955665,
"grad_norm": 0.25141926113090335,
"learning_rate": 9.454297772480137e-08,
"loss": 0.3076,
"step": 1440
},
{
"epoch": 2.8551724137931034,
"grad_norm": 0.22821689683912336,
"learning_rate": 7.365770365062308e-08,
"loss": 0.3067,
"step": 1450
},
{
"epoch": 2.8748768472906403,
"grad_norm": 0.24290991296428727,
"learning_rate": 5.535903638884399e-08,
"loss": 0.3028,
"step": 1460
},
{
"epoch": 2.8945812807881772,
"grad_norm": 0.23018588484999283,
"learning_rate": 3.965658376907544e-08,
"loss": 0.302,
"step": 1470
},
{
"epoch": 2.914285714285714,
"grad_norm": 0.36088955224736075,
"learning_rate": 2.6558590462207322e-08,
"loss": 0.2996,
"step": 1480
},
{
"epoch": 2.933990147783251,
"grad_norm": 0.2339217161083891,
"learning_rate": 1.607193365148696e-08,
"loss": 0.3094,
"step": 1490
},
{
"epoch": 2.9536945812807884,
"grad_norm": 0.2448303897755478,
"learning_rate": 8.202119421615306e-09,
"loss": 0.3014,
"step": 1500
},
{
"epoch": 2.973399014778325,
"grad_norm": 0.430632356624608,
"learning_rate": 2.9532798677395226e-09,
"loss": 0.3043,
"step": 1510
},
{
"epoch": 2.9931034482758623,
"grad_norm": 0.2305975377394837,
"learning_rate": 3.2817092587345e-10,
"loss": 0.3066,
"step": 1520
}
],
"logging_steps": 10,
"max_steps": 1524,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5070977114308608.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}