{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1524,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019704433497536946,
      "grad_norm": 3.0279820123546215,
      "learning_rate": 5.882352941176471e-07,
      "loss": 0.5649,
      "step": 10
    },
    {
      "epoch": 0.03940886699507389,
      "grad_norm": 1.4615125615819697,
      "learning_rate": 1.2418300653594772e-06,
      "loss": 0.5603,
      "step": 20
    },
    {
      "epoch": 0.059113300492610835,
      "grad_norm": 0.8068217101006925,
      "learning_rate": 1.8954248366013072e-06,
      "loss": 0.5105,
      "step": 30
    },
    {
      "epoch": 0.07881773399014778,
      "grad_norm": 0.5614754695808303,
      "learning_rate": 2.549019607843137e-06,
      "loss": 0.4774,
      "step": 40
    },
    {
      "epoch": 0.09852216748768473,
      "grad_norm": 0.43718231534842905,
      "learning_rate": 3.2026143790849674e-06,
      "loss": 0.4583,
      "step": 50
    },
    {
      "epoch": 0.11822660098522167,
      "grad_norm": 0.39346793627188986,
      "learning_rate": 3.856209150326798e-06,
      "loss": 0.436,
      "step": 60
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.30749780652350023,
      "learning_rate": 4.509803921568628e-06,
      "loss": 0.4206,
      "step": 70
    },
    {
      "epoch": 0.15763546798029557,
      "grad_norm": 0.30868892828152295,
      "learning_rate": 5.163398692810458e-06,
      "loss": 0.41,
      "step": 80
    },
    {
      "epoch": 0.17733990147783252,
      "grad_norm": 0.32830592651082535,
      "learning_rate": 5.816993464052289e-06,
      "loss": 0.4113,
      "step": 90
    },
    {
      "epoch": 0.19704433497536947,
      "grad_norm": 0.32758990112743774,
      "learning_rate": 6.470588235294119e-06,
      "loss": 0.3994,
      "step": 100
    },
    {
      "epoch": 0.21674876847290642,
      "grad_norm": 0.2790374663664497,
      "learning_rate": 7.124183006535948e-06,
      "loss": 0.4015,
      "step": 110
    },
    {
      "epoch": 0.23645320197044334,
      "grad_norm": 0.30110721148157965,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.3969,
      "step": 120
    },
    {
      "epoch": 0.2561576354679803,
      "grad_norm": 0.30423786969760397,
      "learning_rate": 8.43137254901961e-06,
      "loss": 0.3987,
      "step": 130
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.31196936765334116,
      "learning_rate": 9.084967320261438e-06,
      "loss": 0.3913,
      "step": 140
    },
    {
      "epoch": 0.2955665024630542,
      "grad_norm": 0.37040976407183024,
      "learning_rate": 9.738562091503268e-06,
      "loss": 0.386,
      "step": 150
    },
    {
      "epoch": 0.31527093596059114,
      "grad_norm": 0.3549433136700143,
      "learning_rate": 9.999527436141312e-06,
      "loss": 0.3816,
      "step": 160
    },
    {
      "epoch": 0.33497536945812806,
      "grad_norm": 0.39718414457173384,
      "learning_rate": 9.996639869374844e-06,
      "loss": 0.3905,
      "step": 170
    },
    {
      "epoch": 0.35467980295566504,
      "grad_norm": 0.38713568972177087,
      "learning_rate": 9.991128785615903e-06,
      "loss": 0.374,
      "step": 180
    },
    {
      "epoch": 0.37438423645320196,
      "grad_norm": 0.30293306376256685,
      "learning_rate": 9.982997078493457e-06,
      "loss": 0.3802,
      "step": 190
    },
    {
      "epoch": 0.39408866995073893,
      "grad_norm": 0.30182611299554746,
      "learning_rate": 9.972249017611153e-06,
      "loss": 0.3824,
      "step": 200
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.3037149817652602,
      "learning_rate": 9.958890246305534e-06,
      "loss": 0.3698,
      "step": 210
    },
    {
      "epoch": 0.43349753694581283,
      "grad_norm": 0.36304856528027657,
      "learning_rate": 9.942927778682968e-06,
      "loss": 0.3764,
      "step": 220
    },
    {
      "epoch": 0.45320197044334976,
      "grad_norm": 0.3347217336710472,
      "learning_rate": 9.924369995936846e-06,
      "loss": 0.3718,
      "step": 230
    },
    {
      "epoch": 0.4729064039408867,
      "grad_norm": 0.2971293315400481,
      "learning_rate": 9.903226641946982e-06,
      "loss": 0.3754,
      "step": 240
    },
    {
      "epoch": 0.49261083743842365,
      "grad_norm": 0.3287026933137672,
      "learning_rate": 9.879508818163536e-06,
      "loss": 0.3702,
      "step": 250
    },
    {
      "epoch": 0.5123152709359606,
      "grad_norm": 0.3156479952472752,
      "learning_rate": 9.853228977778125e-06,
      "loss": 0.3728,
      "step": 260
    },
    {
      "epoch": 0.5320197044334976,
      "grad_norm": 0.32319185406715967,
      "learning_rate": 9.82440091918519e-06,
      "loss": 0.3697,
      "step": 270
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.3264752725365435,
      "learning_rate": 9.79303977873707e-06,
      "loss": 0.3758,
      "step": 280
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.2681401549401077,
      "learning_rate": 9.759162022796566e-06,
      "loss": 0.3698,
      "step": 290
    },
    {
      "epoch": 0.5911330049261084,
      "grad_norm": 0.2906543480857407,
      "learning_rate": 9.722785439091172e-06,
      "loss": 0.3696,
      "step": 300
    },
    {
      "epoch": 0.6108374384236454,
      "grad_norm": 0.38568495214922205,
      "learning_rate": 9.683929127373514e-06,
      "loss": 0.3689,
      "step": 310
    },
    {
      "epoch": 0.6305418719211823,
      "grad_norm": 0.38892801923451475,
      "learning_rate": 9.642613489392916e-06,
      "loss": 0.3556,
      "step": 320
    },
    {
      "epoch": 0.6502463054187192,
      "grad_norm": 0.2965648673892553,
      "learning_rate": 9.598860218183318e-06,
      "loss": 0.3619,
      "step": 330
    },
    {
      "epoch": 0.6699507389162561,
      "grad_norm": 0.31678244496188085,
      "learning_rate": 9.552692286673231e-06,
      "loss": 0.3663,
      "step": 340
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.3483919487665228,
      "learning_rate": 9.504133935623643e-06,
      "loss": 0.3581,
      "step": 350
    },
    {
      "epoch": 0.7093596059113301,
      "grad_norm": 0.3404931966180023,
      "learning_rate": 9.453210660900264e-06,
      "loss": 0.3563,
      "step": 360
    },
    {
      "epoch": 0.729064039408867,
      "grad_norm": 0.38124885036911094,
      "learning_rate": 9.399949200086757e-06,
      "loss": 0.3588,
      "step": 370
    },
    {
      "epoch": 0.7487684729064039,
      "grad_norm": 0.3118666499933069,
      "learning_rate": 9.344377518446006e-06,
      "loss": 0.3628,
      "step": 380
    },
    {
      "epoch": 0.7684729064039408,
      "grad_norm": 0.38361684678555974,
      "learning_rate": 9.286524794236783e-06,
      "loss": 0.361,
      "step": 390
    },
    {
      "epoch": 0.7881773399014779,
      "grad_norm": 0.32779346459943115,
      "learning_rate": 9.226421403393513e-06,
      "loss": 0.3642,
      "step": 400
    },
    {
      "epoch": 0.8078817733990148,
      "grad_norm": 0.2992967856385497,
      "learning_rate": 9.164098903577203e-06,
      "loss": 0.3573,
      "step": 410
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.3205591263261017,
      "learning_rate": 9.099590017605903e-06,
      "loss": 0.3549,
      "step": 420
    },
    {
      "epoch": 0.8472906403940886,
      "grad_norm": 0.2678093580785744,
      "learning_rate": 9.032928616273369e-06,
      "loss": 0.3594,
      "step": 430
    },
    {
      "epoch": 0.8669950738916257,
      "grad_norm": 0.32355043253019766,
      "learning_rate": 8.964149700565006e-06,
      "loss": 0.3602,
      "step": 440
    },
    {
      "epoch": 0.8866995073891626,
      "grad_norm": 0.2866543725885009,
      "learning_rate": 8.893289383280379e-06,
      "loss": 0.3524,
      "step": 450
    },
    {
      "epoch": 0.9064039408866995,
      "grad_norm": 0.33569301955376707,
      "learning_rate": 8.820384870071951e-06,
      "loss": 0.3484,
      "step": 460
    },
    {
      "epoch": 0.9261083743842364,
      "grad_norm": 0.3325781399033863,
      "learning_rate": 8.745474439910043e-06,
      "loss": 0.3549,
      "step": 470
    },
    {
      "epoch": 0.9458128078817734,
      "grad_norm": 0.3390519215844686,
      "learning_rate": 8.668597424984196e-06,
      "loss": 0.3533,
      "step": 480
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.30939662294936154,
      "learning_rate": 8.589794190051582e-06,
      "loss": 0.3549,
      "step": 490
    },
    {
      "epoch": 0.9852216748768473,
      "grad_norm": 0.32579852375721036,
      "learning_rate": 8.509106111243223e-06,
      "loss": 0.3521,
      "step": 500
    },
    {
      "epoch": 1.0039408866995074,
      "grad_norm": 0.31743200314693687,
      "learning_rate": 8.4265755543392e-06,
      "loss": 0.3597,
      "step": 510
    },
    {
      "epoch": 1.0236453201970444,
      "grad_norm": 0.3517094413633957,
      "learning_rate": 8.342245852524229e-06,
      "loss": 0.3329,
      "step": 520
    },
    {
      "epoch": 1.0433497536945813,
      "grad_norm": 0.3386212184889377,
      "learning_rate": 8.256161283635315e-06,
      "loss": 0.3348,
      "step": 530
    },
    {
      "epoch": 1.0630541871921182,
      "grad_norm": 0.3461407262910111,
      "learning_rate": 8.16836704691338e-06,
      "loss": 0.3284,
      "step": 540
    },
    {
      "epoch": 1.0827586206896551,
      "grad_norm": 0.3175184396382611,
      "learning_rate": 8.078909239271127e-06,
      "loss": 0.3341,
      "step": 550
    },
    {
      "epoch": 1.102463054187192,
      "grad_norm": 0.27454177241292593,
      "learning_rate": 7.987834831089576e-06,
      "loss": 0.3392,
      "step": 560
    },
    {
      "epoch": 1.1221674876847292,
      "grad_norm": 0.2955431213720922,
      "learning_rate": 7.895191641555957e-06,
      "loss": 0.3319,
      "step": 570
    },
    {
      "epoch": 1.141871921182266,
      "grad_norm": 0.3260291195524149,
      "learning_rate": 7.801028313555954e-06,
      "loss": 0.3364,
      "step": 580
    },
    {
      "epoch": 1.161576354679803,
      "grad_norm": 0.36803259706135694,
      "learning_rate": 7.705394288133459e-06,
      "loss": 0.329,
      "step": 590
    },
    {
      "epoch": 1.18128078817734,
      "grad_norm": 0.31736745368430674,
      "learning_rate": 7.60833977853123e-06,
      "loss": 0.3332,
      "step": 600
    },
    {
      "epoch": 1.2009852216748769,
      "grad_norm": 0.2814842646318462,
      "learning_rate": 7.509915743826128e-06,
      "loss": 0.3278,
      "step": 610
    },
    {
      "epoch": 1.2206896551724138,
      "grad_norm": 0.28225717787284943,
      "learning_rate": 7.4101738621727245e-06,
      "loss": 0.3296,
      "step": 620
    },
    {
      "epoch": 1.2403940886699507,
      "grad_norm": 0.3073951939120494,
      "learning_rate": 7.3091665036693716e-06,
      "loss": 0.3253,
      "step": 630
    },
    {
      "epoch": 1.2600985221674876,
      "grad_norm": 0.2909750317016804,
      "learning_rate": 7.206946702860948e-06,
      "loss": 0.3329,
      "step": 640
    },
    {
      "epoch": 1.2798029556650246,
      "grad_norm": 0.3195502698394791,
      "learning_rate": 7.103568130892742e-06,
      "loss": 0.3358,
      "step": 650
    },
    {
      "epoch": 1.2995073891625615,
      "grad_norm": 0.2955230796655043,
      "learning_rate": 6.999085067330085e-06,
      "loss": 0.331,
      "step": 660
    },
    {
      "epoch": 1.3192118226600984,
      "grad_norm": 0.30204789259744014,
      "learning_rate": 6.8935523716585195e-06,
      "loss": 0.3313,
      "step": 670
    },
    {
      "epoch": 1.3389162561576355,
      "grad_norm": 0.2683292800793258,
      "learning_rate": 6.787025454479489e-06,
      "loss": 0.3229,
      "step": 680
    },
    {
      "epoch": 1.3586206896551725,
      "grad_norm": 0.26418537588888813,
      "learning_rate": 6.679560248416652e-06,
      "loss": 0.3297,
      "step": 690
    },
    {
      "epoch": 1.3783251231527094,
      "grad_norm": 0.2909070001901888,
      "learning_rate": 6.571213178748112e-06,
      "loss": 0.3319,
      "step": 700
    },
    {
      "epoch": 1.3980295566502463,
      "grad_norm": 0.28638818406094363,
      "learning_rate": 6.462041133779969e-06,
      "loss": 0.3309,
      "step": 710
    },
    {
      "epoch": 1.4177339901477832,
      "grad_norm": 0.27868432298930257,
      "learning_rate": 6.352101434976761e-06,
      "loss": 0.3322,
      "step": 720
    },
    {
      "epoch": 1.4374384236453202,
      "grad_norm": 0.29588673430919354,
      "learning_rate": 6.241451806864465e-06,
      "loss": 0.336,
      "step": 730
    },
    {
      "epoch": 1.457142857142857,
      "grad_norm": 0.30596222012886065,
      "learning_rate": 6.130150346721888e-06,
      "loss": 0.3229,
      "step": 740
    },
    {
      "epoch": 1.4768472906403942,
      "grad_norm": 0.2648591334136923,
      "learning_rate": 6.018255494076309e-06,
      "loss": 0.3219,
      "step": 750
    },
    {
      "epoch": 1.4965517241379311,
      "grad_norm": 0.28293115551723064,
      "learning_rate": 5.905826000019458e-06,
      "loss": 0.3294,
      "step": 760
    },
    {
      "epoch": 1.516256157635468,
      "grad_norm": 0.27582313122373875,
      "learning_rate": 5.79292089635987e-06,
      "loss": 0.3296,
      "step": 770
    },
    {
      "epoch": 1.535960591133005,
      "grad_norm": 0.2812038009084268,
      "learning_rate": 5.679599464627885e-06,
      "loss": 0.3316,
      "step": 780
    },
    {
      "epoch": 1.555665024630542,
      "grad_norm": 0.2944019123260006,
      "learning_rate": 5.5659212049494915e-06,
      "loss": 0.3328,
      "step": 790
    },
    {
      "epoch": 1.5753694581280788,
      "grad_norm": 0.29904709651401157,
      "learning_rate": 5.451945804805425e-06,
      "loss": 0.324,
      "step": 800
    },
    {
      "epoch": 1.5950738916256157,
      "grad_norm": 0.2784023488610782,
      "learning_rate": 5.337733107691879e-06,
      "loss": 0.3234,
      "step": 810
    },
    {
      "epoch": 1.6147783251231527,
      "grad_norm": 0.4116398811741189,
      "learning_rate": 5.223343081699302e-06,
      "loss": 0.3311,
      "step": 820
    },
    {
      "epoch": 1.6344827586206896,
      "grad_norm": 0.30015526206975013,
      "learning_rate": 5.108835788025782e-06,
      "loss": 0.3259,
      "step": 830
    },
    {
      "epoch": 1.6541871921182265,
      "grad_norm": 0.2929610632606337,
      "learning_rate": 4.994271349441534e-06,
      "loss": 0.3231,
      "step": 840
    },
    {
      "epoch": 1.6738916256157634,
      "grad_norm": 0.28794646555718384,
      "learning_rate": 4.879709918721067e-06,
      "loss": 0.3216,
      "step": 850
    },
    {
      "epoch": 1.6935960591133004,
      "grad_norm": 0.3039465901272877,
      "learning_rate": 4.76521164705959e-06,
      "loss": 0.3255,
      "step": 860
    },
    {
      "epoch": 1.7133004926108373,
      "grad_norm": 0.2908141512608896,
      "learning_rate": 4.6508366524902525e-06,
      "loss": 0.3232,
      "step": 870
    },
    {
      "epoch": 1.7330049261083744,
      "grad_norm": 0.2856067008508303,
      "learning_rate": 4.536644988318802e-06,
      "loss": 0.3259,
      "step": 880
    },
    {
      "epoch": 1.7527093596059113,
      "grad_norm": 0.2736902206570373,
      "learning_rate": 4.4226966115922096e-06,
      "loss": 0.3324,
      "step": 890
    },
    {
      "epoch": 1.7724137931034483,
      "grad_norm": 0.3559568362235315,
      "learning_rate": 4.3090513516178514e-06,
      "loss": 0.3242,
      "step": 900
    },
    {
      "epoch": 1.7921182266009852,
      "grad_norm": 0.2538152352841044,
      "learning_rate": 4.195768878549766e-06,
      "loss": 0.3201,
      "step": 910
    },
    {
      "epoch": 1.8118226600985223,
      "grad_norm": 0.26598303066421064,
      "learning_rate": 4.082908672058453e-06,
      "loss": 0.3259,
      "step": 920
    },
    {
      "epoch": 1.8315270935960593,
      "grad_norm": 0.28186912011428483,
      "learning_rate": 3.970529990100706e-06,
      "loss": 0.3257,
      "step": 930
    },
    {
      "epoch": 1.8512315270935962,
      "grad_norm": 0.275845674431498,
      "learning_rate": 3.8586918378058595e-06,
      "loss": 0.3296,
      "step": 940
    },
    {
      "epoch": 1.870935960591133,
      "grad_norm": 0.2634790774287366,
      "learning_rate": 3.747452936494761e-06,
      "loss": 0.328,
      "step": 950
    },
    {
      "epoch": 1.89064039408867,
      "grad_norm": 0.3021503026215729,
      "learning_rate": 3.636871692847791e-06,
      "loss": 0.3224,
      "step": 960
    },
    {
      "epoch": 1.910344827586207,
      "grad_norm": 0.271143161250696,
      "learning_rate": 3.527006168238061e-06,
      "loss": 0.3269,
      "step": 970
    },
    {
      "epoch": 1.9300492610837439,
      "grad_norm": 0.2651897275909597,
      "learning_rate": 3.417914048245927e-06,
      "loss": 0.3247,
      "step": 980
    },
    {
      "epoch": 1.9497536945812808,
      "grad_norm": 0.2608280493875229,
      "learning_rate": 3.309652612370816e-06,
      "loss": 0.3223,
      "step": 990
    },
    {
      "epoch": 1.9694581280788177,
      "grad_norm": 0.25658236440730964,
      "learning_rate": 3.2022787039562745e-06,
      "loss": 0.3284,
      "step": 1000
    },
    {
      "epoch": 1.9891625615763546,
      "grad_norm": 0.39452708935161934,
      "learning_rate": 3.095848700344001e-06,
      "loss": 0.3281,
      "step": 1010
    },
    {
      "epoch": 2.007881773399015,
      "grad_norm": 0.27426147250441796,
      "learning_rate": 2.990418483272579e-06,
      "loss": 0.3169,
      "step": 1020
    },
    {
      "epoch": 2.027586206896552,
      "grad_norm": 0.33623291768317054,
      "learning_rate": 2.8860434095364266e-06,
      "loss": 0.3069,
      "step": 1030
    },
    {
      "epoch": 2.0472906403940887,
      "grad_norm": 0.4101482918037455,
      "learning_rate": 2.7827782819203497e-06,
      "loss": 0.3058,
      "step": 1040
    },
    {
      "epoch": 2.0669950738916256,
      "grad_norm": 0.24962463377649954,
      "learning_rate": 2.6806773204250148e-06,
      "loss": 0.3065,
      "step": 1050
    },
    {
      "epoch": 2.0866995073891625,
      "grad_norm": 0.2530338383655545,
      "learning_rate": 2.579794133798388e-06,
      "loss": 0.3051,
      "step": 1060
    },
    {
      "epoch": 2.1064039408866995,
      "grad_norm": 0.26733489560435986,
      "learning_rate": 2.4801816913881242e-06,
      "loss": 0.3071,
      "step": 1070
    },
    {
      "epoch": 2.1261083743842364,
      "grad_norm": 0.2672830744223254,
      "learning_rate": 2.3818922953296937e-06,
      "loss": 0.3056,
      "step": 1080
    },
    {
      "epoch": 2.1458128078817733,
      "grad_norm": 0.2563106205327875,
      "learning_rate": 2.2849775530848057e-06,
      "loss": 0.2992,
      "step": 1090
    },
    {
      "epoch": 2.1655172413793102,
      "grad_norm": 0.2606021808754117,
      "learning_rate": 2.189488350344596e-06,
      "loss": 0.3139,
      "step": 1100
    },
    {
      "epoch": 2.185221674876847,
      "grad_norm": 0.26508188540176497,
      "learning_rate": 2.095474824311769e-06,
      "loss": 0.3031,
      "step": 1110
    },
    {
      "epoch": 2.204926108374384,
      "grad_norm": 0.2997008301548407,
      "learning_rate": 2.0029863373757553e-06,
      "loss": 0.3066,
      "step": 1120
    },
    {
      "epoch": 2.224630541871921,
      "grad_norm": 0.24564665419705128,
      "learning_rate": 1.9120714511946746e-06,
      "loss": 0.3027,
      "step": 1130
    },
    {
      "epoch": 2.2443349753694584,
      "grad_norm": 0.24016732234530286,
      "learning_rate": 1.822777901197738e-06,
      "loss": 0.3047,
      "step": 1140
    },
    {
      "epoch": 2.264039408866995,
      "grad_norm": 0.24884374496439782,
      "learning_rate": 1.7351525715214512e-06,
      "loss": 0.3012,
      "step": 1150
    },
    {
      "epoch": 2.283743842364532,
      "grad_norm": 0.29955124615888423,
      "learning_rate": 1.6492414703928277e-06,
      "loss": 0.3024,
      "step": 1160
    },
    {
      "epoch": 2.303448275862069,
      "grad_norm": 0.24995732946807675,
      "learning_rate": 1.5650897059724545e-06,
      "loss": 0.3072,
      "step": 1170
    },
    {
      "epoch": 2.323152709359606,
      "grad_norm": 0.27721697299099485,
      "learning_rate": 1.482741462670193e-06,
      "loss": 0.3086,
      "step": 1180
    },
    {
      "epoch": 2.342857142857143,
      "grad_norm": 0.2680931201078952,
      "learning_rate": 1.4022399779458656e-06,
      "loss": 0.3076,
      "step": 1190
    },
    {
      "epoch": 2.36256157635468,
      "grad_norm": 0.25884126877718644,
      "learning_rate": 1.3236275196071641e-06,
      "loss": 0.3063,
      "step": 1200
    },
    {
      "epoch": 2.382266009852217,
      "grad_norm": 0.24416102518796548,
      "learning_rate": 1.2469453636166645e-06,
      "loss": 0.3066,
      "step": 1210
    },
    {
      "epoch": 2.4019704433497537,
      "grad_norm": 0.26922577960580724,
      "learning_rate": 1.1722337724196365e-06,
      "loss": 0.3119,
      "step": 1220
    },
    {
      "epoch": 2.4216748768472907,
      "grad_norm": 0.25037887747041054,
      "learning_rate": 1.0995319738039855e-06,
      "loss": 0.313,
      "step": 1230
    },
    {
      "epoch": 2.4413793103448276,
      "grad_norm": 0.2651461475029688,
      "learning_rate": 1.028878140303462e-06,
      "loss": 0.2988,
      "step": 1240
    },
    {
      "epoch": 2.4610837438423645,
      "grad_norm": 0.2564753685149257,
      "learning_rate": 9.603093691549348e-07,
      "loss": 0.303,
      "step": 1250
    },
    {
      "epoch": 2.4807881773399014,
      "grad_norm": 0.26344914469001607,
      "learning_rate": 8.938616628202478e-07,
      "loss": 0.3027,
      "step": 1260
    },
    {
      "epoch": 2.5004926108374383,
      "grad_norm": 0.25207552584955384,
      "learning_rate": 8.295699100829124e-07,
      "loss": 0.3004,
      "step": 1270
    },
    {
      "epoch": 2.5201970443349753,
      "grad_norm": 0.28458206050624457,
      "learning_rate": 7.674678677295277e-07,
      "loss": 0.3043,
      "step": 1280
    },
    {
      "epoch": 2.539901477832512,
      "grad_norm": 0.23571286194786062,
      "learning_rate": 7.07588142825571e-07,
      "loss": 0.31,
      "step": 1290
    },
    {
      "epoch": 2.559605911330049,
      "grad_norm": 0.24227892952770294,
      "learning_rate": 6.499621755948487e-07,
      "loss": 0.3014,
      "step": 1300
    },
    {
      "epoch": 2.5793103448275865,
      "grad_norm": 0.23495263014727036,
      "learning_rate": 5.946202229116227e-07,
      "loss": 0.3078,
      "step": 1310
    },
    {
      "epoch": 2.599014778325123,
      "grad_norm": 0.27243587575264466,
      "learning_rate": 5.41591342414034e-07,
      "loss": 0.3052,
      "step": 1320
    },
    {
      "epoch": 2.6187192118226603,
      "grad_norm": 0.24767917319433275,
      "learning_rate": 4.909033772472204e-07,
      "loss": 0.3086,
      "step": 1330
    },
    {
      "epoch": 2.638423645320197,
      "grad_norm": 0.3082430748983811,
      "learning_rate": 4.42582941444093e-07,
      "loss": 0.3008,
      "step": 1340
    },
    {
      "epoch": 2.658128078817734,
      "grad_norm": 0.23995454448763587,
      "learning_rate": 3.9665540595147376e-07,
      "loss": 0.312,
      "step": 1350
    },
    {
      "epoch": 2.677832512315271,
      "grad_norm": 0.26981926134895196,
      "learning_rate": 3.531448853089192e-07,
      "loss": 0.3052,
      "step": 1360
    },
    {
      "epoch": 2.697536945812808,
      "grad_norm": 0.24323117710183706,
      "learning_rate": 3.1207422498723663e-07,
      "loss": 0.3065,
      "step": 1370
    },
    {
      "epoch": 2.717241379310345,
      "grad_norm": 0.4168883045884567,
      "learning_rate": 2.734649893933178e-07,
      "loss": 0.306,
      "step": 1380
    },
    {
      "epoch": 2.736945812807882,
      "grad_norm": 0.23949822475954996,
      "learning_rate": 2.3733745054762059e-07,
      "loss": 0.3052,
      "step": 1390
    },
    {
      "epoch": 2.7566502463054188,
      "grad_norm": 0.22489392767199434,
      "learning_rate": 2.0371057744021315e-07,
      "loss": 0.3036,
      "step": 1400
    },
    {
      "epoch": 2.7763546798029557,
      "grad_norm": 0.24001654591150326,
      "learning_rate": 1.7260202607098985e-07,
      "loss": 0.308,
      "step": 1410
    },
    {
      "epoch": 2.7960591133004926,
      "grad_norm": 0.22628265532283418,
      "learning_rate": 1.4402813017927396e-07,
      "loss": 0.3014,
      "step": 1420
    },
    {
      "epoch": 2.8157635467980295,
      "grad_norm": 0.29783439102497933,
      "learning_rate": 1.1800389266769242e-07,
      "loss": 0.3072,
      "step": 1430
    },
    {
      "epoch": 2.8354679802955665,
      "grad_norm": 0.25141926113090335,
      "learning_rate": 9.454297772480137e-08,
      "loss": 0.3076,
      "step": 1440
    },
    {
      "epoch": 2.8551724137931034,
      "grad_norm": 0.22821689683912336,
      "learning_rate": 7.365770365062308e-08,
      "loss": 0.3067,
      "step": 1450
    },
    {
      "epoch": 2.8748768472906403,
      "grad_norm": 0.24290991296428727,
      "learning_rate": 5.535903638884399e-08,
      "loss": 0.3028,
      "step": 1460
    },
    {
      "epoch": 2.8945812807881772,
      "grad_norm": 0.23018588484999283,
      "learning_rate": 3.965658376907544e-08,
      "loss": 0.302,
      "step": 1470
    },
    {
      "epoch": 2.914285714285714,
      "grad_norm": 0.36088955224736075,
      "learning_rate": 2.6558590462207322e-08,
      "loss": 0.2996,
      "step": 1480
    },
    {
      "epoch": 2.933990147783251,
      "grad_norm": 0.2339217161083891,
      "learning_rate": 1.607193365148696e-08,
      "loss": 0.3094,
      "step": 1490
    },
    {
      "epoch": 2.9536945812807884,
      "grad_norm": 0.2448303897755478,
      "learning_rate": 8.202119421615306e-09,
      "loss": 0.3014,
      "step": 1500
    },
    {
      "epoch": 2.973399014778325,
      "grad_norm": 0.430632356624608,
      "learning_rate": 2.9532798677395226e-09,
      "loss": 0.3043,
      "step": 1510
    },
    {
      "epoch": 2.9931034482758623,
      "grad_norm": 0.2305975377394837,
      "learning_rate": 3.2817092587345e-10,
      "loss": 0.3066,
      "step": 1520
    }
  ],
  "logging_steps": 10,
  "max_steps": 1524,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10000000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5070977114308608.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}