{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 25.0,
"eval_steps": 500,
"global_step": 39775,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06285355122564425,
"grad_norm": 608.9674682617188,
"learning_rate": 4.844437460716531e-05,
"loss": 14.2524,
"step": 100
},
{
"epoch": 0.1257071024512885,
"grad_norm": 34.65327453613281,
"learning_rate": 4.6873035826524205e-05,
"loss": 10.3562,
"step": 200
},
{
"epoch": 0.18856065367693275,
"grad_norm": 21.24808120727539,
"learning_rate": 4.5301697045883096e-05,
"loss": 7.8551,
"step": 300
},
{
"epoch": 0.251414204902577,
"grad_norm": 17.404918670654297,
"learning_rate": 4.373035826524199e-05,
"loss": 6.6346,
"step": 400
},
{
"epoch": 0.3142677561282212,
"grad_norm": 12.713433265686035,
"learning_rate": 4.2159019484600884e-05,
"loss": 5.9755,
"step": 500
},
{
"epoch": 0.3771213073538655,
"grad_norm": 10.050477981567383,
"learning_rate": 4.0587680703959775e-05,
"loss": 5.5595,
"step": 600
},
{
"epoch": 0.43997485857950974,
"grad_norm": 13.709216117858887,
"learning_rate": 3.9016341923318666e-05,
"loss": 5.2853,
"step": 700
},
{
"epoch": 0.502828409805154,
"grad_norm": 9.112940788269043,
"learning_rate": 3.744500314267756e-05,
"loss": 5.1417,
"step": 800
},
{
"epoch": 0.5656819610307983,
"grad_norm": 8.267425537109375,
"learning_rate": 3.587366436203646e-05,
"loss": 4.9615,
"step": 900
},
{
"epoch": 0.6285355122564424,
"grad_norm": 9.709076881408691,
"learning_rate": 3.430232558139535e-05,
"loss": 4.6907,
"step": 1000
},
{
"epoch": 0.6913890634820867,
"grad_norm": 845.80859375,
"learning_rate": 3.273098680075424e-05,
"loss": 4.5456,
"step": 1100
},
{
"epoch": 0.754242614707731,
"grad_norm": 5.943735599517822,
"learning_rate": 3.115964802011313e-05,
"loss": 4.4291,
"step": 1200
},
{
"epoch": 0.8170961659333752,
"grad_norm": 5.8759989738464355,
"learning_rate": 2.9588309239472034e-05,
"loss": 4.3252,
"step": 1300
},
{
"epoch": 0.8799497171590195,
"grad_norm": 14.995753288269043,
"learning_rate": 2.8016970458830928e-05,
"loss": 4.2586,
"step": 1400
},
{
"epoch": 0.9428032683846638,
"grad_norm": 23.3351993560791,
"learning_rate": 2.644563167818982e-05,
"loss": 4.1372,
"step": 1500
},
{
"epoch": 1.0,
"eval_loss": 3.215750217437744,
"eval_runtime": 19.7611,
"eval_samples_per_second": 48.479,
"eval_steps_per_second": 6.073,
"step": 1591
},
{
"epoch": 1.005656819610308,
"grad_norm": 8.584565162658691,
"learning_rate": 2.4874292897548713e-05,
"loss": 4.0272,
"step": 1600
},
{
"epoch": 1.0685103708359522,
"grad_norm": 6.45043420791626,
"learning_rate": 2.3302954116907607e-05,
"loss": 3.9602,
"step": 1700
},
{
"epoch": 1.1313639220615965,
"grad_norm": 6.03476095199585,
"learning_rate": 2.17316153362665e-05,
"loss": 3.9052,
"step": 1800
},
{
"epoch": 1.1942174732872408,
"grad_norm": 5.746309280395508,
"learning_rate": 2.0160276555625392e-05,
"loss": 3.9282,
"step": 1900
},
{
"epoch": 1.2570710245128849,
"grad_norm": 8.062549591064453,
"learning_rate": 1.858893777498429e-05,
"loss": 3.8096,
"step": 2000
},
{
"epoch": 1.3199245757385292,
"grad_norm": 8.58310317993164,
"learning_rate": 1.701759899434318e-05,
"loss": 3.803,
"step": 2100
},
{
"epoch": 1.3827781269641735,
"grad_norm": 7.599905490875244,
"learning_rate": 1.5446260213702074e-05,
"loss": 3.8381,
"step": 2200
},
{
"epoch": 1.4456316781898177,
"grad_norm": 22.772512435913086,
"learning_rate": 1.3874921433060969e-05,
"loss": 3.6456,
"step": 2300
},
{
"epoch": 1.508485229415462,
"grad_norm": 6.949570178985596,
"learning_rate": 1.2303582652419863e-05,
"loss": 3.7442,
"step": 2400
},
{
"epoch": 1.5713387806411063,
"grad_norm": 5.7536821365356445,
"learning_rate": 1.0732243871778757e-05,
"loss": 3.691,
"step": 2500
},
{
"epoch": 1.6341923318667506,
"grad_norm": 55.64060974121094,
"learning_rate": 9.160905091137651e-06,
"loss": 3.7461,
"step": 2600
},
{
"epoch": 1.6970458830923947,
"grad_norm": 6.573077201843262,
"learning_rate": 7.589566310496543e-06,
"loss": 3.6186,
"step": 2700
},
{
"epoch": 1.759899434318039,
"grad_norm": 8.615326881408691,
"learning_rate": 6.018227529855437e-06,
"loss": 3.6546,
"step": 2800
},
{
"epoch": 1.8227529855436833,
"grad_norm": 6.359428405761719,
"learning_rate": 4.446888749214331e-06,
"loss": 3.5724,
"step": 2900
},
{
"epoch": 1.8856065367693273,
"grad_norm": 5.5190582275390625,
"learning_rate": 2.8755499685732243e-06,
"loss": 3.6164,
"step": 3000
},
{
"epoch": 1.9484600879949716,
"grad_norm": 5.9382004737854,
"learning_rate": 1.3042111879321182e-06,
"loss": 3.52,
"step": 3100
},
{
"epoch": 2.0,
"eval_loss": 2.803544521331787,
"eval_runtime": 19.8643,
"eval_samples_per_second": 48.227,
"eval_steps_per_second": 6.041,
"step": 3182
},
{
"epoch": 2.011313639220616,
"grad_norm": 10.074417114257812,
"learning_rate": 3.9946574481458206e-05,
"loss": 3.5087,
"step": 3200
},
{
"epoch": 2.07416719044626,
"grad_norm": 6.9990434646606445,
"learning_rate": 3.963230672532998e-05,
"loss": 3.5746,
"step": 3300
},
{
"epoch": 2.1370207416719045,
"grad_norm": 6.968172073364258,
"learning_rate": 3.931803896920176e-05,
"loss": 3.6324,
"step": 3400
},
{
"epoch": 2.1998742928975488,
"grad_norm": 179.99803161621094,
"learning_rate": 3.9003771213073545e-05,
"loss": 3.4072,
"step": 3500
},
{
"epoch": 2.262727844123193,
"grad_norm": 59.86805725097656,
"learning_rate": 3.868950345694532e-05,
"loss": 3.391,
"step": 3600
},
{
"epoch": 2.3255813953488373,
"grad_norm": 7.445355415344238,
"learning_rate": 3.83752357008171e-05,
"loss": 3.2032,
"step": 3700
},
{
"epoch": 2.3884349465744816,
"grad_norm": 5.553746700286865,
"learning_rate": 3.806096794468888e-05,
"loss": 3.3644,
"step": 3800
},
{
"epoch": 2.4512884978001255,
"grad_norm": 6.544325351715088,
"learning_rate": 3.7746700188560656e-05,
"loss": 3.1666,
"step": 3900
},
{
"epoch": 2.5141420490257698,
"grad_norm": 7.863962650299072,
"learning_rate": 3.7432432432432436e-05,
"loss": 3.1982,
"step": 4000
},
{
"epoch": 2.576995600251414,
"grad_norm": 10.573624610900879,
"learning_rate": 3.7118164676304215e-05,
"loss": 3.1336,
"step": 4100
},
{
"epoch": 2.6398491514770583,
"grad_norm": 8.506134986877441,
"learning_rate": 3.680389692017599e-05,
"loss": 3.0191,
"step": 4200
},
{
"epoch": 2.7027027027027026,
"grad_norm": 7.1274518966674805,
"learning_rate": 3.6489629164047774e-05,
"loss": 3.003,
"step": 4300
},
{
"epoch": 2.765556253928347,
"grad_norm": 5.121671199798584,
"learning_rate": 3.617536140791955e-05,
"loss": 3.085,
"step": 4400
},
{
"epoch": 2.828409805153991,
"grad_norm": 6.66685152053833,
"learning_rate": 3.5861093651791327e-05,
"loss": 3.0205,
"step": 4500
},
{
"epoch": 2.8912633563796355,
"grad_norm": 8.410430908203125,
"learning_rate": 3.5546825895663106e-05,
"loss": 2.9611,
"step": 4600
},
{
"epoch": 2.95411690760528,
"grad_norm": 6.266846179962158,
"learning_rate": 3.5232558139534886e-05,
"loss": 2.9299,
"step": 4700
},
{
"epoch": 3.0,
"eval_loss": 2.3084471225738525,
"eval_runtime": 20.0337,
"eval_samples_per_second": 47.819,
"eval_steps_per_second": 5.99,
"step": 4773
},
{
"epoch": 3.016970458830924,
"grad_norm": 6.011202335357666,
"learning_rate": 3.4918290383406665e-05,
"loss": 2.886,
"step": 4800
},
{
"epoch": 3.0798240100565684,
"grad_norm": 7.204225063323975,
"learning_rate": 3.4604022627278445e-05,
"loss": 2.8579,
"step": 4900
},
{
"epoch": 3.1426775612822127,
"grad_norm": 10.316048622131348,
"learning_rate": 3.428975487115022e-05,
"loss": 2.8155,
"step": 5000
},
{
"epoch": 3.2055311125078565,
"grad_norm": 6.55385684967041,
"learning_rate": 3.3975487115022e-05,
"loss": 2.8938,
"step": 5100
},
{
"epoch": 3.268384663733501,
"grad_norm": 6.081694602966309,
"learning_rate": 3.366121935889378e-05,
"loss": 2.7344,
"step": 5200
},
{
"epoch": 3.331238214959145,
"grad_norm": 8.186753273010254,
"learning_rate": 3.3346951602765556e-05,
"loss": 2.7899,
"step": 5300
},
{
"epoch": 3.3940917661847894,
"grad_norm": 7.425989627838135,
"learning_rate": 3.3032683846637335e-05,
"loss": 2.7317,
"step": 5400
},
{
"epoch": 3.4569453174104336,
"grad_norm": 5.459439277648926,
"learning_rate": 3.2718416090509115e-05,
"loss": 2.6456,
"step": 5500
},
{
"epoch": 3.519798868636078,
"grad_norm": 5.077919006347656,
"learning_rate": 3.2404148334380894e-05,
"loss": 2.6816,
"step": 5600
},
{
"epoch": 3.5826524198617222,
"grad_norm": 5.81939172744751,
"learning_rate": 3.2089880578252674e-05,
"loss": 2.64,
"step": 5700
},
{
"epoch": 3.6455059710873665,
"grad_norm": 39.74727249145508,
"learning_rate": 3.177561282212445e-05,
"loss": 2.6725,
"step": 5800
},
{
"epoch": 3.708359522313011,
"grad_norm": 5.927642345428467,
"learning_rate": 3.1461345065996226e-05,
"loss": 2.5395,
"step": 5900
},
{
"epoch": 3.771213073538655,
"grad_norm": 5.984442710876465,
"learning_rate": 3.114707730986801e-05,
"loss": 2.6297,
"step": 6000
},
{
"epoch": 3.834066624764299,
"grad_norm": 5.258358478546143,
"learning_rate": 3.083280955373979e-05,
"loss": 2.6291,
"step": 6100
},
{
"epoch": 3.8969201759899432,
"grad_norm": 5.7379937171936035,
"learning_rate": 3.0518541797611565e-05,
"loss": 2.6116,
"step": 6200
},
{
"epoch": 3.9597737272155875,
"grad_norm": 5.038835048675537,
"learning_rate": 3.0204274041483344e-05,
"loss": 2.6695,
"step": 6300
},
{
"epoch": 4.0,
"eval_loss": 2.0932769775390625,
"eval_runtime": 20.0417,
"eval_samples_per_second": 47.8,
"eval_steps_per_second": 5.988,
"step": 6364
},
{
"epoch": 4.022627278441232,
"grad_norm": 7.459395885467529,
"learning_rate": 2.9890006285355127e-05,
"loss": 2.6404,
"step": 6400
},
{
"epoch": 4.085480829666876,
"grad_norm": 6.721461296081543,
"learning_rate": 2.9575738529226903e-05,
"loss": 2.4614,
"step": 6500
},
{
"epoch": 4.14833438089252,
"grad_norm": 6.69769287109375,
"learning_rate": 2.9261470773098683e-05,
"loss": 2.457,
"step": 6600
},
{
"epoch": 4.211187932118165,
"grad_norm": 5.306356906890869,
"learning_rate": 2.894720301697046e-05,
"loss": 2.513,
"step": 6700
},
{
"epoch": 4.274041483343809,
"grad_norm": 5.425265312194824,
"learning_rate": 2.8632935260842235e-05,
"loss": 2.5467,
"step": 6800
},
{
"epoch": 4.336895034569453,
"grad_norm": 4.722207546234131,
"learning_rate": 2.8318667504714018e-05,
"loss": 2.3467,
"step": 6900
},
{
"epoch": 4.3997485857950975,
"grad_norm": 4.346086502075195,
"learning_rate": 2.8004399748585797e-05,
"loss": 2.5098,
"step": 7000
},
{
"epoch": 4.462602137020742,
"grad_norm": 7.4684319496154785,
"learning_rate": 2.7690131992457573e-05,
"loss": 2.4396,
"step": 7100
},
{
"epoch": 4.525455688246386,
"grad_norm": 5.709039688110352,
"learning_rate": 2.7375864236329353e-05,
"loss": 2.4688,
"step": 7200
},
{
"epoch": 4.58830923947203,
"grad_norm": 4.952858924865723,
"learning_rate": 2.7061596480201136e-05,
"loss": 2.3643,
"step": 7300
},
{
"epoch": 4.651162790697675,
"grad_norm": 6.68017578125,
"learning_rate": 2.6747328724072912e-05,
"loss": 2.4242,
"step": 7400
},
{
"epoch": 4.714016341923319,
"grad_norm": 3.584669828414917,
"learning_rate": 2.6433060967944688e-05,
"loss": 2.4552,
"step": 7500
},
{
"epoch": 4.776869893148963,
"grad_norm": 5.264488220214844,
"learning_rate": 2.6118793211816468e-05,
"loss": 2.4232,
"step": 7600
},
{
"epoch": 4.8397234443746076,
"grad_norm": 4.609414100646973,
"learning_rate": 2.580452545568825e-05,
"loss": 2.4418,
"step": 7700
},
{
"epoch": 4.902576995600251,
"grad_norm": 4.986881256103516,
"learning_rate": 2.5490257699560027e-05,
"loss": 2.4065,
"step": 7800
},
{
"epoch": 4.965430546825896,
"grad_norm": 4.9718098640441895,
"learning_rate": 2.5175989943431806e-05,
"loss": 2.4589,
"step": 7900
},
{
"epoch": 5.0,
"eval_loss": 1.984979271888733,
"eval_runtime": 20.0353,
"eval_samples_per_second": 47.816,
"eval_steps_per_second": 5.989,
"step": 7955
},
{
"epoch": 5.0282840980515395,
"grad_norm": 5.2526750564575195,
"learning_rate": 2.4861722187303586e-05,
"loss": 2.2708,
"step": 8000
},
{
"epoch": 5.091137649277184,
"grad_norm": 5.312747001647949,
"learning_rate": 2.454745443117536e-05,
"loss": 2.3068,
"step": 8100
},
{
"epoch": 5.153991200502828,
"grad_norm": 7.204046726226807,
"learning_rate": 2.423318667504714e-05,
"loss": 2.3729,
"step": 8200
},
{
"epoch": 5.216844751728472,
"grad_norm": 4.8044753074646,
"learning_rate": 2.391891891891892e-05,
"loss": 2.3501,
"step": 8300
},
{
"epoch": 5.279698302954117,
"grad_norm": 6.9473185539245605,
"learning_rate": 2.3604651162790697e-05,
"loss": 2.3398,
"step": 8400
},
{
"epoch": 5.342551854179761,
"grad_norm": 4.014726161956787,
"learning_rate": 2.3290383406662476e-05,
"loss": 2.2938,
"step": 8500
},
{
"epoch": 5.405405405405405,
"grad_norm": 6.722488880157471,
"learning_rate": 2.2976115650534256e-05,
"loss": 2.2354,
"step": 8600
},
{
"epoch": 5.4682589566310495,
"grad_norm": 5.856524467468262,
"learning_rate": 2.2661847894406035e-05,
"loss": 2.2757,
"step": 8700
},
{
"epoch": 5.531112507856694,
"grad_norm": 4.9930644035339355,
"learning_rate": 2.234758013827781e-05,
"loss": 2.2586,
"step": 8800
},
{
"epoch": 5.593966059082338,
"grad_norm": 5.49005126953125,
"learning_rate": 2.2033312382149594e-05,
"loss": 2.3155,
"step": 8900
},
{
"epoch": 5.656819610307982,
"grad_norm": 8.850517272949219,
"learning_rate": 2.171904462602137e-05,
"loss": 2.2841,
"step": 9000
},
{
"epoch": 5.719673161533627,
"grad_norm": 5.094405651092529,
"learning_rate": 2.140477686989315e-05,
"loss": 2.3147,
"step": 9100
},
{
"epoch": 5.782526712759271,
"grad_norm": 4.709909439086914,
"learning_rate": 2.109050911376493e-05,
"loss": 2.1584,
"step": 9200
},
{
"epoch": 5.845380263984915,
"grad_norm": 4.1693525314331055,
"learning_rate": 2.077624135763671e-05,
"loss": 2.2396,
"step": 9300
},
{
"epoch": 5.90823381521056,
"grad_norm": 6.800940036773682,
"learning_rate": 2.0461973601508485e-05,
"loss": 2.301,
"step": 9400
},
{
"epoch": 5.971087366436204,
"grad_norm": 7.419278144836426,
"learning_rate": 2.0147705845380265e-05,
"loss": 2.3142,
"step": 9500
},
{
"epoch": 6.0,
"eval_loss": 1.905881643295288,
"eval_runtime": 20.0332,
"eval_samples_per_second": 47.821,
"eval_steps_per_second": 5.99,
"step": 9546
},
{
"epoch": 6.033940917661848,
"grad_norm": 4.217894077301025,
"learning_rate": 1.9833438089252044e-05,
"loss": 2.1013,
"step": 9600
},
{
"epoch": 6.096794468887492,
"grad_norm": 5.345584869384766,
"learning_rate": 1.9519170333123824e-05,
"loss": 2.2714,
"step": 9700
},
{
"epoch": 6.159648020113137,
"grad_norm": 5.364700794219971,
"learning_rate": 1.92049025769956e-05,
"loss": 2.2381,
"step": 9800
},
{
"epoch": 6.222501571338781,
"grad_norm": 4.380568504333496,
"learning_rate": 1.8890634820867383e-05,
"loss": 2.1527,
"step": 9900
},
{
"epoch": 6.285355122564425,
"grad_norm": 6.300790309906006,
"learning_rate": 1.857636706473916e-05,
"loss": 2.1771,
"step": 10000
},
{
"epoch": 6.348208673790069,
"grad_norm": 5.757110118865967,
"learning_rate": 1.8262099308610938e-05,
"loss": 2.1695,
"step": 10100
},
{
"epoch": 6.411062225015713,
"grad_norm": 4.908361434936523,
"learning_rate": 1.7947831552482718e-05,
"loss": 2.1056,
"step": 10200
},
{
"epoch": 6.473915776241357,
"grad_norm": 5.048102378845215,
"learning_rate": 1.7633563796354494e-05,
"loss": 2.2112,
"step": 10300
},
{
"epoch": 6.536769327467002,
"grad_norm": 8.040143013000488,
"learning_rate": 1.7319296040226273e-05,
"loss": 2.0298,
"step": 10400
},
{
"epoch": 6.599622878692646,
"grad_norm": 5.15581750869751,
"learning_rate": 1.7005028284098053e-05,
"loss": 2.1224,
"step": 10500
},
{
"epoch": 6.66247642991829,
"grad_norm": 4.935842514038086,
"learning_rate": 1.6690760527969832e-05,
"loss": 2.0772,
"step": 10600
},
{
"epoch": 6.725329981143934,
"grad_norm": 5.487718105316162,
"learning_rate": 1.637649277184161e-05,
"loss": 2.2552,
"step": 10700
},
{
"epoch": 6.788183532369579,
"grad_norm": 5.713748455047607,
"learning_rate": 1.6062225015713388e-05,
"loss": 2.1358,
"step": 10800
},
{
"epoch": 6.851037083595223,
"grad_norm": 4.882757186889648,
"learning_rate": 1.5747957259585168e-05,
"loss": 2.1613,
"step": 10900
},
{
"epoch": 6.913890634820867,
"grad_norm": 5.634950637817383,
"learning_rate": 1.5433689503456947e-05,
"loss": 2.2567,
"step": 11000
},
{
"epoch": 6.976744186046512,
"grad_norm": 5.634829044342041,
"learning_rate": 1.5119421747328725e-05,
"loss": 2.1283,
"step": 11100
},
{
"epoch": 7.0,
"eval_loss": 1.84635591506958,
"eval_runtime": 20.0367,
"eval_samples_per_second": 47.812,
"eval_steps_per_second": 5.989,
"step": 11137
},
{
"epoch": 7.039597737272156,
"grad_norm": 5.635861873626709,
"learning_rate": 1.4805153991200504e-05,
"loss": 2.0938,
"step": 11200
},
{
"epoch": 7.1024512884978,
"grad_norm": 5.214977741241455,
"learning_rate": 1.4490886235072282e-05,
"loss": 2.062,
"step": 11300
},
{
"epoch": 7.1653048397234445,
"grad_norm": 7.498839855194092,
"learning_rate": 1.4176618478944062e-05,
"loss": 2.1292,
"step": 11400
},
{
"epoch": 7.228158390949089,
"grad_norm": 5.83459997177124,
"learning_rate": 1.386235072281584e-05,
"loss": 2.0796,
"step": 11500
},
{
"epoch": 7.291011942174733,
"grad_norm": 3.8935282230377197,
"learning_rate": 1.3548082966687619e-05,
"loss": 2.1414,
"step": 11600
},
{
"epoch": 7.353865493400377,
"grad_norm": 5.774020671844482,
"learning_rate": 1.3233815210559397e-05,
"loss": 2.145,
"step": 11700
},
{
"epoch": 7.416719044626022,
"grad_norm": 128.24192810058594,
"learning_rate": 1.2919547454431178e-05,
"loss": 2.0242,
"step": 11800
},
{
"epoch": 7.479572595851666,
"grad_norm": 4.4846367835998535,
"learning_rate": 1.2605279698302954e-05,
"loss": 2.0936,
"step": 11900
},
{
"epoch": 7.54242614707731,
"grad_norm": 5.091222763061523,
"learning_rate": 1.2291011942174734e-05,
"loss": 2.1988,
"step": 12000
},
{
"epoch": 7.6052796983029545,
"grad_norm": 3.3482093811035156,
"learning_rate": 1.1976744186046513e-05,
"loss": 2.1323,
"step": 12100
},
{
"epoch": 7.668133249528598,
"grad_norm": 5.329409599304199,
"learning_rate": 1.1662476429918291e-05,
"loss": 2.0587,
"step": 12200
},
{
"epoch": 7.730986800754243,
"grad_norm": 7.584386348724365,
"learning_rate": 1.134820867379007e-05,
"loss": 2.1341,
"step": 12300
},
{
"epoch": 7.7938403519798864,
"grad_norm": 5.996345520019531,
"learning_rate": 1.1033940917661848e-05,
"loss": 2.1108,
"step": 12400
},
{
"epoch": 7.856693903205531,
"grad_norm": 6.1731648445129395,
"learning_rate": 1.0719673161533628e-05,
"loss": 2.1218,
"step": 12500
},
{
"epoch": 7.919547454431175,
"grad_norm": 5.414481163024902,
"learning_rate": 1.0405405405405407e-05,
"loss": 2.028,
"step": 12600
},
{
"epoch": 7.982401005656819,
"grad_norm": 7.198294639587402,
"learning_rate": 1.0091137649277185e-05,
"loss": 2.0489,
"step": 12700
},
{
"epoch": 8.0,
"eval_loss": 1.8111430406570435,
"eval_runtime": 20.0666,
"eval_samples_per_second": 47.741,
"eval_steps_per_second": 5.98,
"step": 12728
},
{
"epoch": 8.045254556882464,
"grad_norm": 6.677022933959961,
"learning_rate": 9.776869893148963e-06,
"loss": 2.0814,
"step": 12800
},
{
"epoch": 8.108108108108109,
"grad_norm": 5.1916728019714355,
"learning_rate": 9.46260213702074e-06,
"loss": 2.119,
"step": 12900
},
{
"epoch": 8.170961659333752,
"grad_norm": 6.04162073135376,
"learning_rate": 9.14833438089252e-06,
"loss": 2.0058,
"step": 13000
},
{
"epoch": 8.233815210559397,
"grad_norm": 4.764267444610596,
"learning_rate": 8.8340666247643e-06,
"loss": 2.0113,
"step": 13100
},
{
"epoch": 8.29666876178504,
"grad_norm": 5.77971887588501,
"learning_rate": 8.519798868636078e-06,
"loss": 2.0392,
"step": 13200
},
{
"epoch": 8.359522313010686,
"grad_norm": 5.698218822479248,
"learning_rate": 8.205531112507857e-06,
"loss": 2.107,
"step": 13300
},
{
"epoch": 8.42237586423633,
"grad_norm": 5.236012935638428,
"learning_rate": 7.891263356379635e-06,
"loss": 2.0829,
"step": 13400
},
{
"epoch": 8.485229415461973,
"grad_norm": 4.379955291748047,
"learning_rate": 7.576995600251414e-06,
"loss": 1.9321,
"step": 13500
},
{
"epoch": 8.548082966687618,
"grad_norm": 6.034859657287598,
"learning_rate": 7.262727844123193e-06,
"loss": 2.1013,
"step": 13600
},
{
"epoch": 8.610936517913261,
"grad_norm": 5.320705413818359,
"learning_rate": 6.948460087994972e-06,
"loss": 2.0543,
"step": 13700
},
{
"epoch": 8.673790069138906,
"grad_norm": 5.735895156860352,
"learning_rate": 6.634192331866751e-06,
"loss": 2.0594,
"step": 13800
},
{
"epoch": 8.73664362036455,
"grad_norm": 4.845800876617432,
"learning_rate": 6.31992457573853e-06,
"loss": 1.9402,
"step": 13900
},
{
"epoch": 8.799497171590195,
"grad_norm": 4.628382682800293,
"learning_rate": 6.0056568196103085e-06,
"loss": 1.9937,
"step": 14000
},
{
"epoch": 8.862350722815838,
"grad_norm": 4.747410774230957,
"learning_rate": 5.691389063482086e-06,
"loss": 2.0654,
"step": 14100
},
{
"epoch": 8.925204274041484,
"grad_norm": 4.694166660308838,
"learning_rate": 5.377121307353866e-06,
"loss": 2.0523,
"step": 14200
},
{
"epoch": 8.988057825267127,
"grad_norm": 6.711084365844727,
"learning_rate": 5.0628535512256445e-06,
"loss": 1.9856,
"step": 14300
},
{
"epoch": 9.0,
"eval_loss": 1.7920939922332764,
"eval_runtime": 20.0378,
"eval_samples_per_second": 47.81,
"eval_steps_per_second": 5.989,
"step": 14319
},
{
"epoch": 9.050911376492772,
"grad_norm": 6.053162097930908,
"learning_rate": 4.748585795097423e-06,
"loss": 2.0392,
"step": 14400
},
{
"epoch": 9.113764927718416,
"grad_norm": 4.806529521942139,
"learning_rate": 4.434318038969202e-06,
"loss": 2.0308,
"step": 14500
},
{
"epoch": 9.17661847894406,
"grad_norm": 4.725819110870361,
"learning_rate": 4.1200502828409805e-06,
"loss": 2.0441,
"step": 14600
},
{
"epoch": 9.239472030169704,
"grad_norm": 4.637420177459717,
"learning_rate": 3.8057825267127596e-06,
"loss": 2.0061,
"step": 14700
},
{
"epoch": 9.30232558139535,
"grad_norm": 6.441665172576904,
"learning_rate": 3.4915147705845382e-06,
"loss": 2.1299,
"step": 14800
},
{
"epoch": 9.365179132620993,
"grad_norm": 3.506943941116333,
"learning_rate": 3.1772470144563173e-06,
"loss": 1.9443,
"step": 14900
},
{
"epoch": 9.428032683846638,
"grad_norm": 8.454822540283203,
"learning_rate": 2.8629792583280956e-06,
"loss": 2.0327,
"step": 15000
},
{
"epoch": 9.490886235072281,
"grad_norm": 5.021187782287598,
"learning_rate": 2.5487115021998746e-06,
"loss": 1.9839,
"step": 15100
},
{
"epoch": 9.553739786297927,
"grad_norm": 6.3962016105651855,
"learning_rate": 2.234443746071653e-06,
"loss": 2.0604,
"step": 15200
},
{
"epoch": 9.61659333752357,
"grad_norm": 5.531436443328857,
"learning_rate": 1.920175989943432e-06,
"loss": 2.0168,
"step": 15300
},
{
"epoch": 9.679446888749215,
"grad_norm": 4.300695896148682,
"learning_rate": 1.6059082338152106e-06,
"loss": 1.9994,
"step": 15400
},
{
"epoch": 9.742300439974859,
"grad_norm": 3.102018356323242,
"learning_rate": 1.2916404776869893e-06,
"loss": 2.0441,
"step": 15500
},
{
"epoch": 9.805153991200502,
"grad_norm": 4.91919469833374,
"learning_rate": 9.773727215587681e-07,
"loss": 1.9584,
"step": 15600
},
{
"epoch": 9.868007542426147,
"grad_norm": 4.21737813949585,
"learning_rate": 6.631049654305469e-07,
"loss": 2.0019,
"step": 15700
},
{
"epoch": 9.930861093651792,
"grad_norm": 4.098769187927246,
"learning_rate": 3.4883720930232557e-07,
"loss": 2.0121,
"step": 15800
},
{
"epoch": 9.993714644877436,
"grad_norm": 4.722096920013428,
"learning_rate": 3.456945317410434e-08,
"loss": 2.0196,
"step": 15900
},
{
"epoch": 10.0,
"eval_loss": 1.787421464920044,
"eval_runtime": 20.0243,
"eval_samples_per_second": 47.842,
"eval_steps_per_second": 5.993,
"step": 15910
},
{
"epoch": 10.056568196103079,
"grad_norm": 3.8331987857818604,
"learning_rate": 2.4860150848522942e-05,
"loss": 2.0388,
"step": 16000
},
{
"epoch": 10.119421747328724,
"grad_norm": 3.9292027950286865,
"learning_rate": 2.4703016970458832e-05,
"loss": 2.0913,
"step": 16100
},
{
"epoch": 10.182275298554368,
"grad_norm": 5.124855995178223,
"learning_rate": 2.454588309239472e-05,
"loss": 2.0452,
"step": 16200
},
{
"epoch": 10.245128849780013,
"grad_norm": 5.743933200836182,
"learning_rate": 2.438874921433061e-05,
"loss": 2.016,
"step": 16300
},
{
"epoch": 10.307982401005656,
"grad_norm": 6.4510931968688965,
"learning_rate": 2.42316153362665e-05,
"loss": 1.9785,
"step": 16400
},
{
"epoch": 10.370835952231301,
"grad_norm": 6.550465106964111,
"learning_rate": 2.4074481458202387e-05,
"loss": 1.9912,
"step": 16500
},
{
"epoch": 10.433689503456945,
"grad_norm": 5.37285852432251,
"learning_rate": 2.391734758013828e-05,
"loss": 2.0549,
"step": 16600
},
{
"epoch": 10.49654305468259,
"grad_norm": 5.4893412590026855,
"learning_rate": 2.376021370207417e-05,
"loss": 1.9434,
"step": 16700
},
{
"epoch": 10.559396605908233,
"grad_norm": 4.316259384155273,
"learning_rate": 2.3603079824010057e-05,
"loss": 1.8413,
"step": 16800
},
{
"epoch": 10.622250157133879,
"grad_norm": 3.4342756271362305,
"learning_rate": 2.3445945945945946e-05,
"loss": 1.9312,
"step": 16900
},
{
"epoch": 10.685103708359522,
"grad_norm": 5.680815696716309,
"learning_rate": 2.3288812067881836e-05,
"loss": 1.9678,
"step": 17000
},
{
"epoch": 10.747957259585167,
"grad_norm": 6.04569149017334,
"learning_rate": 2.3131678189817726e-05,
"loss": 2.0329,
"step": 17100
},
{
"epoch": 10.81081081081081,
"grad_norm": 9.336991310119629,
"learning_rate": 2.2974544311753616e-05,
"loss": 1.9575,
"step": 17200
},
{
"epoch": 10.873664362036456,
"grad_norm": 3.826447010040283,
"learning_rate": 2.2817410433689505e-05,
"loss": 1.9692,
"step": 17300
},
{
"epoch": 10.936517913262099,
"grad_norm": 4.134801387786865,
"learning_rate": 2.2660276555625392e-05,
"loss": 2.0406,
"step": 17400
},
{
"epoch": 10.999371464487744,
"grad_norm": 5.291431903839111,
"learning_rate": 2.2503142677561285e-05,
"loss": 1.9631,
"step": 17500
},
{
"epoch": 11.0,
"eval_loss": 1.7517410516738892,
"eval_runtime": 21.6572,
"eval_samples_per_second": 44.235,
"eval_steps_per_second": 5.541,
"step": 17501
},
{
"epoch": 11.062225015713388,
"grad_norm": 4.9575066566467285,
"learning_rate": 2.234600879949717e-05,
"loss": 1.9381,
"step": 17600
},
{
"epoch": 11.125078566939033,
"grad_norm": 12.871175765991211,
"learning_rate": 2.218887492143306e-05,
"loss": 1.8867,
"step": 17700
},
{
"epoch": 11.187932118164676,
"grad_norm": 4.3662519454956055,
"learning_rate": 2.203174104336895e-05,
"loss": 1.9713,
"step": 17800
},
{
"epoch": 11.250785669390321,
"grad_norm": 5.662289619445801,
"learning_rate": 2.187460716530484e-05,
"loss": 1.9188,
"step": 17900
},
{
"epoch": 11.313639220615965,
"grad_norm": 7.633818626403809,
"learning_rate": 2.171747328724073e-05,
"loss": 1.9142,
"step": 18000
},
{
"epoch": 11.376492771841608,
"grad_norm": 4.940028667449951,
"learning_rate": 2.156033940917662e-05,
"loss": 1.8697,
"step": 18100
},
{
"epoch": 11.439346323067253,
"grad_norm": 5.070211410522461,
"learning_rate": 2.1403205531112506e-05,
"loss": 1.9624,
"step": 18200
},
{
"epoch": 11.502199874292897,
"grad_norm": 7.409548282623291,
"learning_rate": 2.12460716530484e-05,
"loss": 1.9283,
"step": 18300
},
{
"epoch": 11.565053425518542,
"grad_norm": 6.541192531585693,
"learning_rate": 2.108893777498429e-05,
"loss": 1.9357,
"step": 18400
},
{
"epoch": 11.627906976744185,
"grad_norm": 5.941864967346191,
"learning_rate": 2.0931803896920176e-05,
"loss": 1.869,
"step": 18500
},
{
"epoch": 11.69076052796983,
"grad_norm": 9.418646812438965,
"learning_rate": 2.0774670018856065e-05,
"loss": 1.8518,
"step": 18600
},
{
"epoch": 11.753614079195474,
"grad_norm": 5.367152690887451,
"learning_rate": 2.061753614079196e-05,
"loss": 1.8945,
"step": 18700
},
{
"epoch": 11.81646763042112,
"grad_norm": 5.896432399749756,
"learning_rate": 2.0460402262727845e-05,
"loss": 1.8569,
"step": 18800
},
{
"epoch": 11.879321181646763,
"grad_norm": 6.137564182281494,
"learning_rate": 2.0303268384663735e-05,
"loss": 1.9179,
"step": 18900
},
{
"epoch": 11.942174732872408,
"grad_norm": 4.5933918952941895,
"learning_rate": 2.0146134506599625e-05,
"loss": 1.8941,
"step": 19000
},
{
"epoch": 12.0,
"eval_loss": 1.7062737941741943,
"eval_runtime": 21.7167,
"eval_samples_per_second": 44.114,
"eval_steps_per_second": 5.526,
"step": 19092
},
{
"epoch": 12.005028284098051,
"grad_norm": 5.298050880432129,
"learning_rate": 1.998900062853551e-05,
"loss": 1.8681,
"step": 19100
},
{
"epoch": 12.067881835323696,
"grad_norm": 7.001854419708252,
"learning_rate": 1.9831866750471404e-05,
"loss": 1.8377,
"step": 19200
},
{
"epoch": 12.13073538654934,
"grad_norm": 4.692386150360107,
"learning_rate": 1.9674732872407294e-05,
"loss": 1.8279,
"step": 19300
},
{
"epoch": 12.193588937774985,
"grad_norm": 6.864208221435547,
"learning_rate": 1.951759899434318e-05,
"loss": 1.8855,
"step": 19400
},
{
"epoch": 12.256442489000628,
"grad_norm": 3.883880853652954,
"learning_rate": 1.936046511627907e-05,
"loss": 1.84,
"step": 19500
},
{
"epoch": 12.319296040226273,
"grad_norm": 5.302524566650391,
"learning_rate": 1.920333123821496e-05,
"loss": 1.8791,
"step": 19600
},
{
"epoch": 12.382149591451917,
"grad_norm": 6.854051113128662,
"learning_rate": 1.904619736015085e-05,
"loss": 1.9189,
"step": 19700
},
{
"epoch": 12.445003142677562,
"grad_norm": 4.728283405303955,
"learning_rate": 1.888906348208674e-05,
"loss": 1.8903,
"step": 19800
},
{
"epoch": 12.507856693903205,
"grad_norm": 4.314347267150879,
"learning_rate": 1.8731929604022626e-05,
"loss": 1.8615,
"step": 19900
},
{
"epoch": 12.57071024512885,
"grad_norm": 3.873619318008423,
"learning_rate": 1.857479572595852e-05,
"loss": 1.8232,
"step": 20000
},
{
"epoch": 12.633563796354494,
"grad_norm": 6.445096969604492,
"learning_rate": 1.841766184789441e-05,
"loss": 1.7764,
"step": 20100
},
{
"epoch": 12.696417347580137,
"grad_norm": 4.258322715759277,
"learning_rate": 1.8260527969830295e-05,
"loss": 1.869,
"step": 20200
},
{
"epoch": 12.759270898805783,
"grad_norm": 7.782538414001465,
"learning_rate": 1.8103394091766185e-05,
"loss": 1.7986,
"step": 20300
},
{
"epoch": 12.822124450031426,
"grad_norm": 7.189488887786865,
"learning_rate": 1.7946260213702078e-05,
"loss": 1.8448,
"step": 20400
},
{
"epoch": 12.884978001257071,
"grad_norm": 5.59601354598999,
"learning_rate": 1.7789126335637964e-05,
"loss": 1.7924,
"step": 20500
},
{
"epoch": 12.947831552482715,
"grad_norm": 4.675200939178467,
"learning_rate": 1.7631992457573854e-05,
"loss": 1.8212,
"step": 20600
},
{
"epoch": 13.0,
"eval_loss": 1.6696668863296509,
"eval_runtime": 21.645,
"eval_samples_per_second": 44.26,
"eval_steps_per_second": 5.544,
"step": 20683
},
{
"epoch": 13.01068510370836,
"grad_norm": 3.3650217056274414,
"learning_rate": 1.7474858579509744e-05,
"loss": 1.6872,
"step": 20700
},
{
"epoch": 13.073538654934003,
"grad_norm": 6.4758219718933105,
"learning_rate": 1.731772470144563e-05,
"loss": 1.8029,
"step": 20800
},
{
"epoch": 13.136392206159648,
"grad_norm": 4.500367641448975,
"learning_rate": 1.7160590823381523e-05,
"loss": 1.8655,
"step": 20900
},
{
"epoch": 13.199245757385292,
"grad_norm": 5.369949817657471,
"learning_rate": 1.7003456945317413e-05,
"loss": 1.821,
"step": 21000
},
{
"epoch": 13.262099308610937,
"grad_norm": 4.84245491027832,
"learning_rate": 1.68463230672533e-05,
"loss": 1.7454,
"step": 21100
},
{
"epoch": 13.32495285983658,
"grad_norm": 4.510051727294922,
"learning_rate": 1.668918918918919e-05,
"loss": 1.8378,
"step": 21200
},
{
"epoch": 13.387806411062225,
"grad_norm": 5.163560390472412,
"learning_rate": 1.653205531112508e-05,
"loss": 1.7985,
"step": 21300
},
{
"epoch": 13.450659962287869,
"grad_norm": 4.454617023468018,
"learning_rate": 1.637492143306097e-05,
"loss": 1.8177,
"step": 21400
},
{
"epoch": 13.513513513513514,
"grad_norm": 3.672908067703247,
"learning_rate": 1.6217787554996858e-05,
"loss": 1.6908,
"step": 21500
},
{
"epoch": 13.576367064739157,
"grad_norm": 4.549923419952393,
"learning_rate": 1.6060653676932748e-05,
"loss": 1.7603,
"step": 21600
},
{
"epoch": 13.639220615964803,
"grad_norm": 5.733989715576172,
"learning_rate": 1.5903519798868638e-05,
"loss": 1.7689,
"step": 21700
},
{
"epoch": 13.702074167190446,
"grad_norm": 4.507519245147705,
"learning_rate": 1.5746385920804527e-05,
"loss": 1.7984,
"step": 21800
},
{
"epoch": 13.764927718416091,
"grad_norm": 4.713226795196533,
"learning_rate": 1.5589252042740414e-05,
"loss": 1.8011,
"step": 21900
},
{
"epoch": 13.827781269641735,
"grad_norm": 4.300686359405518,
"learning_rate": 1.5432118164676304e-05,
"loss": 1.7743,
"step": 22000
},
{
"epoch": 13.89063482086738,
"grad_norm": 4.702789306640625,
"learning_rate": 1.5274984286612197e-05,
"loss": 1.6903,
"step": 22100
},
{
"epoch": 13.953488372093023,
"grad_norm": 6.481640815734863,
"learning_rate": 1.5117850408548085e-05,
"loss": 1.822,
"step": 22200
},
{
"epoch": 14.0,
"eval_loss": 1.648952603340149,
"eval_runtime": 21.6512,
"eval_samples_per_second": 44.247,
"eval_steps_per_second": 5.542,
"step": 22274
},
{
"epoch": 14.016341923318668,
"grad_norm": 4.320845127105713,
"learning_rate": 2.1968573224387177e-05,
"loss": 1.7866,
"step": 22300
},
{
"epoch": 14.079195474544312,
"grad_norm": 5.575278282165527,
"learning_rate": 2.184286612193589e-05,
"loss": 1.7572,
"step": 22400
},
{
"epoch": 14.142049025769955,
"grad_norm": 5.764155387878418,
"learning_rate": 2.17171590194846e-05,
"loss": 1.7566,
"step": 22500
},
{
"epoch": 14.2049025769956,
"grad_norm": 4.854477882385254,
"learning_rate": 2.1591451917033316e-05,
"loss": 1.7517,
"step": 22600
},
{
"epoch": 14.267756128221244,
"grad_norm": 4.7141618728637695,
"learning_rate": 2.1465744814582025e-05,
"loss": 1.713,
"step": 22700
},
{
"epoch": 14.330609679446889,
"grad_norm": 4.3324785232543945,
"learning_rate": 2.1340037712130736e-05,
"loss": 1.7511,
"step": 22800
},
{
"epoch": 14.393463230672532,
"grad_norm": 3.4204530715942383,
"learning_rate": 2.1214330609679448e-05,
"loss": 1.7451,
"step": 22900
},
{
"epoch": 14.456316781898177,
"grad_norm": 4.925296783447266,
"learning_rate": 2.108862350722816e-05,
"loss": 1.6868,
"step": 23000
},
{
"epoch": 14.51917033312382,
"grad_norm": 4.997200965881348,
"learning_rate": 2.0962916404776872e-05,
"loss": 1.7259,
"step": 23100
},
{
"epoch": 14.582023884349466,
"grad_norm": 4.816483497619629,
"learning_rate": 2.0837209302325584e-05,
"loss": 1.7716,
"step": 23200
},
{
"epoch": 14.64487743557511,
"grad_norm": 5.224360466003418,
"learning_rate": 2.0711502199874295e-05,
"loss": 1.7039,
"step": 23300
},
{
"epoch": 14.707730986800755,
"grad_norm": 7.450541019439697,
"learning_rate": 2.0585795097423004e-05,
"loss": 1.6634,
"step": 23400
},
{
"epoch": 14.770584538026398,
"grad_norm": 5.811767101287842,
"learning_rate": 2.0460087994971716e-05,
"loss": 1.7526,
"step": 23500
},
{
"epoch": 14.833438089252043,
"grad_norm": 4.1061272621154785,
"learning_rate": 2.0334380892520427e-05,
"loss": 1.7612,
"step": 23600
},
{
"epoch": 14.896291640477687,
"grad_norm": 4.599556922912598,
"learning_rate": 2.020867379006914e-05,
"loss": 1.776,
"step": 23700
},
{
"epoch": 14.959145191703332,
"grad_norm": 4.085700988769531,
"learning_rate": 2.008296668761785e-05,
"loss": 1.7143,
"step": 23800
},
{
"epoch": 15.0,
"eval_loss": 1.6270309686660767,
"eval_runtime": 20.346,
"eval_samples_per_second": 47.085,
"eval_steps_per_second": 5.898,
"step": 23865
},
{
"epoch": 15.021998742928975,
"grad_norm": 8.476902961730957,
"learning_rate": 1.9957259585166563e-05,
"loss": 1.6504,
"step": 23900
},
{
"epoch": 15.08485229415462,
"grad_norm": 4.84979772567749,
"learning_rate": 1.9831552482715275e-05,
"loss": 1.7259,
"step": 24000
},
{
"epoch": 15.147705845380264,
"grad_norm": 4.314637184143066,
"learning_rate": 1.9705845380263983e-05,
"loss": 1.6254,
"step": 24100
},
{
"epoch": 15.210559396605909,
"grad_norm": 4.656597137451172,
"learning_rate": 1.9580138277812698e-05,
"loss": 1.7493,
"step": 24200
},
{
"epoch": 15.273412947831552,
"grad_norm": 4.276788711547852,
"learning_rate": 1.945443117536141e-05,
"loss": 1.6797,
"step": 24300
},
{
"epoch": 15.336266499057198,
"grad_norm": 3.9574031829833984,
"learning_rate": 1.9328724072910122e-05,
"loss": 1.716,
"step": 24400
},
{
"epoch": 15.399120050282841,
"grad_norm": 8.148831367492676,
"learning_rate": 1.920301697045883e-05,
"loss": 1.6737,
"step": 24500
},
{
"epoch": 15.461973601508486,
"grad_norm": 3.8734018802642822,
"learning_rate": 1.9077309868007542e-05,
"loss": 1.6452,
"step": 24600
},
{
"epoch": 15.52482715273413,
"grad_norm": 4.928835391998291,
"learning_rate": 1.8951602765556257e-05,
"loss": 1.7134,
"step": 24700
},
{
"epoch": 15.587680703959773,
"grad_norm": 4.991033554077148,
"learning_rate": 1.8825895663104966e-05,
"loss": 1.7327,
"step": 24800
},
{
"epoch": 15.650534255185418,
"grad_norm": 4.160732269287109,
"learning_rate": 1.8700188560653677e-05,
"loss": 1.6678,
"step": 24900
},
{
"epoch": 15.713387806411061,
"grad_norm": 6.523078441619873,
"learning_rate": 1.857448145820239e-05,
"loss": 1.6856,
"step": 25000
},
{
"epoch": 15.776241357636707,
"grad_norm": 6.306403636932373,
"learning_rate": 1.84487743557511e-05,
"loss": 1.6699,
"step": 25100
},
{
"epoch": 15.83909490886235,
"grad_norm": 4.479640483856201,
"learning_rate": 1.832306725329981e-05,
"loss": 1.676,
"step": 25200
},
{
"epoch": 15.901948460087995,
"grad_norm": 4.6891279220581055,
"learning_rate": 1.8197360150848525e-05,
"loss": 1.667,
"step": 25300
},
{
"epoch": 15.964802011313639,
"grad_norm": 5.908668518066406,
"learning_rate": 1.8071653048397236e-05,
"loss": 1.6267,
"step": 25400
},
{
"epoch": 16.0,
"eval_loss": 1.608726143836975,
"eval_runtime": 20.3571,
"eval_samples_per_second": 47.06,
"eval_steps_per_second": 5.895,
"step": 25456
},
{
"epoch": 16.027655562539284,
"grad_norm": 4.081086158752441,
"learning_rate": 1.7945945945945948e-05,
"loss": 1.5625,
"step": 25500
},
{
"epoch": 16.090509113764927,
"grad_norm": 3.7648415565490723,
"learning_rate": 1.7820238843494657e-05,
"loss": 1.6818,
"step": 25600
},
{
"epoch": 16.15336266499057,
"grad_norm": 5.430357456207275,
"learning_rate": 1.769453174104337e-05,
"loss": 1.6125,
"step": 25700
},
{
"epoch": 16.216216216216218,
"grad_norm": 5.235119819641113,
"learning_rate": 1.7568824638592084e-05,
"loss": 1.6985,
"step": 25800
},
{
"epoch": 16.27906976744186,
"grad_norm": 5.521476745605469,
"learning_rate": 1.7443117536140792e-05,
"loss": 1.6291,
"step": 25900
},
{
"epoch": 16.341923318667504,
"grad_norm": 5.7086873054504395,
"learning_rate": 1.7317410433689504e-05,
"loss": 1.6523,
"step": 26000
},
{
"epoch": 16.404776869893148,
"grad_norm": 5.697257041931152,
"learning_rate": 1.7191703331238216e-05,
"loss": 1.6518,
"step": 26100
},
{
"epoch": 16.467630421118795,
"grad_norm": 8.258442878723145,
"learning_rate": 1.7065996228786928e-05,
"loss": 1.6314,
"step": 26200
},
{
"epoch": 16.530483972344438,
"grad_norm": 4.087442874908447,
"learning_rate": 1.694028912633564e-05,
"loss": 1.7048,
"step": 26300
},
{
"epoch": 16.59333752357008,
"grad_norm": 4.184548377990723,
"learning_rate": 1.681458202388435e-05,
"loss": 1.6062,
"step": 26400
},
{
"epoch": 16.656191074795725,
"grad_norm": 5.8042707443237305,
"learning_rate": 1.6688874921433063e-05,
"loss": 1.6239,
"step": 26500
},
{
"epoch": 16.719044626021372,
"grad_norm": 4.104475498199463,
"learning_rate": 1.656316781898177e-05,
"loss": 1.5742,
"step": 26600
},
{
"epoch": 16.781898177247015,
"grad_norm": 4.2934722900390625,
"learning_rate": 1.6437460716530483e-05,
"loss": 1.6069,
"step": 26700
},
{
"epoch": 16.84475172847266,
"grad_norm": 4.601330757141113,
"learning_rate": 1.6311753614079195e-05,
"loss": 1.5827,
"step": 26800
},
{
"epoch": 16.907605279698302,
"grad_norm": 4.304816246032715,
"learning_rate": 1.618604651162791e-05,
"loss": 1.6461,
"step": 26900
},
{
"epoch": 16.970458830923945,
"grad_norm": 6.80120325088501,
"learning_rate": 1.606033940917662e-05,
"loss": 1.6143,
"step": 27000
},
{
"epoch": 17.0,
"eval_loss": 1.5869935750961304,
"eval_runtime": 20.3162,
"eval_samples_per_second": 47.154,
"eval_steps_per_second": 5.907,
"step": 27047
},
{
"epoch": 17.033312382149592,
"grad_norm": 4.368440628051758,
"learning_rate": 1.593463230672533e-05,
"loss": 1.6352,
"step": 27100
},
{
"epoch": 17.096165933375236,
"grad_norm": 4.066120624542236,
"learning_rate": 1.5808925204274042e-05,
"loss": 1.5052,
"step": 27200
},
{
"epoch": 17.15901948460088,
"grad_norm": 6.150811672210693,
"learning_rate": 1.5683218101822754e-05,
"loss": 1.5449,
"step": 27300
},
{
"epoch": 17.221873035826523,
"grad_norm": 7.994663715362549,
"learning_rate": 1.5557510999371466e-05,
"loss": 1.7157,
"step": 27400
},
{
"epoch": 17.28472658705217,
"grad_norm": 3.554856061935425,
"learning_rate": 1.5431803896920178e-05,
"loss": 1.5878,
"step": 27500
},
{
"epoch": 17.347580138277813,
"grad_norm": 4.025883674621582,
"learning_rate": 1.530609679446889e-05,
"loss": 1.6454,
"step": 27600
},
{
"epoch": 17.410433689503456,
"grad_norm": 2.9825448989868164,
"learning_rate": 1.51803896920176e-05,
"loss": 1.5605,
"step": 27700
},
{
"epoch": 17.4732872407291,
"grad_norm": 4.528345584869385,
"learning_rate": 1.505468258956631e-05,
"loss": 1.626,
"step": 27800
},
{
"epoch": 17.536140791954747,
"grad_norm": 4.549004554748535,
"learning_rate": 1.4928975487115023e-05,
"loss": 1.5508,
"step": 27900
},
{
"epoch": 17.59899434318039,
"grad_norm": 4.830588340759277,
"learning_rate": 1.4803268384663735e-05,
"loss": 1.5394,
"step": 28000
},
{
"epoch": 17.661847894406034,
"grad_norm": 4.127079486846924,
"learning_rate": 1.4677561282212447e-05,
"loss": 1.5548,
"step": 28100
},
{
"epoch": 17.724701445631677,
"grad_norm": 3.208592414855957,
"learning_rate": 1.4551854179761157e-05,
"loss": 1.5595,
"step": 28200
},
{
"epoch": 17.787554996857324,
"grad_norm": 4.784154891967773,
"learning_rate": 1.4426147077309869e-05,
"loss": 1.6029,
"step": 28300
},
{
"epoch": 17.850408548082967,
"grad_norm": 5.0941481590271,
"learning_rate": 1.4300439974858582e-05,
"loss": 1.634,
"step": 28400
},
{
"epoch": 17.91326209930861,
"grad_norm": 6.4498982429504395,
"learning_rate": 1.4174732872407292e-05,
"loss": 1.6685,
"step": 28500
},
{
"epoch": 17.976115650534254,
"grad_norm": 5.136322021484375,
"learning_rate": 1.4049025769956004e-05,
"loss": 1.5587,
"step": 28600
},
{
"epoch": 18.0,
"eval_loss": 1.565408706665039,
"eval_runtime": 20.3165,
"eval_samples_per_second": 47.154,
"eval_steps_per_second": 5.907,
"step": 28638
},
{
"epoch": 18.0389692017599,
"grad_norm": 7.265219211578369,
"learning_rate": 1.3923318667504714e-05,
"loss": 1.534,
"step": 28700
},
{
"epoch": 18.101822752985544,
"grad_norm": 5.552704334259033,
"learning_rate": 1.3797611565053426e-05,
"loss": 1.5396,
"step": 28800
},
{
"epoch": 18.164676304211188,
"grad_norm": 7.356419086456299,
"learning_rate": 1.3671904462602136e-05,
"loss": 1.5851,
"step": 28900
},
{
"epoch": 18.22752985543683,
"grad_norm": 5.519120693206787,
"learning_rate": 1.354619736015085e-05,
"loss": 1.6331,
"step": 29000
},
{
"epoch": 18.290383406662478,
"grad_norm": 4.4178242683410645,
"learning_rate": 1.3420490257699561e-05,
"loss": 1.508,
"step": 29100
},
{
"epoch": 18.35323695788812,
"grad_norm": 4.479162216186523,
"learning_rate": 1.3294783155248271e-05,
"loss": 1.5201,
"step": 29200
},
{
"epoch": 18.416090509113765,
"grad_norm": 4.4193806648254395,
"learning_rate": 1.3169076052796983e-05,
"loss": 1.5393,
"step": 29300
},
{
"epoch": 18.47894406033941,
"grad_norm": 6.695824146270752,
"learning_rate": 1.3043368950345693e-05,
"loss": 1.6264,
"step": 29400
},
{
"epoch": 18.541797611565052,
"grad_norm": 4.760421276092529,
"learning_rate": 1.2917661847894409e-05,
"loss": 1.5465,
"step": 29500
},
{
"epoch": 18.6046511627907,
"grad_norm": 4.158078193664551,
"learning_rate": 1.2791954745443119e-05,
"loss": 1.5533,
"step": 29600
},
{
"epoch": 18.667504714016342,
"grad_norm": 6.8502092361450195,
"learning_rate": 1.266624764299183e-05,
"loss": 1.6525,
"step": 29700
},
{
"epoch": 18.730358265241986,
"grad_norm": 4.013594150543213,
"learning_rate": 1.254054054054054e-05,
"loss": 1.5357,
"step": 29800
},
{
"epoch": 18.79321181646763,
"grad_norm": 6.064908981323242,
"learning_rate": 1.2414833438089252e-05,
"loss": 1.5659,
"step": 29900
},
{
"epoch": 18.856065367693276,
"grad_norm": 5.281710624694824,
"learning_rate": 1.2289126335637964e-05,
"loss": 1.4692,
"step": 30000
},
{
"epoch": 18.91891891891892,
"grad_norm": 4.661835193634033,
"learning_rate": 1.2163419233186674e-05,
"loss": 1.5126,
"step": 30100
},
{
"epoch": 18.981772470144563,
"grad_norm": 3.9490227699279785,
"learning_rate": 1.2037712130735388e-05,
"loss": 1.5389,
"step": 30200
},
{
"epoch": 19.0,
"eval_loss": 1.5563335418701172,
"eval_runtime": 20.3631,
"eval_samples_per_second": 47.046,
"eval_steps_per_second": 5.893,
"step": 30229
},
{
"epoch": 19.044626021370206,
"grad_norm": 4.6667866706848145,
"learning_rate": 1.1912005028284098e-05,
"loss": 1.5508,
"step": 30300
},
{
"epoch": 19.107479572595853,
"grad_norm": 4.471792697906494,
"learning_rate": 1.1786297925832811e-05,
"loss": 1.5253,
"step": 30400
},
{
"epoch": 19.170333123821496,
"grad_norm": 4.01970100402832,
"learning_rate": 1.1660590823381521e-05,
"loss": 1.5047,
"step": 30500
},
{
"epoch": 19.23318667504714,
"grad_norm": 5.021801471710205,
"learning_rate": 1.1534883720930233e-05,
"loss": 1.5459,
"step": 30600
},
{
"epoch": 19.296040226272783,
"grad_norm": 4.681889533996582,
"learning_rate": 1.1409176618478945e-05,
"loss": 1.561,
"step": 30700
},
{
"epoch": 19.35889377749843,
"grad_norm": 4.114772319793701,
"learning_rate": 1.1283469516027655e-05,
"loss": 1.532,
"step": 30800
},
{
"epoch": 19.421747328724074,
"grad_norm": 3.9337844848632812,
"learning_rate": 1.1157762413576367e-05,
"loss": 1.5512,
"step": 30900
},
{
"epoch": 19.484600879949717,
"grad_norm": 4.935436725616455,
"learning_rate": 1.1032055311125079e-05,
"loss": 1.5328,
"step": 31000
},
{
"epoch": 19.54745443117536,
"grad_norm": 5.703494071960449,
"learning_rate": 1.090634820867379e-05,
"loss": 1.5889,
"step": 31100
},
{
"epoch": 19.610307982401007,
"grad_norm": 6.010659217834473,
"learning_rate": 1.0780641106222502e-05,
"loss": 1.5166,
"step": 31200
},
{
"epoch": 19.67316153362665,
"grad_norm": 5.14444637298584,
"learning_rate": 1.0654934003771214e-05,
"loss": 1.5096,
"step": 31300
},
{
"epoch": 19.736015084852294,
"grad_norm": 7.321188449859619,
"learning_rate": 1.0529226901319924e-05,
"loss": 1.4865,
"step": 31400
},
{
"epoch": 19.798868636077938,
"grad_norm": 3.7702994346618652,
"learning_rate": 1.0403519798868636e-05,
"loss": 1.5122,
"step": 31500
},
{
"epoch": 19.86172218730358,
"grad_norm": 5.493444442749023,
"learning_rate": 1.0277812696417348e-05,
"loss": 1.4974,
"step": 31600
},
{
"epoch": 19.924575738529228,
"grad_norm": 5.273486137390137,
"learning_rate": 1.015210559396606e-05,
"loss": 1.5619,
"step": 31700
},
{
"epoch": 19.98742928975487,
"grad_norm": 4.340183734893799,
"learning_rate": 1.0026398491514772e-05,
"loss": 1.4476,
"step": 31800
},
{
"epoch": 20.0,
"eval_loss": 1.5459223985671997,
"eval_runtime": 20.3264,
"eval_samples_per_second": 47.131,
"eval_steps_per_second": 5.904,
"step": 31820
},
{
"epoch": 20.050282840980515,
"grad_norm": 3.8120639324188232,
"learning_rate": 9.900691389063482e-06,
"loss": 1.4837,
"step": 31900
},
{
"epoch": 20.113136392206158,
"grad_norm": 4.154244899749756,
"learning_rate": 9.774984286612195e-06,
"loss": 1.4684,
"step": 32000
},
{
"epoch": 20.175989943431805,
"grad_norm": 3.925746202468872,
"learning_rate": 9.649277184160905e-06,
"loss": 1.4685,
"step": 32100
},
{
"epoch": 20.23884349465745,
"grad_norm": 5.944131374359131,
"learning_rate": 9.523570081709617e-06,
"loss": 1.5097,
"step": 32200
},
{
"epoch": 20.301697045883092,
"grad_norm": 4.755185127258301,
"learning_rate": 9.397862979258329e-06,
"loss": 1.4334,
"step": 32300
},
{
"epoch": 20.364550597108735,
"grad_norm": 4.627038478851318,
"learning_rate": 9.27215587680704e-06,
"loss": 1.503,
"step": 32400
},
{
"epoch": 20.427404148334382,
"grad_norm": 9.863165855407715,
"learning_rate": 9.14644877435575e-06,
"loss": 1.4607,
"step": 32500
},
{
"epoch": 20.490257699560026,
"grad_norm": 4.401854991912842,
"learning_rate": 9.020741671904463e-06,
"loss": 1.4653,
"step": 32600
},
{
"epoch": 20.55311125078567,
"grad_norm": 6.041737079620361,
"learning_rate": 8.895034569453174e-06,
"loss": 1.504,
"step": 32700
},
{
"epoch": 20.615964802011312,
"grad_norm": 6.523427963256836,
"learning_rate": 8.769327467001886e-06,
"loss": 1.6205,
"step": 32800
},
{
"epoch": 20.67881835323696,
"grad_norm": 5.47548246383667,
"learning_rate": 8.643620364550598e-06,
"loss": 1.4491,
"step": 32900
},
{
"epoch": 20.741671904462603,
"grad_norm": 5.3726959228515625,
"learning_rate": 8.517913262099308e-06,
"loss": 1.5817,
"step": 33000
},
{
"epoch": 20.804525455688246,
"grad_norm": 3.872283935546875,
"learning_rate": 8.392206159648022e-06,
"loss": 1.5482,
"step": 33100
},
{
"epoch": 20.86737900691389,
"grad_norm": 4.935946464538574,
"learning_rate": 8.266499057196732e-06,
"loss": 1.5006,
"step": 33200
},
{
"epoch": 20.930232558139537,
"grad_norm": 6.805904388427734,
"learning_rate": 8.140791954745444e-06,
"loss": 1.5314,
"step": 33300
},
{
"epoch": 20.99308610936518,
"grad_norm": 4.420083522796631,
"learning_rate": 8.015084852294155e-06,
"loss": 1.5417,
"step": 33400
},
{
"epoch": 21.0,
"eval_loss": 1.5356966257095337,
"eval_runtime": 20.4137,
"eval_samples_per_second": 46.929,
"eval_steps_per_second": 5.878,
"step": 33411
},
{
"epoch": 21.055939660590823,
"grad_norm": 3.697171688079834,
"learning_rate": 7.889377749842865e-06,
"loss": 1.4994,
"step": 33500
},
{
"epoch": 21.118793211816467,
"grad_norm": 5.232399940490723,
"learning_rate": 7.763670647391579e-06,
"loss": 1.5351,
"step": 33600
},
{
"epoch": 21.18164676304211,
"grad_norm": 4.508577823638916,
"learning_rate": 7.637963544940289e-06,
"loss": 1.4301,
"step": 33700
},
{
"epoch": 21.244500314267757,
"grad_norm": 5.425107479095459,
"learning_rate": 7.512256442489001e-06,
"loss": 1.4739,
"step": 33800
},
{
"epoch": 21.3073538654934,
"grad_norm": 6.195432186126709,
"learning_rate": 7.386549340037713e-06,
"loss": 1.5458,
"step": 33900
},
{
"epoch": 21.370207416719044,
"grad_norm": 5.850045204162598,
"learning_rate": 7.260842237586424e-06,
"loss": 1.5189,
"step": 34000
},
{
"epoch": 21.433060967944687,
"grad_norm": 7.121579170227051,
"learning_rate": 7.135135135135136e-06,
"loss": 1.5273,
"step": 34100
},
{
"epoch": 21.495914519170334,
"grad_norm": 4.316208362579346,
"learning_rate": 7.009428032683847e-06,
"loss": 1.4437,
"step": 34200
},
{
"epoch": 21.558768070395978,
"grad_norm": 4.3052873611450195,
"learning_rate": 6.883720930232558e-06,
"loss": 1.4266,
"step": 34300
},
{
"epoch": 21.62162162162162,
"grad_norm": 4.691330432891846,
"learning_rate": 6.758013827781271e-06,
"loss": 1.422,
"step": 34400
},
{
"epoch": 21.684475172847264,
"grad_norm": 4.346444129943848,
"learning_rate": 6.632306725329982e-06,
"loss": 1.5511,
"step": 34500
},
{
"epoch": 21.74732872407291,
"grad_norm": 5.304843902587891,
"learning_rate": 6.506599622878693e-06,
"loss": 1.4961,
"step": 34600
},
{
"epoch": 21.810182275298555,
"grad_norm": 4.877419948577881,
"learning_rate": 6.3808925204274045e-06,
"loss": 1.4837,
"step": 34700
},
{
"epoch": 21.873035826524198,
"grad_norm": 4.086881637573242,
"learning_rate": 6.2551854179761155e-06,
"loss": 1.5164,
"step": 34800
},
{
"epoch": 21.93588937774984,
"grad_norm": 4.570976734161377,
"learning_rate": 6.129478315524827e-06,
"loss": 1.4681,
"step": 34900
},
{
"epoch": 21.99874292897549,
"grad_norm": 25.407676696777344,
"learning_rate": 6.003771213073539e-06,
"loss": 1.4062,
"step": 35000
},
{
"epoch": 22.0,
"eval_loss": 1.5373815298080444,
"eval_runtime": 20.3495,
"eval_samples_per_second": 47.077,
"eval_steps_per_second": 5.897,
"step": 35002
},
{
"epoch": 22.061596480201132,
"grad_norm": 4.965208053588867,
"learning_rate": 5.878064110622251e-06,
"loss": 1.446,
"step": 35100
},
{
"epoch": 22.124450031426775,
"grad_norm": 5.620969772338867,
"learning_rate": 5.752357008170962e-06,
"loss": 1.475,
"step": 35200
},
{
"epoch": 22.18730358265242,
"grad_norm": 4.315845489501953,
"learning_rate": 5.626649905719674e-06,
"loss": 1.4866,
"step": 35300
},
{
"epoch": 22.250157133878066,
"grad_norm": 4.076879501342773,
"learning_rate": 5.5009428032683854e-06,
"loss": 1.5079,
"step": 35400
},
{
"epoch": 22.31301068510371,
"grad_norm": 9.52351188659668,
"learning_rate": 5.375235700817096e-06,
"loss": 1.5637,
"step": 35500
},
{
"epoch": 22.375864236329353,
"grad_norm": 5.529058933258057,
"learning_rate": 5.249528598365807e-06,
"loss": 1.4702,
"step": 35600
},
{
"epoch": 22.438717787554996,
"grad_norm": 4.761877536773682,
"learning_rate": 5.123821495914519e-06,
"loss": 1.4367,
"step": 35700
},
{
"epoch": 22.501571338780643,
"grad_norm": 6.587429046630859,
"learning_rate": 4.998114393463231e-06,
"loss": 1.4052,
"step": 35800
},
{
"epoch": 22.564424890006286,
"grad_norm": 5.834304332733154,
"learning_rate": 4.872407291011943e-06,
"loss": 1.4186,
"step": 35900
},
{
"epoch": 22.62727844123193,
"grad_norm": 3.871225595474243,
"learning_rate": 4.746700188560654e-06,
"loss": 1.51,
"step": 36000
},
{
"epoch": 22.690131992457573,
"grad_norm": 3.876692771911621,
"learning_rate": 4.6209930861093655e-06,
"loss": 1.5022,
"step": 36100
},
{
"epoch": 22.752985543683216,
"grad_norm": 4.569952964782715,
"learning_rate": 4.495285983658077e-06,
"loss": 1.454,
"step": 36200
},
{
"epoch": 22.815839094908863,
"grad_norm": 5.837776184082031,
"learning_rate": 4.369578881206788e-06,
"loss": 1.4472,
"step": 36300
},
{
"epoch": 22.878692646134507,
"grad_norm": 5.9942426681518555,
"learning_rate": 4.243871778755499e-06,
"loss": 1.4198,
"step": 36400
},
{
"epoch": 22.94154619736015,
"grad_norm": 4.1033220291137695,
"learning_rate": 4.118164676304211e-06,
"loss": 1.4658,
"step": 36500
},
{
"epoch": 23.0,
"eval_loss": 1.5307875871658325,
"eval_runtime": 20.3299,
"eval_samples_per_second": 47.123,
"eval_steps_per_second": 5.903,
"step": 36593
},
{
"epoch": 23.004399748585794,
"grad_norm": 4.649007320404053,
"learning_rate": 3.992457573852923e-06,
"loss": 1.4064,
"step": 36600
},
{
"epoch": 23.06725329981144,
"grad_norm": 4.318711757659912,
"learning_rate": 3.866750471401635e-06,
"loss": 1.4249,
"step": 36700
},
{
"epoch": 23.130106851037084,
"grad_norm": 6.213062286376953,
"learning_rate": 3.7410433689503456e-06,
"loss": 1.4317,
"step": 36800
},
{
"epoch": 23.192960402262727,
"grad_norm": 4.529442310333252,
"learning_rate": 3.6153362664990574e-06,
"loss": 1.5102,
"step": 36900
},
{
"epoch": 23.25581395348837,
"grad_norm": 4.912539005279541,
"learning_rate": 3.4896291640477688e-06,
"loss": 1.4684,
"step": 37000
},
{
"epoch": 23.318667504714018,
"grad_norm": 4.593921661376953,
"learning_rate": 3.3639220615964806e-06,
"loss": 1.4181,
"step": 37100
},
{
"epoch": 23.38152105593966,
"grad_norm": 5.35049295425415,
"learning_rate": 3.2382149591451915e-06,
"loss": 1.4813,
"step": 37200
},
{
"epoch": 23.444374607165305,
"grad_norm": 4.00051212310791,
"learning_rate": 3.1125078566939033e-06,
"loss": 1.4392,
"step": 37300
},
{
"epoch": 23.507228158390948,
"grad_norm": 5.91484260559082,
"learning_rate": 2.9868007542426147e-06,
"loss": 1.4386,
"step": 37400
},
{
"epoch": 23.570081709616595,
"grad_norm": 7.114585876464844,
"learning_rate": 2.861093651791326e-06,
"loss": 1.4115,
"step": 37500
},
{
"epoch": 23.63293526084224,
"grad_norm": 2.977877378463745,
"learning_rate": 2.735386549340038e-06,
"loss": 1.4211,
"step": 37600
},
{
"epoch": 23.69578881206788,
"grad_norm": 3.83953857421875,
"learning_rate": 2.6096794468887493e-06,
"loss": 1.4601,
"step": 37700
},
{
"epoch": 23.758642363293525,
"grad_norm": 4.377187728881836,
"learning_rate": 2.483972344437461e-06,
"loss": 1.4281,
"step": 37800
},
{
"epoch": 23.821495914519172,
"grad_norm": 3.9868085384368896,
"learning_rate": 2.358265241986172e-06,
"loss": 1.4585,
"step": 37900
},
{
"epoch": 23.884349465744815,
"grad_norm": 3.989767551422119,
"learning_rate": 2.232558139534884e-06,
"loss": 1.5302,
"step": 38000
},
{
"epoch": 23.94720301697046,
"grad_norm": 4.481296062469482,
"learning_rate": 2.1068510370835952e-06,
"loss": 1.4366,
"step": 38100
},
{
"epoch": 24.0,
"eval_loss": 1.5289642810821533,
"eval_runtime": 20.3269,
"eval_samples_per_second": 47.13,
"eval_steps_per_second": 5.904,
"step": 38184
},
{
"epoch": 24.010056568196102,
"grad_norm": 4.909224033355713,
"learning_rate": 1.981143934632307e-06,
"loss": 1.4956,
"step": 38200
},
{
"epoch": 24.072910119421746,
"grad_norm": 4.9214372634887695,
"learning_rate": 1.8554368321810182e-06,
"loss": 1.4725,
"step": 38300
},
{
"epoch": 24.135763670647393,
"grad_norm": 4.345515251159668,
"learning_rate": 1.7297297297297298e-06,
"loss": 1.4407,
"step": 38400
},
{
"epoch": 24.198617221873036,
"grad_norm": 4.926340579986572,
"learning_rate": 1.6040226272784412e-06,
"loss": 1.5008,
"step": 38500
},
{
"epoch": 24.26147077309868,
"grad_norm": 4.5064263343811035,
"learning_rate": 1.4783155248271527e-06,
"loss": 1.4868,
"step": 38600
},
{
"epoch": 24.324324324324323,
"grad_norm": 5.347716808319092,
"learning_rate": 1.3526084223758643e-06,
"loss": 1.45,
"step": 38700
},
{
"epoch": 24.38717787554997,
"grad_norm": 5.024169921875,
"learning_rate": 1.2269013199245757e-06,
"loss": 1.3905,
"step": 38800
},
{
"epoch": 24.450031426775613,
"grad_norm": 4.319692611694336,
"learning_rate": 1.1011942174732873e-06,
"loss": 1.4671,
"step": 38900
},
{
"epoch": 24.512884978001257,
"grad_norm": 2.880321979522705,
"learning_rate": 9.75487115021999e-07,
"loss": 1.4211,
"step": 39000
},
{
"epoch": 24.5757385292269,
"grad_norm": 4.416039943695068,
"learning_rate": 8.497800125707103e-07,
"loss": 1.4176,
"step": 39100
},
{
"epoch": 24.638592080452547,
"grad_norm": 4.598896503448486,
"learning_rate": 7.240729101194218e-07,
"loss": 1.4194,
"step": 39200
},
{
"epoch": 24.70144563167819,
"grad_norm": 4.256235599517822,
"learning_rate": 5.983658076681333e-07,
"loss": 1.4331,
"step": 39300
},
{
"epoch": 24.764299182903834,
"grad_norm": 4.7764811515808105,
"learning_rate": 4.726587052168448e-07,
"loss": 1.4491,
"step": 39400
},
{
"epoch": 24.827152734129477,
"grad_norm": 4.296844005584717,
"learning_rate": 3.4695160276555627e-07,
"loss": 1.4443,
"step": 39500
},
{
"epoch": 24.890006285355124,
"grad_norm": 3.9589693546295166,
"learning_rate": 2.2124450031426776e-07,
"loss": 1.4612,
"step": 39600
},
{
"epoch": 24.952859836580767,
"grad_norm": 4.165828227996826,
"learning_rate": 9.553739786297926e-08,
"loss": 1.48,
"step": 39700
},
{
"epoch": 25.0,
"eval_loss": 1.528791069984436,
"eval_runtime": 20.2887,
"eval_samples_per_second": 47.218,
"eval_steps_per_second": 5.915,
"step": 39775
}
],
"logging_steps": 100,
"max_steps": 39775,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.44418915549184e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}