{
"best_global_step": 1529,
"best_metric": 0.15474164485931396,
"best_model_checkpoint": "./results_ner_lora_entity_aware/checkpoint-1529",
"epoch": 1.4903330625507718,
"eval_steps": 1529,
"global_step": 4587,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003249390739236393,
"grad_norm": 0.8849583864212036,
"learning_rate": 1.461038961038961e-06,
"loss": 0.2156,
"step": 10
},
{
"epoch": 0.006498781478472786,
"grad_norm": 0.865159809589386,
"learning_rate": 3.0844155844155847e-06,
"loss": 0.2016,
"step": 20
},
{
"epoch": 0.00974817221770918,
"grad_norm": 0.8396425247192383,
"learning_rate": 4.707792207792208e-06,
"loss": 0.174,
"step": 30
},
{
"epoch": 0.012997562956945572,
"grad_norm": 0.83138507604599,
"learning_rate": 6.331168831168831e-06,
"loss": 0.1586,
"step": 40
},
{
"epoch": 0.016246953696181964,
"grad_norm": 0.8232032060623169,
"learning_rate": 7.954545454545455e-06,
"loss": 0.1361,
"step": 50
},
{
"epoch": 0.01949634443541836,
"grad_norm": 0.9345568418502808,
"learning_rate": 9.577922077922078e-06,
"loss": 0.1184,
"step": 60
},
{
"epoch": 0.022745735174654752,
"grad_norm": 1.0010511875152588,
"learning_rate": 1.1201298701298701e-05,
"loss": 0.1081,
"step": 70
},
{
"epoch": 0.025995125913891144,
"grad_norm": 1.1553717851638794,
"learning_rate": 1.2824675324675326e-05,
"loss": 0.1003,
"step": 80
},
{
"epoch": 0.02924451665312754,
"grad_norm": 0.9876243472099304,
"learning_rate": 1.4448051948051949e-05,
"loss": 0.0857,
"step": 90
},
{
"epoch": 0.03249390739236393,
"grad_norm": 0.9326199293136597,
"learning_rate": 1.6071428571428572e-05,
"loss": 0.0828,
"step": 100
},
{
"epoch": 0.03574329813160033,
"grad_norm": 0.9165641665458679,
"learning_rate": 1.7694805194805193e-05,
"loss": 0.0862,
"step": 110
},
{
"epoch": 0.03899268887083672,
"grad_norm": 1.2718145847320557,
"learning_rate": 1.9318181818181818e-05,
"loss": 0.0833,
"step": 120
},
{
"epoch": 0.04224207961007311,
"grad_norm": 1.0397502183914185,
"learning_rate": 2.0941558441558443e-05,
"loss": 0.0765,
"step": 130
},
{
"epoch": 0.045491470349309504,
"grad_norm": 0.9659145474433899,
"learning_rate": 2.2564935064935067e-05,
"loss": 0.0733,
"step": 140
},
{
"epoch": 0.048740861088545896,
"grad_norm": 0.8223243355751038,
"learning_rate": 2.4188311688311692e-05,
"loss": 0.0742,
"step": 150
},
{
"epoch": 0.05199025182778229,
"grad_norm": 0.7280202507972717,
"learning_rate": 2.5811688311688314e-05,
"loss": 0.0739,
"step": 160
},
{
"epoch": 0.05523964256701868,
"grad_norm": 0.8244697451591492,
"learning_rate": 2.7435064935064935e-05,
"loss": 0.0678,
"step": 170
},
{
"epoch": 0.05848903330625508,
"grad_norm": 0.633224368095398,
"learning_rate": 2.9058441558441563e-05,
"loss": 0.0667,
"step": 180
},
{
"epoch": 0.06173842404549147,
"grad_norm": 1.4761995077133179,
"learning_rate": 3.068181818181818e-05,
"loss": 0.0676,
"step": 190
},
{
"epoch": 0.06498781478472786,
"grad_norm": 0.9232731461524963,
"learning_rate": 3.2305194805194806e-05,
"loss": 0.0714,
"step": 200
},
{
"epoch": 0.06823720552396426,
"grad_norm": 0.7005584239959717,
"learning_rate": 3.392857142857143e-05,
"loss": 0.0649,
"step": 210
},
{
"epoch": 0.07148659626320066,
"grad_norm": 1.1675697565078735,
"learning_rate": 3.5551948051948055e-05,
"loss": 0.0661,
"step": 220
},
{
"epoch": 0.07473598700243704,
"grad_norm": 0.5797019004821777,
"learning_rate": 3.717532467532468e-05,
"loss": 0.067,
"step": 230
},
{
"epoch": 0.07798537774167344,
"grad_norm": 0.5873407125473022,
"learning_rate": 3.87987012987013e-05,
"loss": 0.0615,
"step": 240
},
{
"epoch": 0.08123476848090982,
"grad_norm": 0.7156246900558472,
"learning_rate": 4.042207792207792e-05,
"loss": 0.0675,
"step": 250
},
{
"epoch": 0.08448415922014622,
"grad_norm": 0.6001958847045898,
"learning_rate": 4.204545454545455e-05,
"loss": 0.0609,
"step": 260
},
{
"epoch": 0.08773354995938261,
"grad_norm": 0.5709946751594543,
"learning_rate": 4.366883116883117e-05,
"loss": 0.0616,
"step": 270
},
{
"epoch": 0.09098294069861901,
"grad_norm": 0.5521292686462402,
"learning_rate": 4.52922077922078e-05,
"loss": 0.0635,
"step": 280
},
{
"epoch": 0.09423233143785541,
"grad_norm": 0.877484917640686,
"learning_rate": 4.691558441558442e-05,
"loss": 0.0614,
"step": 290
},
{
"epoch": 0.09748172217709179,
"grad_norm": 0.6828113794326782,
"learning_rate": 4.853896103896104e-05,
"loss": 0.0605,
"step": 300
},
{
"epoch": 0.10073111291632819,
"grad_norm": 0.5761043429374695,
"learning_rate": 4.999999639259403e-05,
"loss": 0.062,
"step": 310
},
{
"epoch": 0.10398050365556458,
"grad_norm": 0.6017851829528809,
"learning_rate": 4.999956350513694e-05,
"loss": 0.0629,
"step": 320
},
{
"epoch": 0.10722989439480098,
"grad_norm": 0.4347151517868042,
"learning_rate": 4.999840915079987e-05,
"loss": 0.06,
"step": 330
},
{
"epoch": 0.11047928513403736,
"grad_norm": 0.49241673946380615,
"learning_rate": 4.9996533362896526e-05,
"loss": 0.0625,
"step": 340
},
{
"epoch": 0.11372867587327376,
"grad_norm": 0.45582103729248047,
"learning_rate": 4.999393619556062e-05,
"loss": 0.0599,
"step": 350
},
{
"epoch": 0.11697806661251016,
"grad_norm": 0.4913433790206909,
"learning_rate": 4.999061772374426e-05,
"loss": 0.062,
"step": 360
},
{
"epoch": 0.12022745735174654,
"grad_norm": 0.47562339901924133,
"learning_rate": 4.998657804321582e-05,
"loss": 0.0599,
"step": 370
},
{
"epoch": 0.12347684809098294,
"grad_norm": 0.5014773607254028,
"learning_rate": 4.9981817270557166e-05,
"loss": 0.0538,
"step": 380
},
{
"epoch": 0.12672623883021933,
"grad_norm": 0.5543435215950012,
"learning_rate": 4.9976335543160284e-05,
"loss": 0.0628,
"step": 390
},
{
"epoch": 0.12997562956945571,
"grad_norm": 0.5599291920661926,
"learning_rate": 4.997013301922333e-05,
"loss": 0.0567,
"step": 400
},
{
"epoch": 0.13322502030869213,
"grad_norm": 0.46465206146240234,
"learning_rate": 4.996320987774606e-05,
"loss": 0.0612,
"step": 410
},
{
"epoch": 0.1364744110479285,
"grad_norm": 0.5900648236274719,
"learning_rate": 4.995556631852464e-05,
"loss": 0.0536,
"step": 420
},
{
"epoch": 0.1397238017871649,
"grad_norm": 0.402570903301239,
"learning_rate": 4.9947202562145925e-05,
"loss": 0.0564,
"step": 430
},
{
"epoch": 0.1429731925264013,
"grad_norm": 0.45116308331489563,
"learning_rate": 4.9938118849981054e-05,
"loss": 0.0542,
"step": 440
},
{
"epoch": 0.1462225832656377,
"grad_norm": 0.41757047176361084,
"learning_rate": 4.9928315444178496e-05,
"loss": 0.0527,
"step": 450
},
{
"epoch": 0.14947197400487408,
"grad_norm": 0.5318567156791687,
"learning_rate": 4.991779262765651e-05,
"loss": 0.0533,
"step": 460
},
{
"epoch": 0.15272136474411047,
"grad_norm": 0.44866418838500977,
"learning_rate": 4.9906550704094935e-05,
"loss": 0.0544,
"step": 470
},
{
"epoch": 0.15597075548334688,
"grad_norm": 0.4166482985019684,
"learning_rate": 4.989458999792645e-05,
"loss": 0.0541,
"step": 480
},
{
"epoch": 0.15922014622258326,
"grad_norm": 0.4640562832355499,
"learning_rate": 4.988191085432722e-05,
"loss": 0.0544,
"step": 490
},
{
"epoch": 0.16246953696181965,
"grad_norm": 0.4704243242740631,
"learning_rate": 4.986851363920693e-05,
"loss": 0.0519,
"step": 500
},
{
"epoch": 0.16571892770105606,
"grad_norm": 0.5697548985481262,
"learning_rate": 4.9854398739198195e-05,
"loss": 0.0527,
"step": 510
},
{
"epoch": 0.16896831844029245,
"grad_norm": 0.5503994822502136,
"learning_rate": 4.983956656164545e-05,
"loss": 0.0502,
"step": 520
},
{
"epoch": 0.17221770917952883,
"grad_norm": 0.6677025556564331,
"learning_rate": 4.982401753459317e-05,
"loss": 0.0563,
"step": 530
},
{
"epoch": 0.17546709991876522,
"grad_norm": 0.39828023314476013,
"learning_rate": 4.98077521067735e-05,
"loss": 0.0557,
"step": 540
},
{
"epoch": 0.17871649065800163,
"grad_norm": 0.40971750020980835,
"learning_rate": 4.979077074759333e-05,
"loss": 0.0507,
"step": 550
},
{
"epoch": 0.18196588139723802,
"grad_norm": 0.44697558879852295,
"learning_rate": 4.9773073947120765e-05,
"loss": 0.0533,
"step": 560
},
{
"epoch": 0.1852152721364744,
"grad_norm": 0.5169378519058228,
"learning_rate": 4.975466221607092e-05,
"loss": 0.0511,
"step": 570
},
{
"epoch": 0.18846466287571081,
"grad_norm": 0.43874937295913696,
"learning_rate": 4.973553608579125e-05,
"loss": 0.0508,
"step": 580
},
{
"epoch": 0.1917140536149472,
"grad_norm": 0.3906107246875763,
"learning_rate": 4.971569610824616e-05,
"loss": 0.0558,
"step": 590
},
{
"epoch": 0.19496344435418358,
"grad_norm": 0.4098195433616638,
"learning_rate": 4.969514285600113e-05,
"loss": 0.0545,
"step": 600
},
{
"epoch": 0.19821283509341997,
"grad_norm": 0.39674079418182373,
"learning_rate": 4.967387692220614e-05,
"loss": 0.0487,
"step": 610
},
{
"epoch": 0.20146222583265638,
"grad_norm": 0.4392511248588562,
"learning_rate": 4.965189892057859e-05,
"loss": 0.0487,
"step": 620
},
{
"epoch": 0.20471161657189277,
"grad_norm": 0.36493393778800964,
"learning_rate": 4.962920948538555e-05,
"loss": 0.0532,
"step": 630
},
{
"epoch": 0.20796100731112915,
"grad_norm": 0.44200050830841064,
"learning_rate": 4.9605809271425504e-05,
"loss": 0.0492,
"step": 640
},
{
"epoch": 0.21121039805036557,
"grad_norm": 0.42539969086647034,
"learning_rate": 4.9581698954009415e-05,
"loss": 0.051,
"step": 650
},
{
"epoch": 0.21445978878960195,
"grad_norm": 0.39299121499061584,
"learning_rate": 4.9556879228941244e-05,
"loss": 0.0507,
"step": 660
},
{
"epoch": 0.21770917952883834,
"grad_norm": 0.36381906270980835,
"learning_rate": 4.95313508124979e-05,
"loss": 0.0502,
"step": 670
},
{
"epoch": 0.22095857026807472,
"grad_norm": 0.5863086581230164,
"learning_rate": 4.950511444140849e-05,
"loss": 0.05,
"step": 680
},
{
"epoch": 0.22420796100731114,
"grad_norm": 0.35644617676734924,
"learning_rate": 4.9478170872833186e-05,
"loss": 0.0521,
"step": 690
},
{
"epoch": 0.22745735174654752,
"grad_norm": 0.40089258551597595,
"learning_rate": 4.945052088434124e-05,
"loss": 0.0461,
"step": 700
},
{
"epoch": 0.2307067424857839,
"grad_norm": 0.3229173421859741,
"learning_rate": 4.942216527388861e-05,
"loss": 0.0494,
"step": 710
},
{
"epoch": 0.23395613322502032,
"grad_norm": 0.44354256987571716,
"learning_rate": 4.939310485979495e-05,
"loss": 0.0474,
"step": 720
},
{
"epoch": 0.2372055239642567,
"grad_norm": 0.4226253926753998,
"learning_rate": 4.9363340480719954e-05,
"loss": 0.0493,
"step": 730
},
{
"epoch": 0.2404549147034931,
"grad_norm": 0.4496769905090332,
"learning_rate": 4.9332872995639165e-05,
"loss": 0.0442,
"step": 740
},
{
"epoch": 0.2437043054427295,
"grad_norm": 0.41821110248565674,
"learning_rate": 4.930170328381919e-05,
"loss": 0.0497,
"step": 750
},
{
"epoch": 0.2469536961819659,
"grad_norm": 0.32461634278297424,
"learning_rate": 4.9269832244792327e-05,
"loss": 0.0449,
"step": 760
},
{
"epoch": 0.2502030869212023,
"grad_norm": 0.4358930289745331,
"learning_rate": 4.9237260798330593e-05,
"loss": 0.0448,
"step": 770
},
{
"epoch": 0.25345247766043866,
"grad_norm": 0.41930267214775085,
"learning_rate": 4.92039898844192e-05,
"loss": 0.0468,
"step": 780
},
{
"epoch": 0.25670186839967507,
"grad_norm": 0.4221108853816986,
"learning_rate": 4.91700204632294e-05,
"loss": 0.0439,
"step": 790
},
{
"epoch": 0.25995125913891143,
"grad_norm": 0.3895890712738037,
"learning_rate": 4.913535351509082e-05,
"loss": 0.0455,
"step": 800
},
{
"epoch": 0.26320064987814784,
"grad_norm": 0.36097270250320435,
"learning_rate": 4.9099990040463116e-05,
"loss": 0.0457,
"step": 810
},
{
"epoch": 0.26645004061738425,
"grad_norm": 0.32470473647117615,
"learning_rate": 4.906393105990713e-05,
"loss": 0.0461,
"step": 820
},
{
"epoch": 0.2696994313566206,
"grad_norm": 0.3427784740924835,
"learning_rate": 4.9027177614055445e-05,
"loss": 0.0479,
"step": 830
},
{
"epoch": 0.272948822095857,
"grad_norm": 0.47217708826065063,
"learning_rate": 4.898973076358233e-05,
"loss": 0.0465,
"step": 840
},
{
"epoch": 0.27619821283509344,
"grad_norm": 0.36148127913475037,
"learning_rate": 4.8951591589173156e-05,
"loss": 0.0438,
"step": 850
},
{
"epoch": 0.2794476035743298,
"grad_norm": 0.4368058145046234,
"learning_rate": 4.891276119149318e-05,
"loss": 0.0415,
"step": 860
},
{
"epoch": 0.2826969943135662,
"grad_norm": 0.42918527126312256,
"learning_rate": 4.887324069115581e-05,
"loss": 0.0446,
"step": 870
},
{
"epoch": 0.2859463850528026,
"grad_norm": 0.3209812641143799,
"learning_rate": 4.883303122869024e-05,
"loss": 0.0465,
"step": 880
},
{
"epoch": 0.289195775792039,
"grad_norm": 0.44423332810401917,
"learning_rate": 4.879213396450854e-05,
"loss": 0.0497,
"step": 890
},
{
"epoch": 0.2924451665312754,
"grad_norm": 0.4171295762062073,
"learning_rate": 4.8750550078872195e-05,
"loss": 0.0485,
"step": 900
},
{
"epoch": 0.2956945572705118,
"grad_norm": 0.27637121081352234,
"learning_rate": 4.8708280771858e-05,
"loss": 0.046,
"step": 910
},
{
"epoch": 0.29894394800974816,
"grad_norm": 0.3019394874572754,
"learning_rate": 4.8665327263323445e-05,
"loss": 0.045,
"step": 920
},
{
"epoch": 0.3021933387489846,
"grad_norm": 0.39697766304016113,
"learning_rate": 4.862169079287154e-05,
"loss": 0.0415,
"step": 930
},
{
"epoch": 0.30544272948822093,
"grad_norm": 0.4019138813018799,
"learning_rate": 4.8577372619815e-05,
"loss": 0.0433,
"step": 940
},
{
"epoch": 0.30869212022745735,
"grad_norm": 0.2970142960548401,
"learning_rate": 4.85323740231399e-05,
"loss": 0.0482,
"step": 950
},
{
"epoch": 0.31194151096669376,
"grad_norm": 0.3531886041164398,
"learning_rate": 4.848669630146882e-05,
"loss": 0.0439,
"step": 960
},
{
"epoch": 0.3151909017059301,
"grad_norm": 0.5152345895767212,
"learning_rate": 4.8440340773023295e-05,
"loss": 0.0408,
"step": 970
},
{
"epoch": 0.31844029244516653,
"grad_norm": 0.3633795976638794,
"learning_rate": 4.839330877558583e-05,
"loss": 0.0436,
"step": 980
},
{
"epoch": 0.32168968318440294,
"grad_norm": 0.43122872710227966,
"learning_rate": 4.834560166646126e-05,
"loss": 0.0438,
"step": 990
},
{
"epoch": 0.3249390739236393,
"grad_norm": 0.2737273871898651,
"learning_rate": 4.8297220822437594e-05,
"loss": 0.0424,
"step": 1000
},
{
"epoch": 0.3281884646628757,
"grad_norm": 0.36643481254577637,
"learning_rate": 4.824816763974626e-05,
"loss": 0.0408,
"step": 1010
},
{
"epoch": 0.3314378554021121,
"grad_norm": 0.37588241696357727,
"learning_rate": 4.8198443534021856e-05,
"loss": 0.0432,
"step": 1020
},
{
"epoch": 0.3346872461413485,
"grad_norm": 0.4051400423049927,
"learning_rate": 4.8148049940261244e-05,
"loss": 0.0454,
"step": 1030
},
{
"epoch": 0.3379366368805849,
"grad_norm": 0.42694568634033203,
"learning_rate": 4.8096988312782174e-05,
"loss": 0.0427,
"step": 1040
},
{
"epoch": 0.3411860276198213,
"grad_norm": 0.3985839784145355,
"learning_rate": 4.804526012518129e-05,
"loss": 0.0429,
"step": 1050
},
{
"epoch": 0.34443541835905767,
"grad_norm": 0.34616586565971375,
"learning_rate": 4.7992866870291645e-05,
"loss": 0.04,
"step": 1060
},
{
"epoch": 0.3476848090982941,
"grad_norm": 0.35213449597358704,
"learning_rate": 4.7939810060139534e-05,
"loss": 0.0435,
"step": 1070
},
{
"epoch": 0.35093419983753044,
"grad_norm": 0.3723832368850708,
"learning_rate": 4.788609122590096e-05,
"loss": 0.046,
"step": 1080
},
{
"epoch": 0.35418359057676685,
"grad_norm": 0.3361580967903137,
"learning_rate": 4.783171191785737e-05,
"loss": 0.0364,
"step": 1090
},
{
"epoch": 0.35743298131600326,
"grad_norm": 0.4315849244594574,
"learning_rate": 4.777667370535097e-05,
"loss": 0.0392,
"step": 1100
},
{
"epoch": 0.3606823720552396,
"grad_norm": 0.36335381865501404,
"learning_rate": 4.772097817673937e-05,
"loss": 0.0375,
"step": 1110
},
{
"epoch": 0.36393176279447603,
"grad_norm": 0.3676309883594513,
"learning_rate": 4.7664626939349823e-05,
"loss": 0.0436,
"step": 1120
},
{
"epoch": 0.36718115353371245,
"grad_norm": 0.45313313603401184,
"learning_rate": 4.760762161943276e-05,
"loss": 0.0407,
"step": 1130
},
{
"epoch": 0.3704305442729488,
"grad_norm": 0.3923757076263428,
"learning_rate": 4.7549963862114934e-05,
"loss": 0.0404,
"step": 1140
},
{
"epoch": 0.3736799350121852,
"grad_norm": 0.3564886152744293,
"learning_rate": 4.7491655331351884e-05,
"loss": 0.0417,
"step": 1150
},
{
"epoch": 0.37692932575142163,
"grad_norm": 0.39160194993019104,
"learning_rate": 4.7432697709879925e-05,
"loss": 0.0405,
"step": 1160
},
{
"epoch": 0.380178716490658,
"grad_norm": 0.3885195553302765,
"learning_rate": 4.737309269916763e-05,
"loss": 0.0351,
"step": 1170
},
{
"epoch": 0.3834281072298944,
"grad_norm": 0.37907102704048157,
"learning_rate": 4.7312842019366684e-05,
"loss": 0.0365,
"step": 1180
},
{
"epoch": 0.3866774979691308,
"grad_norm": 0.39848607778549194,
"learning_rate": 4.725194740926222e-05,
"loss": 0.038,
"step": 1190
},
{
"epoch": 0.38992688870836717,
"grad_norm": 0.46821895241737366,
"learning_rate": 4.719041062622272e-05,
"loss": 0.0405,
"step": 1200
},
{
"epoch": 0.3931762794476036,
"grad_norm": 0.3871254324913025,
"learning_rate": 4.712823344614921e-05,
"loss": 0.038,
"step": 1210
},
{
"epoch": 0.39642567018683994,
"grad_norm": 0.26873621344566345,
"learning_rate": 4.706541766342407e-05,
"loss": 0.0377,
"step": 1220
},
{
"epoch": 0.39967506092607635,
"grad_norm": 1.015006184577942,
"learning_rate": 4.7001965090859224e-05,
"loss": 0.0395,
"step": 1230
},
{
"epoch": 0.40292445166531277,
"grad_norm": 0.4097539186477661,
"learning_rate": 4.693787755964382e-05,
"loss": 0.0305,
"step": 1240
},
{
"epoch": 0.4061738424045491,
"grad_norm": 0.3266974091529846,
"learning_rate": 4.68731569192914e-05,
"loss": 0.034,
"step": 1250
},
{
"epoch": 0.40942323314378554,
"grad_norm": 0.36162036657333374,
"learning_rate": 4.6807805037586514e-05,
"loss": 0.0403,
"step": 1260
},
{
"epoch": 0.41267262388302195,
"grad_norm": 0.35943084955215454,
"learning_rate": 4.674182380053083e-05,
"loss": 0.037,
"step": 1270
},
{
"epoch": 0.4159220146222583,
"grad_norm": 0.5160156488418579,
"learning_rate": 4.667521511228866e-05,
"loss": 0.0364,
"step": 1280
},
{
"epoch": 0.4191714053614947,
"grad_norm": 0.3633994162082672,
"learning_rate": 4.660798089513209e-05,
"loss": 0.0364,
"step": 1290
},
{
"epoch": 0.42242079610073113,
"grad_norm": 0.42284584045410156,
"learning_rate": 4.654012308938542e-05,
"loss": 0.0363,
"step": 1300
},
{
"epoch": 0.4256701868399675,
"grad_norm": 0.3621046245098114,
"learning_rate": 4.6471643653369226e-05,
"loss": 0.0309,
"step": 1310
},
{
"epoch": 0.4289195775792039,
"grad_norm": 0.42647165060043335,
"learning_rate": 4.64025445633438e-05,
"loss": 0.0368,
"step": 1320
},
{
"epoch": 0.4321689683184403,
"grad_norm": 0.5478382706642151,
"learning_rate": 4.6332827813452175e-05,
"loss": 0.0362,
"step": 1330
},
{
"epoch": 0.4354183590576767,
"grad_norm": 0.3689424395561218,
"learning_rate": 4.62624954156625e-05,
"loss": 0.0361,
"step": 1340
},
{
"epoch": 0.4386677497969131,
"grad_norm": 0.3740200698375702,
"learning_rate": 4.619154939971006e-05,
"loss": 0.0369,
"step": 1350
},
{
"epoch": 0.44191714053614944,
"grad_norm": 0.3798083961009979,
"learning_rate": 4.611999181303861e-05,
"loss": 0.0353,
"step": 1360
},
{
"epoch": 0.44516653127538586,
"grad_norm": 0.3334667384624481,
"learning_rate": 4.6047824720741374e-05,
"loss": 0.0371,
"step": 1370
},
{
"epoch": 0.44841592201462227,
"grad_norm": 0.4219120740890503,
"learning_rate": 4.597505020550138e-05,
"loss": 0.036,
"step": 1380
},
{
"epoch": 0.4516653127538586,
"grad_norm": 0.24345648288726807,
"learning_rate": 4.590167036753141e-05,
"loss": 0.0307,
"step": 1390
},
{
"epoch": 0.45491470349309504,
"grad_norm": 0.31113502383232117,
"learning_rate": 4.582768732451334e-05,
"loss": 0.035,
"step": 1400
},
{
"epoch": 0.45816409423233145,
"grad_norm": 0.3489152193069458,
"learning_rate": 4.575310321153706e-05,
"loss": 0.0384,
"step": 1410
},
{
"epoch": 0.4614134849715678,
"grad_norm": 0.3709469139575958,
"learning_rate": 4.567792018103884e-05,
"loss": 0.0374,
"step": 1420
},
{
"epoch": 0.4646628757108042,
"grad_norm": 0.40091830492019653,
"learning_rate": 4.560214040273922e-05,
"loss": 0.036,
"step": 1430
},
{
"epoch": 0.46791226645004064,
"grad_norm": 0.4365979731082916,
"learning_rate": 4.55257660635804e-05,
"loss": 0.0353,
"step": 1440
},
{
"epoch": 0.471161657189277,
"grad_norm": 0.3686366677284241,
"learning_rate": 4.5448799367663096e-05,
"loss": 0.0354,
"step": 1450
},
{
"epoch": 0.4744110479285134,
"grad_norm": 0.33025604486465454,
"learning_rate": 4.537124253618298e-05,
"loss": 0.0345,
"step": 1460
},
{
"epoch": 0.4776604386677498,
"grad_norm": 0.4022039771080017,
"learning_rate": 4.529309780736654e-05,
"loss": 0.0367,
"step": 1470
},
{
"epoch": 0.4809098294069862,
"grad_norm": 0.3877394497394562,
"learning_rate": 4.521436743640648e-05,
"loss": 0.0342,
"step": 1480
},
{
"epoch": 0.4841592201462226,
"grad_norm": 0.4671391546726227,
"learning_rate": 4.51350536953967e-05,
"loss": 0.0337,
"step": 1490
},
{
"epoch": 0.487408610885459,
"grad_norm": 0.40457358956336975,
"learning_rate": 4.505515887326664e-05,
"loss": 0.0302,
"step": 1500
},
{
"epoch": 0.49065800162469536,
"grad_norm": 0.3942878544330597,
"learning_rate": 4.497468527571529e-05,
"loss": 0.0315,
"step": 1510
},
{
"epoch": 0.4939073923639318,
"grad_norm": 0.4007840156555176,
"learning_rate": 4.4893635225144606e-05,
"loss": 0.0301,
"step": 1520
},
{
"epoch": 0.4968318440292445,
"eval_loss": 0.15474164485931396,
"eval_runtime": 733.5407,
"eval_samples_per_second": 3.817,
"eval_steps_per_second": 3.817,
"step": 1529
},
{
"epoch": 0.49715678310316813,
"grad_norm": 0.4904119372367859,
"learning_rate": 4.481201106059251e-05,
"loss": 0.0326,
"step": 1530
},
{
"epoch": 0.5004061738424046,
"grad_norm": 0.41244634985923767,
"learning_rate": 4.472981513766538e-05,
"loss": 0.0296,
"step": 1540
},
{
"epoch": 0.503655564581641,
"grad_norm": 0.4068795442581177,
"learning_rate": 4.464704982847008e-05,
"loss": 0.0345,
"step": 1550
},
{
"epoch": 0.5069049553208773,
"grad_norm": 0.3623734712600708,
"learning_rate": 4.456371752154549e-05,
"loss": 0.0309,
"step": 1560
},
{
"epoch": 0.5101543460601138,
"grad_norm": 0.3473750054836273,
"learning_rate": 4.447982062179358e-05,
"loss": 0.0313,
"step": 1570
},
{
"epoch": 0.5134037367993501,
"grad_norm": 0.4023807644844055,
"learning_rate": 4.439536155041e-05,
"loss": 0.0328,
"step": 1580
},
{
"epoch": 0.5166531275385865,
"grad_norm": 0.3513451814651489,
"learning_rate": 4.4310342744814214e-05,
"loss": 0.0343,
"step": 1590
},
{
"epoch": 0.5199025182778229,
"grad_norm": 0.37276962399482727,
"learning_rate": 4.4224766658579166e-05,
"loss": 0.031,
"step": 1600
},
{
"epoch": 0.5231519090170593,
"grad_norm": 0.456547349691391,
"learning_rate": 4.413863576136044e-05,
"loss": 0.0317,
"step": 1610
},
{
"epoch": 0.5264012997562957,
"grad_norm": 0.3823520541191101,
"learning_rate": 4.4051952538825034e-05,
"loss": 0.0302,
"step": 1620
},
{
"epoch": 0.529650690495532,
"grad_norm": 0.42634570598602295,
"learning_rate": 4.3964719492579584e-05,
"loss": 0.0288,
"step": 1630
},
{
"epoch": 0.5329000812347685,
"grad_norm": 0.33916255831718445,
"learning_rate": 4.387693914009819e-05,
"loss": 0.0298,
"step": 1640
},
{
"epoch": 0.5361494719740049,
"grad_norm": 0.441773921251297,
"learning_rate": 4.3788614014649775e-05,
"loss": 0.033,
"step": 1650
},
{
"epoch": 0.5393988627132412,
"grad_norm": 0.35982316732406616,
"learning_rate": 4.3699746665224945e-05,
"loss": 0.0267,
"step": 1660
},
{
"epoch": 0.5426482534524777,
"grad_norm": 0.3963621258735657,
"learning_rate": 4.3610339656462445e-05,
"loss": 0.0308,
"step": 1670
},
{
"epoch": 0.545897644191714,
"grad_norm": 0.4419509172439575,
"learning_rate": 4.352039556857516e-05,
"loss": 0.0325,
"step": 1680
},
{
"epoch": 0.5491470349309504,
"grad_norm": 0.4380287826061249,
"learning_rate": 4.3429916997275626e-05,
"loss": 0.0297,
"step": 1690
},
{
"epoch": 0.5523964256701869,
"grad_norm": 0.27853381633758545,
"learning_rate": 4.333890655370113e-05,
"loss": 0.0314,
"step": 1700
},
{
"epoch": 0.5556458164094232,
"grad_norm": 0.3319164514541626,
"learning_rate": 4.324736686433837e-05,
"loss": 0.0294,
"step": 1710
},
{
"epoch": 0.5588952071486596,
"grad_norm": 0.5052310824394226,
"learning_rate": 4.315530057094762e-05,
"loss": 0.0314,
"step": 1720
},
{
"epoch": 0.5621445978878961,
"grad_norm": 0.2669266164302826,
"learning_rate": 4.306271033048655e-05,
"loss": 0.0305,
"step": 1730
},
{
"epoch": 0.5653939886271324,
"grad_norm": 0.3192387819290161,
"learning_rate": 4.2969598815033476e-05,
"loss": 0.0274,
"step": 1740
},
{
"epoch": 0.5686433793663688,
"grad_norm": 0.5369754433631897,
"learning_rate": 4.2875968711710286e-05,
"loss": 0.032,
"step": 1750
},
{
"epoch": 0.5718927701056052,
"grad_norm": 0.2641260623931885,
"learning_rate": 4.2781822722604916e-05,
"loss": 0.0272,
"step": 1760
},
{
"epoch": 0.5751421608448416,
"grad_norm": 0.4835808575153351,
"learning_rate": 4.268716356469331e-05,
"loss": 0.0286,
"step": 1770
},
{
"epoch": 0.578391551584078,
"grad_norm": 0.27514582872390747,
"learning_rate": 4.259199396976107e-05,
"loss": 0.0269,
"step": 1780
},
{
"epoch": 0.5816409423233144,
"grad_norm": 0.3719632625579834,
"learning_rate": 4.2496316684324585e-05,
"loss": 0.029,
"step": 1790
},
{
"epoch": 0.5848903330625508,
"grad_norm": 0.36428073048591614,
"learning_rate": 4.2400134469551746e-05,
"loss": 0.0304,
"step": 1800
},
{
"epoch": 0.5881397238017871,
"grad_norm": 0.6899104714393616,
"learning_rate": 4.230345010118233e-05,
"loss": 0.0296,
"step": 1810
},
{
"epoch": 0.5913891145410236,
"grad_norm": 0.3395729660987854,
"learning_rate": 4.220626636944783e-05,
"loss": 0.025,
"step": 1820
},
{
"epoch": 0.59463850528026,
"grad_norm": 0.38969168066978455,
"learning_rate": 4.2108586078990966e-05,
"loss": 0.0248,
"step": 1830
},
{
"epoch": 0.5978878960194963,
"grad_norm": 0.3905599117279053,
"learning_rate": 4.2010412048784733e-05,
"loss": 0.0257,
"step": 1840
},
{
"epoch": 0.6011372867587328,
"grad_norm": 0.3760197162628174,
"learning_rate": 4.191174711205105e-05,
"loss": 0.0278,
"step": 1850
},
{
"epoch": 0.6043866774979691,
"grad_norm": 0.4263547956943512,
"learning_rate": 4.181259411617898e-05,
"loss": 0.0248,
"step": 1860
},
{
"epoch": 0.6076360682372055,
"grad_norm": 0.46628740429878235,
"learning_rate": 4.1712955922642614e-05,
"loss": 0.0274,
"step": 1870
},
{
"epoch": 0.6108854589764419,
"grad_norm": 0.46346044540405273,
"learning_rate": 4.161283540691841e-05,
"loss": 0.0248,
"step": 1880
},
{
"epoch": 0.6141348497156783,
"grad_norm": 0.37473928928375244,
"learning_rate": 4.151223545840225e-05,
"loss": 0.0272,
"step": 1890
},
{
"epoch": 0.6173842404549147,
"grad_norm": 0.4056950807571411,
"learning_rate": 4.141115898032607e-05,
"loss": 0.024,
"step": 1900
},
{
"epoch": 0.620633631194151,
"grad_norm": 0.4770098626613617,
"learning_rate": 4.130960888967405e-05,
"loss": 0.0237,
"step": 1910
},
{
"epoch": 0.6238830219333875,
"grad_norm": 0.41782036423683167,
"learning_rate": 4.1207588117098445e-05,
"loss": 0.0272,
"step": 1920
},
{
"epoch": 0.6271324126726239,
"grad_norm": 0.4040960669517517,
"learning_rate": 4.1105099606835e-05,
"loss": 0.0256,
"step": 1930
},
{
"epoch": 0.6303818034118602,
"grad_norm": 0.3390992283821106,
"learning_rate": 4.1002146316617986e-05,
"loss": 0.0276,
"step": 1940
},
{
"epoch": 0.6336311941510967,
"grad_norm": 0.4464505910873413,
"learning_rate": 4.0898731217594836e-05,
"loss": 0.0265,
"step": 1950
},
{
"epoch": 0.6368805848903331,
"grad_norm": 0.46650540828704834,
"learning_rate": 4.0794857294240415e-05,
"loss": 0.0229,
"step": 1960
},
{
"epoch": 0.6401299756295694,
"grad_norm": 0.39290061593055725,
"learning_rate": 4.0690527544270886e-05,
"loss": 0.0207,
"step": 1970
},
{
"epoch": 0.6433793663688059,
"grad_norm": 0.38553354144096375,
"learning_rate": 4.0585744978557174e-05,
"loss": 0.0267,
"step": 1980
},
{
"epoch": 0.6466287571080422,
"grad_norm": 0.456086128950119,
"learning_rate": 4.048051262103811e-05,
"loss": 0.0264,
"step": 1990
},
{
"epoch": 0.6498781478472786,
"grad_norm": 0.5561078786849976,
"learning_rate": 4.0374833508633156e-05,
"loss": 0.0218,
"step": 2000
},
{
"epoch": 0.6531275385865151,
"grad_norm": 0.3495825231075287,
"learning_rate": 4.0268710691154724e-05,
"loss": 0.0226,
"step": 2010
},
{
"epoch": 0.6563769293257514,
"grad_norm": 0.4553760588169098,
"learning_rate": 4.0162147231220216e-05,
"loss": 0.024,
"step": 2020
},
{
"epoch": 0.6596263200649878,
"grad_norm": 0.40624141693115234,
"learning_rate": 4.0055146204163605e-05,
"loss": 0.022,
"step": 2030
},
{
"epoch": 0.6628757108042242,
"grad_norm": 0.37093663215637207,
"learning_rate": 3.994771069794668e-05,
"loss": 0.0241,
"step": 2040
},
{
"epoch": 0.6661251015434606,
"grad_norm": 0.5362465977668762,
"learning_rate": 3.9839843813069984e-05,
"loss": 0.0246,
"step": 2050
},
{
"epoch": 0.669374492282697,
"grad_norm": 0.4092622995376587,
"learning_rate": 3.9731548662483234e-05,
"loss": 0.0241,
"step": 2060
},
{
"epoch": 0.6726238830219334,
"grad_norm": 0.41743412613868713,
"learning_rate": 3.962282837149558e-05,
"loss": 0.0232,
"step": 2070
},
{
"epoch": 0.6758732737611698,
"grad_norm": 0.5507500171661377,
"learning_rate": 3.951368607768537e-05,
"loss": 0.0223,
"step": 2080
},
{
"epoch": 0.6791226645004061,
"grad_norm": 0.45379722118377686,
"learning_rate": 3.9404124930809625e-05,
"loss": 0.0242,
"step": 2090
},
{
"epoch": 0.6823720552396426,
"grad_norm": 0.36823663115501404,
"learning_rate": 3.929414809271308e-05,
"loss": 0.0265,
"step": 2100
},
{
"epoch": 0.685621445978879,
"grad_norm": 0.3751804530620575,
"learning_rate": 3.918375873723701e-05,
"loss": 0.0245,
"step": 2110
},
{
"epoch": 0.6888708367181153,
"grad_norm": 0.4553548991680145,
"learning_rate": 3.907296005012758e-05,
"loss": 0.0257,
"step": 2120
},
{
"epoch": 0.6921202274573518,
"grad_norm": 0.3335023820400238,
"learning_rate": 3.896175522894395e-05,
"loss": 0.0226,
"step": 2130
},
{
"epoch": 0.6953696181965882,
"grad_norm": 0.4702470600605011,
"learning_rate": 3.8850147482965973e-05,
"loss": 0.0218,
"step": 2140
},
{
"epoch": 0.6986190089358245,
"grad_norm": 0.39953991770744324,
"learning_rate": 3.873814003310158e-05,
"loss": 0.0194,
"step": 2150
},
{
"epoch": 0.7018683996750609,
"grad_norm": 0.5067981481552124,
"learning_rate": 3.862573611179381e-05,
"loss": 0.022,
"step": 2160
},
{
"epoch": 0.7051177904142973,
"grad_norm": 0.30548062920570374,
"learning_rate": 3.851293896292756e-05,
"loss": 0.0171,
"step": 2170
},
{
"epoch": 0.7083671811535337,
"grad_norm": 0.39521142840385437,
"learning_rate": 3.839975184173596e-05,
"loss": 0.0192,
"step": 2180
},
{
"epoch": 0.7116165718927701,
"grad_norm": 0.36969834566116333,
"learning_rate": 3.8286178014706395e-05,
"loss": 0.0245,
"step": 2190
},
{
"epoch": 0.7148659626320065,
"grad_norm": 0.4855635166168213,
"learning_rate": 3.8172220759486287e-05,
"loss": 0.0229,
"step": 2200
},
{
"epoch": 0.7181153533712429,
"grad_norm": 0.45929041504859924,
"learning_rate": 3.8057883364788475e-05,
"loss": 0.0186,
"step": 2210
},
{
"epoch": 0.7213647441104792,
"grad_norm": 0.2941083610057831,
"learning_rate": 3.7943169130296295e-05,
"loss": 0.0188,
"step": 2220
},
{
"epoch": 0.7246141348497157,
"grad_norm": 0.3690025806427002,
"learning_rate": 3.782808136656839e-05,
"loss": 0.0188,
"step": 2230
},
{
"epoch": 0.7278635255889521,
"grad_norm": 0.38714373111724854,
"learning_rate": 3.771262339494314e-05,
"loss": 0.0191,
"step": 2240
},
{
"epoch": 0.7311129163281884,
"grad_norm": 0.40680810809135437,
"learning_rate": 3.759679854744282e-05,
"loss": 0.0197,
"step": 2250
},
{
"epoch": 0.7343623070674249,
"grad_norm": 0.41902831196784973,
"learning_rate": 3.748061016667745e-05,
"loss": 0.0205,
"step": 2260
},
{
"epoch": 0.7376116978066612,
"grad_norm": 0.4369294047355652,
"learning_rate": 3.736406160574833e-05,
"loss": 0.019,
"step": 2270
},
{
"epoch": 0.7408610885458976,
"grad_norm": 0.3856930732727051,
"learning_rate": 3.724715622815122e-05,
"loss": 0.022,
"step": 2280
},
{
"epoch": 0.7441104792851341,
"grad_norm": 0.34679755568504333,
"learning_rate": 3.712989740767938e-05,
"loss": 0.0164,
"step": 2290
},
{
"epoch": 0.7473598700243704,
"grad_norm": 0.3927323818206787,
"learning_rate": 3.7012288528326086e-05,
"loss": 0.0181,
"step": 2300
},
{
"epoch": 0.7506092607636068,
"grad_norm": 0.4021192491054535,
"learning_rate": 3.689433298418706e-05,
"loss": 0.0159,
"step": 2310
},
{
"epoch": 0.7538586515028433,
"grad_norm": 0.48003751039505005,
"learning_rate": 3.6776034179362474e-05,
"loss": 0.0177,
"step": 2320
},
{
"epoch": 0.7571080422420796,
"grad_norm": 0.3487580716609955,
"learning_rate": 3.66573955278587e-05,
"loss": 0.0162,
"step": 2330
},
{
"epoch": 0.760357432981316,
"grad_norm": 0.47422289848327637,
"learning_rate": 3.653842045348985e-05,
"loss": 0.018,
"step": 2340
},
{
"epoch": 0.7636068237205524,
"grad_norm": 0.38853368163108826,
"learning_rate": 3.64191123897789e-05,
"loss": 0.0229,
"step": 2350
},
{
"epoch": 0.7668562144597888,
"grad_norm": 0.36860230565071106,
"learning_rate": 3.62994747798586e-05,
"loss": 0.018,
"step": 2360
},
{
"epoch": 0.7701056051990252,
"grad_norm": 0.4562481939792633,
"learning_rate": 3.617951107637219e-05,
"loss": 0.0191,
"step": 2370
},
{
"epoch": 0.7733549959382616,
"grad_norm": 0.708402156829834,
"learning_rate": 3.605922474137366e-05,
"loss": 0.019,
"step": 2380
},
{
"epoch": 0.776604386677498,
"grad_norm": 0.48525258898735046,
"learning_rate": 3.5938619246227884e-05,
"loss": 0.0217,
"step": 2390
},
{
"epoch": 0.7798537774167343,
"grad_norm": 0.3320712447166443,
"learning_rate": 3.581769807151044e-05,
"loss": 0.0195,
"step": 2400
},
{
"epoch": 0.7831031681559708,
"grad_norm": 0.36696651577949524,
"learning_rate": 3.56964647069072e-05,
"loss": 0.0192,
"step": 2410
},
{
"epoch": 0.7863525588952072,
"grad_norm": 1.0005451440811157,
"learning_rate": 3.55749226511135e-05,
"loss": 0.0196,
"step": 2420
},
{
"epoch": 0.7896019496344435,
"grad_norm": 0.45593878626823425,
"learning_rate": 3.54530754117333e-05,
"loss": 0.0225,
"step": 2430
},
{
"epoch": 0.7928513403736799,
"grad_norm": 0.33774876594543457,
"learning_rate": 3.533092650517793e-05,
"loss": 0.0199,
"step": 2440
},
{
"epoch": 0.7961007311129163,
"grad_norm": 0.483853816986084,
"learning_rate": 3.5208479456564524e-05,
"loss": 0.0206,
"step": 2450
},
{
"epoch": 0.7993501218521527,
"grad_norm": 0.36340585350990295,
"learning_rate": 3.508573779961441e-05,
"loss": 0.0172,
"step": 2460
},
{
"epoch": 0.8025995125913891,
"grad_norm": 0.515352725982666,
"learning_rate": 3.4962705076551026e-05,
"loss": 0.0151,
"step": 2470
},
{
"epoch": 0.8058489033306255,
"grad_norm": 0.3187580406665802,
"learning_rate": 3.483938483799778e-05,
"loss": 0.0148,
"step": 2480
},
{
"epoch": 0.8090982940698619,
"grad_norm": 0.4531770646572113,
"learning_rate": 3.47157806428755e-05,
"loss": 0.0158,
"step": 2490
},
{
"epoch": 0.8123476848090982,
"grad_norm": 0.6547293066978455,
"learning_rate": 3.45918960582998e-05,
"loss": 0.0144,
"step": 2500
},
{
"epoch": 0.8155970755483347,
"grad_norm": 0.24430936574935913,
"learning_rate": 3.446773465947809e-05,
"loss": 0.0148,
"step": 2510
},
{
"epoch": 0.8188464662875711,
"grad_norm": 0.425484299659729,
"learning_rate": 3.4343300029606404e-05,
"loss": 0.0192,
"step": 2520
},
{
"epoch": 0.8220958570268074,
"grad_norm": 0.43435633182525635,
"learning_rate": 3.4218595759766013e-05,
"loss": 0.0192,
"step": 2530
},
{
"epoch": 0.8253452477660439,
"grad_norm": 0.499104380607605,
"learning_rate": 3.409362544881977e-05,
"loss": 0.0172,
"step": 2540
},
{
"epoch": 0.8285946385052803,
"grad_norm": 0.5690107941627502,
"learning_rate": 3.3968392703308264e-05,
"loss": 0.017,
"step": 2550
},
{
"epoch": 0.8318440292445166,
"grad_norm": 0.4789363145828247,
"learning_rate": 3.3842901137345725e-05,
"loss": 0.0151,
"step": 2560
},
{
"epoch": 0.8350934199837531,
"grad_norm": 0.34053072333335876,
"learning_rate": 3.3717154372515716e-05,
"loss": 0.0155,
"step": 2570
},
{
"epoch": 0.8383428107229894,
"grad_norm": 0.44401687383651733,
"learning_rate": 3.3591156037766655e-05,
"loss": 0.0138,
"step": 2580
},
{
"epoch": 0.8415922014622258,
"grad_norm": 0.41682168841362,
"learning_rate": 3.346490976930704e-05,
"loss": 0.014,
"step": 2590
},
{
"epoch": 0.8448415922014623,
"grad_norm": 0.6371095776557922,
"learning_rate": 3.333841921050053e-05,
"loss": 0.0176,
"step": 2600
},
{
"epoch": 0.8480909829406986,
"grad_norm": 0.22434203326702118,
"learning_rate": 3.3211688011760835e-05,
"loss": 0.0123,
"step": 2610
},
{
"epoch": 0.851340373679935,
"grad_norm": 0.5385378003120422,
"learning_rate": 3.30847198304463e-05,
"loss": 0.016,
"step": 2620
},
{
"epoch": 0.8545897644191714,
"grad_norm": 0.35033077001571655,
"learning_rate": 3.2957518330754406e-05,
"loss": 0.0149,
"step": 2630
},
{
"epoch": 0.8578391551584078,
"grad_norm": 0.3793995976448059,
"learning_rate": 3.2830087183616015e-05,
"loss": 0.0153,
"step": 2640
},
{
"epoch": 0.8610885458976442,
"grad_norm": 0.6376614570617676,
"learning_rate": 3.270243006658942e-05,
"loss": 0.0154,
"step": 2650
},
{
"epoch": 0.8643379366368806,
"grad_norm": 0.3490144908428192,
"learning_rate": 3.257455066375423e-05,
"loss": 0.0154,
"step": 2660
},
{
"epoch": 0.867587327376117,
"grad_norm": 0.40602752566337585,
"learning_rate": 3.244645266560501e-05,
"loss": 0.0136,
"step": 2670
},
{
"epoch": 0.8708367181153533,
"grad_norm": 0.37300461530685425,
"learning_rate": 3.2318139768944856e-05,
"loss": 0.0127,
"step": 2680
},
{
"epoch": 0.8740861088545898,
"grad_norm": 0.3366054594516754,
"learning_rate": 3.218961567677861e-05,
"loss": 0.0142,
"step": 2690
},
{
"epoch": 0.8773354995938262,
"grad_norm": 0.4088799059391022,
"learning_rate": 3.206088409820606e-05,
"loss": 0.0143,
"step": 2700
},
{
"epoch": 0.8805848903330625,
"grad_norm": 0.3606589734554291,
"learning_rate": 3.19319487483149e-05,
"loss": 0.0125,
"step": 2710
},
{
"epoch": 0.8838342810722989,
"grad_norm": 0.48760858178138733,
"learning_rate": 3.180281334807348e-05,
"loss": 0.0121,
"step": 2720
},
{
"epoch": 0.8870836718115354,
"grad_norm": 0.4494096040725708,
"learning_rate": 3.1673481624223426e-05,
"loss": 0.0123,
"step": 2730
},
{
"epoch": 0.8903330625507717,
"grad_norm": 0.4649289846420288,
"learning_rate": 3.154395730917213e-05,
"loss": 0.0135,
"step": 2740
},
{
"epoch": 0.8935824532900081,
"grad_norm": 0.3580702841281891,
"learning_rate": 3.141424414088499e-05,
"loss": 0.014,
"step": 2750
},
{
"epoch": 0.8968318440292445,
"grad_norm": 0.2921467423439026,
"learning_rate": 3.128434586277757e-05,
"loss": 0.0146,
"step": 2760
},
{
"epoch": 0.9000812347684809,
"grad_norm": 0.5378055572509766,
"learning_rate": 3.115426622360752e-05,
"loss": 0.0131,
"step": 2770
},
{
"epoch": 0.9033306255077173,
"grad_norm": 0.34166646003723145,
"learning_rate": 3.102400897736645e-05,
"loss": 0.0123,
"step": 2780
},
{
"epoch": 0.9065800162469537,
"grad_norm": 0.43183135986328125,
"learning_rate": 3.0893577883171556e-05,
"loss": 0.0151,
"step": 2790
},
{
"epoch": 0.9098294069861901,
"grad_norm": 0.6324542760848999,
"learning_rate": 3.076297670515713e-05,
"loss": 0.0128,
"step": 2800
},
{
"epoch": 0.9130787977254264,
"grad_norm": 0.43282851576805115,
"learning_rate": 3.063220921236598e-05,
"loss": 0.0129,
"step": 2810
},
{
"epoch": 0.9163281884646629,
"grad_norm": 0.2942393124103546,
"learning_rate": 3.0501279178640575e-05,
"loss": 0.0131,
"step": 2820
},
{
"epoch": 0.9195775792038993,
"grad_norm": 0.32284924387931824,
"learning_rate": 3.0370190382514213e-05,
"loss": 0.0103,
"step": 2830
},
{
"epoch": 0.9228269699431356,
"grad_norm": 0.38951289653778076,
"learning_rate": 3.0238946607101936e-05,
"loss": 0.0105,
"step": 2840
},
{
"epoch": 0.9260763606823721,
"grad_norm": 0.407099187374115,
"learning_rate": 3.0107551639991365e-05,
"loss": 0.0109,
"step": 2850
},
{
"epoch": 0.9293257514216084,
"grad_norm": 0.5025432705879211,
"learning_rate": 2.997600927313338e-05,
"loss": 0.0115,
"step": 2860
},
{
"epoch": 0.9325751421608448,
"grad_norm": 0.18864522874355316,
"learning_rate": 2.98443233027327e-05,
"loss": 0.011,
"step": 2870
},
{
"epoch": 0.9358245329000813,
"grad_norm": 0.3602016866207123,
"learning_rate": 2.971249752913834e-05,
"loss": 0.012,
"step": 2880
},
{
"epoch": 0.9390739236393176,
"grad_norm": 0.38901951909065247,
"learning_rate": 2.958053575673389e-05,
"loss": 0.0113,
"step": 2890
},
{
"epoch": 0.942323314378554,
"grad_norm": 0.3340912163257599,
"learning_rate": 2.944844179382778e-05,
"loss": 0.0102,
"step": 2900
},
{
"epoch": 0.9455727051177905,
"grad_norm": 0.32333633303642273,
"learning_rate": 2.931621945254334e-05,
"loss": 0.0117,
"step": 2910
},
{
"epoch": 0.9488220958570268,
"grad_norm": 0.45609724521636963,
"learning_rate": 2.918387254870879e-05,
"loss": 0.0114,
"step": 2920
},
{
"epoch": 0.9520714865962632,
"grad_norm": 0.3951948285102844,
"learning_rate": 2.905140490174713e-05,
"loss": 0.0099,
"step": 2930
},
{
"epoch": 0.9553208773354996,
"grad_norm": 0.4054865539073944,
"learning_rate": 2.8918820334565905e-05,
"loss": 0.0118,
"step": 2940
},
{
"epoch": 0.958570268074736,
"grad_norm": 0.40964823961257935,
"learning_rate": 2.8786122673446893e-05,
"loss": 0.0113,
"step": 2950
},
{
"epoch": 0.9618196588139724,
"grad_norm": 0.37629735469818115,
"learning_rate": 2.865331574793564e-05,
"loss": 0.0112,
"step": 2960
},
{
"epoch": 0.9650690495532088,
"grad_norm": 0.3705214262008667,
"learning_rate": 2.8520403390731e-05,
"loss": 0.0117,
"step": 2970
},
{
"epoch": 0.9683184402924452,
"grad_norm": 0.3900469243526459,
"learning_rate": 2.8387389437574495e-05,
"loss": 0.0108,
"step": 2980
},
{
"epoch": 0.9715678310316815,
"grad_norm": 0.2905224561691284,
"learning_rate": 2.8254277727139616e-05,
"loss": 0.0112,
"step": 2990
},
{
"epoch": 0.974817221770918,
"grad_norm": 0.6466222405433655,
"learning_rate": 2.812107210092105e-05,
"loss": 0.0124,
"step": 3000
},
{
"epoch": 0.9780666125101544,
"grad_norm": 0.5542816519737244,
"learning_rate": 2.798777640312381e-05,
"loss": 0.0112,
"step": 3010
},
{
"epoch": 0.9813160032493907,
"grad_norm": 0.3827550411224365,
"learning_rate": 2.7854394480552327e-05,
"loss": 0.0112,
"step": 3020
},
{
"epoch": 0.9845653939886271,
"grad_norm": 0.32786279916763306,
"learning_rate": 2.7720930182499367e-05,
"loss": 0.0115,
"step": 3030
},
{
"epoch": 0.9878147847278635,
"grad_norm": 0.537501335144043,
"learning_rate": 2.7587387360635032e-05,
"loss": 0.0113,
"step": 3040
},
{
"epoch": 0.9910641754670999,
"grad_norm": 0.47765976190567017,
"learning_rate": 2.7453769868895518e-05,
"loss": 0.0141,
"step": 3050
},
{
"epoch": 0.993663688058489,
"eval_loss": 0.1697877049446106,
"eval_runtime": 733.8424,
"eval_samples_per_second": 3.816,
"eval_steps_per_second": 3.816,
"step": 3058
},
{
"epoch": 0.9943135662063363,
"grad_norm": 0.6474146246910095,
"learning_rate": 2.7320081563371948e-05,
"loss": 0.0085,
"step": 3060
},
{
"epoch": 0.9975629569455727,
"grad_norm": 0.4375881850719452,
"learning_rate": 2.718632630219907e-05,
"loss": 0.0114,
"step": 3070
},
{
"epoch": 1.0006498781478472,
"grad_norm": 0.327910840511322,
"learning_rate": 2.7052507945443927e-05,
"loss": 0.009,
"step": 3080
},
{
"epoch": 1.0038992688870836,
"grad_norm": 0.4386765658855438,
"learning_rate": 2.6918630354994434e-05,
"loss": 0.0081,
"step": 3090
},
{
"epoch": 1.0071486596263202,
"grad_norm": 0.3415273129940033,
"learning_rate": 2.6784697394447942e-05,
"loss": 0.007,
"step": 3100
},
{
"epoch": 1.0103980503655565,
"grad_norm": 0.31640973687171936,
"learning_rate": 2.6650712928999755e-05,
"loss": 0.009,
"step": 3110
},
{
"epoch": 1.0136474411047929,
"grad_norm": 0.4809754490852356,
"learning_rate": 2.6516680825331548e-05,
"loss": 0.0065,
"step": 3120
},
{
"epoch": 1.0168968318440292,
"grad_norm": 0.48411983251571655,
"learning_rate": 2.6382604951499802e-05,
"loss": 0.0064,
"step": 3130
},
{
"epoch": 1.0201462225832656,
"grad_norm": 0.5264196395874023,
"learning_rate": 2.624848917682417e-05,
"loss": 0.0076,
"step": 3140
},
{
"epoch": 1.023395613322502,
"grad_norm": 0.5818049311637878,
"learning_rate": 2.6114337371775815e-05,
"loss": 0.0069,
"step": 3150
},
{
"epoch": 1.0266450040617385,
"grad_norm": 0.20237238705158234,
"learning_rate": 2.5980153407865694e-05,
"loss": 0.0059,
"step": 3160
},
{
"epoch": 1.0298943948009749,
"grad_norm": 0.4652661681175232,
"learning_rate": 2.5845941157532856e-05,
"loss": 0.0058,
"step": 3170
},
{
"epoch": 1.0331437855402112,
"grad_norm": 0.42343178391456604,
"learning_rate": 2.5711704494032662e-05,
"loss": 0.0072,
"step": 3180
},
{
"epoch": 1.0363931762794476,
"grad_norm": 0.31664910912513733,
"learning_rate": 2.557744729132503e-05,
"loss": 0.0062,
"step": 3190
},
{
"epoch": 1.039642567018684,
"grad_norm": 0.4600023627281189,
"learning_rate": 2.5443173423962606e-05,
"loss": 0.0065,
"step": 3200
},
{
"epoch": 1.0428919577579203,
"grad_norm": 0.21867072582244873,
"learning_rate": 2.5308886766978985e-05,
"loss": 0.0075,
"step": 3210
},
{
"epoch": 1.0461413484971567,
"grad_norm": 0.2808971405029297,
"learning_rate": 2.517459119577685e-05,
"loss": 0.0064,
"step": 3220
},
{
"epoch": 1.0493907392363933,
"grad_norm": 0.3623920679092407,
"learning_rate": 2.504029058601612e-05,
"loss": 0.0057,
"step": 3230
},
{
"epoch": 1.0526401299756296,
"grad_norm": 0.7856932282447815,
"learning_rate": 2.490598881350215e-05,
"loss": 0.0069,
"step": 3240
},
{
"epoch": 1.055889520714866,
"grad_norm": 0.3538922965526581,
"learning_rate": 2.4771689754073858e-05,
"loss": 0.0067,
"step": 3250
},
{
"epoch": 1.0591389114541023,
"grad_norm": 0.40585437417030334,
"learning_rate": 2.4637397283491828e-05,
"loss": 0.0068,
"step": 3260
},
{
"epoch": 1.0623883021933387,
"grad_norm": 0.3392549753189087,
"learning_rate": 2.450311527732653e-05,
"loss": 0.0075,
"step": 3270
},
{
"epoch": 1.065637692932575,
"grad_norm": 0.4685852527618408,
"learning_rate": 2.436884761084642e-05,
"loss": 0.0076,
"step": 3280
},
{
"epoch": 1.0688870836718116,
"grad_norm": 0.2836906611919403,
"learning_rate": 2.423459815890614e-05,
"loss": 0.0063,
"step": 3290
},
{
"epoch": 1.072136474411048,
"grad_norm": 0.3446894586086273,
"learning_rate": 2.4100370795834652e-05,
"loss": 0.0056,
"step": 3300
},
{
"epoch": 1.0753858651502843,
"grad_norm": 0.6391892433166504,
"learning_rate": 2.3966169395323466e-05,
"loss": 0.007,
"step": 3310
},
{
"epoch": 1.0786352558895207,
"grad_norm": 0.4981069564819336,
"learning_rate": 2.383199783031484e-05,
"loss": 0.0069,
"step": 3320
},
{
"epoch": 1.081884646628757,
"grad_norm": 0.3438924252986908,
"learning_rate": 2.369785997288998e-05,
"loss": 0.0062,
"step": 3330
},
{
"epoch": 1.0851340373679934,
"grad_norm": 0.6114248633384705,
"learning_rate": 2.356375969415735e-05,
"loss": 0.006,
"step": 3340
},
{
"epoch": 1.08838342810723,
"grad_norm": 0.658473551273346,
"learning_rate": 2.3429700864140892e-05,
"loss": 0.0057,
"step": 3350
},
{
"epoch": 1.0916328188464663,
"grad_norm": 0.2530268430709839,
"learning_rate": 2.3295687351668407e-05,
"loss": 0.0057,
"step": 3360
},
{
"epoch": 1.0948822095857027,
"grad_norm": 0.33400505781173706,
"learning_rate": 2.3161723024259832e-05,
"loss": 0.0058,
"step": 3370
},
{
"epoch": 1.098131600324939,
"grad_norm": 0.47318515181541443,
"learning_rate": 2.302781174801569e-05,
"loss": 0.0058,
"step": 3380
},
{
"epoch": 1.1013809910641754,
"grad_norm": 0.18566595017910004,
"learning_rate": 2.2893957387505488e-05,
"loss": 0.0048,
"step": 3390
},
{
"epoch": 1.1046303818034118,
"grad_norm": 0.43583136796951294,
"learning_rate": 2.2760163805656172e-05,
"loss": 0.0062,
"step": 3400
},
{
"epoch": 1.1078797725426484,
"grad_norm": 0.09334202855825424,
"learning_rate": 2.262643486364069e-05,
"loss": 0.0054,
"step": 3410
},
{
"epoch": 1.1111291632818847,
"grad_norm": 0.4679158627986908,
"learning_rate": 2.2492774420766518e-05,
"loss": 0.0058,
"step": 3420
},
{
"epoch": 1.114378554021121,
"grad_norm": 0.2904549539089203,
"learning_rate": 2.2359186334364314e-05,
"loss": 0.0062,
"step": 3430
},
{
"epoch": 1.1176279447603574,
"grad_norm": 0.2544921338558197,
"learning_rate": 2.22256744596766e-05,
"loss": 0.0054,
"step": 3440
},
{
"epoch": 1.1208773354995938,
"grad_norm": 0.3968259394168854,
"learning_rate": 2.2092242649746468e-05,
"loss": 0.007,
"step": 3450
},
{
"epoch": 1.1241267262388301,
"grad_norm": 0.5143113732337952,
"learning_rate": 2.195889475530641e-05,
"loss": 0.0063,
"step": 3460
},
{
"epoch": 1.1273761169780667,
"grad_norm": 0.39842620491981506,
"learning_rate": 2.1825634624667188e-05,
"loss": 0.0043,
"step": 3470
},
{
"epoch": 1.130625507717303,
"grad_norm": 0.7474611401557922,
"learning_rate": 2.169246610360679e-05,
"loss": 0.0044,
"step": 3480
},
{
"epoch": 1.1338748984565394,
"grad_norm": 0.43382173776626587,
"learning_rate": 2.15593930352594e-05,
"loss": 0.0068,
"step": 3490
},
{
"epoch": 1.1371242891957758,
"grad_norm": 0.43323034048080444,
"learning_rate": 2.1426419260004533e-05,
"loss": 0.0066,
"step": 3500
},
{
"epoch": 1.1403736799350122,
"grad_norm": 0.5351451635360718,
"learning_rate": 2.1293548615356175e-05,
"loss": 0.0059,
"step": 3510
},
{
"epoch": 1.1436230706742485,
"grad_norm": 0.3306446373462677,
"learning_rate": 2.1160784935852065e-05,
"loss": 0.0063,
"step": 3520
},
{
"epoch": 1.1468724614134849,
"grad_norm": 0.8898324370384216,
"learning_rate": 2.1028132052942995e-05,
"loss": 0.0059,
"step": 3530
},
{
"epoch": 1.1501218521527214,
"grad_norm": 0.3271152973175049,
"learning_rate": 2.0895593794882268e-05,
"loss": 0.0055,
"step": 3540
},
{
"epoch": 1.1533712428919578,
"grad_norm": 0.22426745295524597,
"learning_rate": 2.0763173986615216e-05,
"loss": 0.005,
"step": 3550
},
{
"epoch": 1.1566206336311942,
"grad_norm": 0.42375096678733826,
"learning_rate": 2.063087644966879e-05,
"loss": 0.0049,
"step": 3560
},
{
"epoch": 1.1598700243704305,
"grad_norm": 0.4661770761013031,
"learning_rate": 2.04987050020413e-05,
"loss": 0.0048,
"step": 3570
},
{
"epoch": 1.1631194151096669,
"grad_norm": 0.4002689719200134,
"learning_rate": 2.0366663458092224e-05,
"loss": 0.0045,
"step": 3580
},
{
"epoch": 1.1663688058489032,
"grad_norm": 0.5181555151939392,
"learning_rate": 2.0234755628432133e-05,
"loss": 0.0043,
"step": 3590
},
{
"epoch": 1.1696181965881398,
"grad_norm": 0.4838791787624359,
"learning_rate": 2.0102985319812688e-05,
"loss": 0.0059,
"step": 3600
},
{
"epoch": 1.1728675873273762,
"grad_norm": 0.5321421027183533,
"learning_rate": 1.9971356335016834e-05,
"loss": 0.0062,
"step": 3610
},
{
"epoch": 1.1761169780666125,
"grad_norm": 0.17948143184185028,
"learning_rate": 1.9839872472749013e-05,
"loss": 0.0046,
"step": 3620
},
{
"epoch": 1.1793663688058489,
"grad_norm": 0.5810254216194153,
"learning_rate": 1.9708537527525544e-05,
"loss": 0.0051,
"step": 3630
},
{
"epoch": 1.1826157595450852,
"grad_norm": 0.32746565341949463,
"learning_rate": 1.957735528956514e-05,
"loss": 0.0061,
"step": 3640
},
{
"epoch": 1.1858651502843216,
"grad_norm": 0.359332412481308,
"learning_rate": 1.9446329544679488e-05,
"loss": 0.0058,
"step": 3650
},
{
"epoch": 1.189114541023558,
"grad_norm": 0.4285711944103241,
"learning_rate": 1.9315464074164036e-05,
"loss": 0.004,
"step": 3660
},
{
"epoch": 1.1923639317627945,
"grad_norm": 0.42309272289276123,
"learning_rate": 1.918476265468882e-05,
"loss": 0.0049,
"step": 3670
},
{
"epoch": 1.195613322502031,
"grad_norm": 0.495802640914917,
"learning_rate": 1.9054229058189514e-05,
"loss": 0.0046,
"step": 3680
},
{
"epoch": 1.1988627132412673,
"grad_norm": 0.39304494857788086,
"learning_rate": 1.892386705175856e-05,
"loss": 0.0043,
"step": 3690
},
{
"epoch": 1.2021121039805036,
"grad_norm": 0.30474936962127686,
"learning_rate": 1.879368039753644e-05,
"loss": 0.0042,
"step": 3700
},
{
"epoch": 1.20536149471974,
"grad_norm": 0.19054022431373596,
"learning_rate": 1.866367285260312e-05,
"loss": 0.0045,
"step": 3710
},
{
"epoch": 1.2086108854589765,
"grad_norm": 0.38206177949905396,
"learning_rate": 1.853384816886962e-05,
"loss": 0.0056,
"step": 3720
},
{
"epoch": 1.211860276198213,
"grad_norm": 0.34758618474006653,
"learning_rate": 1.840421009296975e-05,
"loss": 0.0034,
"step": 3730
},
{
"epoch": 1.2151096669374493,
"grad_norm": 0.3336513638496399,
"learning_rate": 1.827476236615194e-05,
"loss": 0.0051,
"step": 3740
},
{
"epoch": 1.2183590576766856,
"grad_norm": 0.6265475153923035,
"learning_rate": 1.8145508724171316e-05,
"loss": 0.0032,
"step": 3750
},
{
"epoch": 1.221608448415922,
"grad_norm": 0.4233214259147644,
"learning_rate": 1.80164528971819e-05,
"loss": 0.0063,
"step": 3760
},
{
"epoch": 1.2248578391551583,
"grad_norm": 0.2057565301656723,
"learning_rate": 1.7887598609628897e-05,
"loss": 0.0037,
"step": 3770
},
{
"epoch": 1.2281072298943947,
"grad_norm": 0.2737014889717102,
"learning_rate": 1.7758949580141276e-05,
"loss": 0.0047,
"step": 3780
},
{
"epoch": 1.2313566206336313,
"grad_norm": 0.38340964913368225,
"learning_rate": 1.7630509521424407e-05,
"loss": 0.0049,
"step": 3790
},
{
"epoch": 1.2346060113728676,
"grad_norm": 0.3404456377029419,
"learning_rate": 1.750228214015295e-05,
"loss": 0.0034,
"step": 3800
},
{
"epoch": 1.237855402112104,
"grad_norm": 0.40690556168556213,
"learning_rate": 1.7374271136863863e-05,
"loss": 0.0042,
"step": 3810
},
{
"epoch": 1.2411047928513403,
"grad_norm": 0.22202616930007935,
"learning_rate": 1.7246480205849613e-05,
"loss": 0.0046,
"step": 3820
},
{
"epoch": 1.2443541835905767,
"grad_norm": 0.37885475158691406,
"learning_rate": 1.7118913035051564e-05,
"loss": 0.004,
"step": 3830
},
{
"epoch": 1.2476035743298133,
"grad_norm": 0.16762322187423706,
"learning_rate": 1.6991573305953533e-05,
"loss": 0.0034,
"step": 3840
},
{
"epoch": 1.2508529650690496,
"grad_norm": 0.5190407633781433,
"learning_rate": 1.686446469347558e-05,
"loss": 0.0042,
"step": 3850
},
{
"epoch": 1.254102355808286,
"grad_norm": 0.290955126285553,
"learning_rate": 1.6737590865867907e-05,
"loss": 0.0056,
"step": 3860
},
{
"epoch": 1.2573517465475224,
"grad_norm": 0.240234836935997,
"learning_rate": 1.6610955484605023e-05,
"loss": 0.0034,
"step": 3870
},
{
"epoch": 1.2606011372867587,
"grad_norm": 0.29844263195991516,
"learning_rate": 1.6484562204280075e-05,
"loss": 0.0038,
"step": 3880
},
{
"epoch": 1.263850528025995,
"grad_norm": 0.3929229974746704,
"learning_rate": 1.6358414672499377e-05,
"loss": 0.0051,
"step": 3890
},
{
"epoch": 1.2670999187652314,
"grad_norm": 0.3525027632713318,
"learning_rate": 1.623251652977713e-05,
"loss": 0.0037,
"step": 3900
},
{
"epoch": 1.2703493095044678,
"grad_norm": 0.20320917665958405,
"learning_rate": 1.6106871409430387e-05,
"loss": 0.0044,
"step": 3910
},
{
"epoch": 1.2735987002437044,
"grad_norm": 0.1730404794216156,
"learning_rate": 1.5981482937474172e-05,
"loss": 0.0039,
"step": 3920
},
{
"epoch": 1.2768480909829407,
"grad_norm": 0.3508148789405823,
"learning_rate": 1.5856354732516865e-05,
"loss": 0.0041,
"step": 3930
},
{
"epoch": 1.280097481722177,
"grad_norm": 0.35546788573265076,
"learning_rate": 1.573149040565572e-05,
"loss": 0.0043,
"step": 3940
},
{
"epoch": 1.2833468724614134,
"grad_norm": 0.2085341513156891,
"learning_rate": 1.5606893560372714e-05,
"loss": 0.0043,
"step": 3950
},
{
"epoch": 1.28659626320065,
"grad_norm": 0.18922321498394012,
"learning_rate": 1.548256779243052e-05,
"loss": 0.0037,
"step": 3960
},
{
"epoch": 1.2898456539398864,
"grad_norm": 0.3243728280067444,
"learning_rate": 1.5358516689768734e-05,
"loss": 0.0044,
"step": 3970
},
{
"epoch": 1.2930950446791227,
"grad_norm": 0.37964773178100586,
"learning_rate": 1.5234743832400344e-05,
"loss": 0.0031,
"step": 3980
},
{
"epoch": 1.296344435418359,
"grad_norm": 0.3203120529651642,
"learning_rate": 1.5111252792308406e-05,
"loss": 0.0031,
"step": 3990
},
{
"epoch": 1.2995938261575954,
"grad_norm": 0.1457476019859314,
"learning_rate": 1.4988047133342964e-05,
"loss": 0.0042,
"step": 4000
},
{
"epoch": 1.3028432168968318,
"grad_norm": 0.14640583097934723,
"learning_rate": 1.486513041111819e-05,
"loss": 0.003,
"step": 4010
},
{
"epoch": 1.3060926076360682,
"grad_norm": 0.5449049472808838,
"learning_rate": 1.4742506172909775e-05,
"loss": 0.0033,
"step": 4020
},
{
"epoch": 1.3093419983753045,
"grad_norm": 0.40858981013298035,
"learning_rate": 1.4620177957552578e-05,
"loss": 0.0038,
"step": 4030
},
{
"epoch": 1.312591389114541,
"grad_norm": 0.5728728175163269,
"learning_rate": 1.4498149295338464e-05,
"loss": 0.0031,
"step": 4040
},
{
"epoch": 1.3158407798537775,
"grad_norm": 0.24417610466480255,
"learning_rate": 1.437642370791446e-05,
"loss": 0.0034,
"step": 4050
},
{
"epoch": 1.3190901705930138,
"grad_norm": 0.317682683467865,
"learning_rate": 1.4255004708181075e-05,
"loss": 0.0028,
"step": 4060
},
{
"epoch": 1.3223395613322502,
"grad_norm": 0.6588785648345947,
"learning_rate": 1.4133895800190983e-05,
"loss": 0.004,
"step": 4070
},
{
"epoch": 1.3255889520714865,
"grad_norm": 0.44252049922943115,
"learning_rate": 1.4013100479047825e-05,
"loss": 0.0034,
"step": 4080
},
{
"epoch": 1.328838342810723,
"grad_norm": 0.4248214066028595,
"learning_rate": 1.3892622230805436e-05,
"loss": 0.005,
"step": 4090
},
{
"epoch": 1.3320877335499595,
"grad_norm": 0.31828951835632324,
"learning_rate": 1.3772464532367125e-05,
"loss": 0.0026,
"step": 4100
},
{
"epoch": 1.3353371242891958,
"grad_norm": 0.25999122858047485,
"learning_rate": 1.3652630851385454e-05,
"loss": 0.0035,
"step": 4110
},
{
"epoch": 1.3385865150284322,
"grad_norm": 0.15871325135231018,
"learning_rate": 1.353312464616207e-05,
"loss": 0.0035,
"step": 4120
},
{
"epoch": 1.3418359057676685,
"grad_norm": 0.44427499175071716,
"learning_rate": 1.341394936554794e-05,
"loss": 0.0028,
"step": 4130
},
{
"epoch": 1.345085296506905,
"grad_norm": 0.306194543838501,
"learning_rate": 1.329510844884385e-05,
"loss": 0.0026,
"step": 4140
},
{
"epoch": 1.3483346872461413,
"grad_norm": 0.19543297588825226,
"learning_rate": 1.3176605325701086e-05,
"loss": 0.0028,
"step": 4150
},
{
"epoch": 1.3515840779853776,
"grad_norm": 0.21941304206848145,
"learning_rate": 1.305844341602249e-05,
"loss": 0.0018,
"step": 4160
},
{
"epoch": 1.3548334687246142,
"grad_norm": 0.36652112007141113,
"learning_rate": 1.2940626129863792e-05,
"loss": 0.003,
"step": 4170
},
{
"epoch": 1.3580828594638505,
"grad_norm": 0.47468826174736023,
"learning_rate": 1.282315686733514e-05,
"loss": 0.0025,
"step": 4180
},
{
"epoch": 1.361332250203087,
"grad_norm": 0.3629470765590668,
"learning_rate": 1.2706039018503013e-05,
"loss": 0.003,
"step": 4190
},
{
"epoch": 1.3645816409423233,
"grad_norm": 0.3716108500957489,
"learning_rate": 1.2589275963292397e-05,
"loss": 0.0027,
"step": 4200
},
{
"epoch": 1.3678310316815598,
"grad_norm": 0.8420029878616333,
"learning_rate": 1.2472871071389205e-05,
"loss": 0.0027,
"step": 4210
},
{
"epoch": 1.3710804224207962,
"grad_norm": 0.43932396173477173,
"learning_rate": 1.2356827702143048e-05,
"loss": 0.003,
"step": 4220
},
{
"epoch": 1.3743298131600326,
"grad_norm": 0.28611013293266296,
"learning_rate": 1.2241149204470314e-05,
"loss": 0.0034,
"step": 4230
},
{
"epoch": 1.377579203899269,
"grad_norm": 0.30783408880233765,
"learning_rate": 1.2125838916757471e-05,
"loss": 0.0034,
"step": 4240
},
{
"epoch": 1.3808285946385053,
"grad_norm": 0.16370636224746704,
"learning_rate": 1.2010900166764774e-05,
"loss": 0.0028,
"step": 4250
},
{
"epoch": 1.3840779853777416,
"grad_norm": 0.055241186171770096,
"learning_rate": 1.1896336271530187e-05,
"loss": 0.0035,
"step": 4260
},
{
"epoch": 1.387327376116978,
"grad_norm": 0.42322880029678345,
"learning_rate": 1.1782150537273665e-05,
"loss": 0.0029,
"step": 4270
},
{
"epoch": 1.3905767668562143,
"grad_norm": 0.12644466757774353,
"learning_rate": 1.166834625930178e-05,
"loss": 0.0022,
"step": 4280
},
{
"epoch": 1.393826157595451,
"grad_norm": 0.24443909525871277,
"learning_rate": 1.1554926721912562e-05,
"loss": 0.0022,
"step": 4290
},
{
"epoch": 1.3970755483346873,
"grad_norm": 0.12778101861476898,
"learning_rate": 1.144189519830074e-05,
"loss": 0.003,
"step": 4300
},
{
"epoch": 1.4003249390739236,
"grad_norm": 0.15853983163833618,
"learning_rate": 1.1329254950463315e-05,
"loss": 0.0023,
"step": 4310
},
{
"epoch": 1.40357432981316,
"grad_norm": 0.11951529234647751,
"learning_rate": 1.1217009229105357e-05,
"loss": 0.0031,
"step": 4320
},
{
"epoch": 1.4068237205523964,
"grad_norm": 0.21437332034111023,
"learning_rate": 1.1105161273546236e-05,
"loss": 0.0019,
"step": 4330
},
{
"epoch": 1.410073111291633,
"grad_norm": 0.8929743766784668,
"learning_rate": 1.0993714311626146e-05,
"loss": 0.0024,
"step": 4340
},
{
"epoch": 1.4133225020308693,
"grad_norm": 0.2045769989490509,
"learning_rate": 1.0882671559612909e-05,
"loss": 0.0029,
"step": 4350
},
{
"epoch": 1.4165718927701056,
"grad_norm": 0.08286549896001816,
"learning_rate": 1.0772036222109182e-05,
"loss": 0.0029,
"step": 4360
},
{
"epoch": 1.419821283509342,
"grad_norm": 0.09282595664262772,
"learning_rate": 1.066181149196e-05,
"loss": 0.0018,
"step": 4370
},
{
"epoch": 1.4230706742485784,
"grad_norm": 0.2858879566192627,
"learning_rate": 1.055200055016057e-05,
"loss": 0.0029,
"step": 4380
},
{
"epoch": 1.4263200649878147,
"grad_norm": 0.2344265580177307,
"learning_rate": 1.0442606565764534e-05,
"loss": 0.0019,
"step": 4390
},
{
"epoch": 1.429569455727051,
"grad_norm": 0.346629798412323,
"learning_rate": 1.0333632695792492e-05,
"loss": 0.0025,
"step": 4400
},
{
"epoch": 1.4328188464662874,
"grad_norm": 0.29290953278541565,
"learning_rate": 1.0225082085140856e-05,
"loss": 0.0022,
"step": 4410
},
{
"epoch": 1.436068237205524,
"grad_norm": 0.43641456961631775,
"learning_rate": 1.0116957866491128e-05,
"loss": 0.0021,
"step": 4420
},
{
"epoch": 1.4393176279447604,
"grad_norm": 0.3562842309474945,
"learning_rate": 1.000926316021952e-05,
"loss": 0.0022,
"step": 4430
},
{
"epoch": 1.4425670186839967,
"grad_norm": 0.4262392520904541,
"learning_rate": 9.902001074306835e-06,
"loss": 0.0028,
"step": 4440
},
{
"epoch": 1.445816409423233,
"grad_norm": 0.2457405924797058,
"learning_rate": 9.795174704248808e-06,
"loss": 0.0025,
"step": 4450
},
{
"epoch": 1.4490658001624697,
"grad_norm": 0.22532138228416443,
"learning_rate": 9.6887871329668e-06,
"loss": 0.0019,
"step": 4460
},
{
"epoch": 1.452315190901706,
"grad_norm": 0.4915858507156372,
"learning_rate": 9.582841430718767e-06,
"loss": 0.0023,
"step": 4470
},
{
"epoch": 1.4555645816409424,
"grad_norm": 0.18646268546581268,
"learning_rate": 9.477340655010716e-06,
"loss": 0.0021,
"step": 4480
},
{
"epoch": 1.4588139723801787,
"grad_norm": 0.10206873714923859,
"learning_rate": 9.372287850508421e-06,
"loss": 0.0017,
"step": 4490
},
{
"epoch": 1.462063363119415,
"grad_norm": 0.3298508822917938,
"learning_rate": 9.267686048949568e-06,
"loss": 0.0018,
"step": 4500
},
{
"epoch": 1.4653127538586515,
"grad_norm": 0.5361196398735046,
"learning_rate": 9.163538269056296e-06,
"loss": 0.003,
"step": 4510
},
{
"epoch": 1.4685621445978878,
"grad_norm": 0.41454723477363586,
"learning_rate": 9.05984751644803e-06,
"loss": 0.0019,
"step": 4520
},
{
"epoch": 1.4718115353371242,
"grad_norm": 0.47003769874572754,
"learning_rate": 8.956616783554759e-06,
"loss": 0.0025,
"step": 4530
},
{
"epoch": 1.4750609260763607,
"grad_norm": 0.2163703888654709,
"learning_rate": 8.853849049530703e-06,
"loss": 0.0018,
"step": 4540
},
{
"epoch": 1.478310316815597,
"grad_norm": 0.08466655015945435,
"learning_rate": 8.751547280168297e-06,
"loss": 0.0021,
"step": 4550
},
{
"epoch": 1.4815597075548335,
"grad_norm": 0.2865130305290222,
"learning_rate": 8.649714427812607e-06,
"loss": 0.0017,
"step": 4560
},
{
"epoch": 1.4848090982940698,
"grad_norm": 0.2946512699127197,
"learning_rate": 8.548353431276182e-06,
"loss": 0.0019,
"step": 4570
},
{
"epoch": 1.4880584890333062,
"grad_norm": 0.4205271303653717,
"learning_rate": 8.447467215754157e-06,
"loss": 0.0021,
"step": 4580
},
{
"epoch": 1.4903330625507718,
"eval_loss": 0.20176434516906738,
"eval_runtime": 733.5854,
"eval_samples_per_second": 3.817,
"eval_steps_per_second": 3.817,
"step": 4587
}
],
"logging_steps": 10,
"max_steps": 6156,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1529,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.5805200990465556e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}