{
"best_global_step": 5000,
"best_metric": 0.6726189255714417,
"best_model_checkpoint": "/workspace/rails-finetune/adapters-qwen3-coder-30b/checkpoint-5000",
"epoch": 0.7984828825232059,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015969657650464117,
"grad_norm": 0.15089154243469238,
"learning_rate": 6.000000000000001e-07,
"loss": 1.5709858894348145,
"step": 10
},
{
"epoch": 0.0031939315300928235,
"grad_norm": 0.1435190588235855,
"learning_rate": 1.2666666666666669e-06,
"loss": 1.6188209533691407,
"step": 20
},
{
"epoch": 0.004790897295139236,
"grad_norm": 0.14615978300571442,
"learning_rate": 1.9333333333333336e-06,
"loss": 1.6381675720214843,
"step": 30
},
{
"epoch": 0.006387863060185647,
"grad_norm": 0.1281592696905136,
"learning_rate": 2.6e-06,
"loss": 1.5563851356506349,
"step": 40
},
{
"epoch": 0.00798482882523206,
"grad_norm": 0.16032098233699799,
"learning_rate": 3.266666666666667e-06,
"loss": 1.6382104873657226,
"step": 50
},
{
"epoch": 0.009581794590278471,
"grad_norm": 0.14609354734420776,
"learning_rate": 3.9333333333333335e-06,
"loss": 1.6256795883178712,
"step": 60
},
{
"epoch": 0.011178760355324883,
"grad_norm": 0.13070641458034515,
"learning_rate": 4.600000000000001e-06,
"loss": 1.5222463607788086,
"step": 70
},
{
"epoch": 0.012775726120371294,
"grad_norm": 0.1499311476945877,
"learning_rate": 5.2666666666666665e-06,
"loss": 1.518197727203369,
"step": 80
},
{
"epoch": 0.014372691885417706,
"grad_norm": 0.12919697165489197,
"learning_rate": 5.933333333333335e-06,
"loss": 1.4710905075073242,
"step": 90
},
{
"epoch": 0.01596965765046412,
"grad_norm": 0.11437301337718964,
"learning_rate": 6.600000000000001e-06,
"loss": 1.4049152374267577,
"step": 100
},
{
"epoch": 0.01756662341551053,
"grad_norm": 0.13434389233589172,
"learning_rate": 7.266666666666668e-06,
"loss": 1.3853497505187988,
"step": 110
},
{
"epoch": 0.019163589180556943,
"grad_norm": 0.08636850863695145,
"learning_rate": 7.933333333333334e-06,
"loss": 1.3351438522338868,
"step": 120
},
{
"epoch": 0.020760554945603353,
"grad_norm": 0.08471965044736862,
"learning_rate": 8.6e-06,
"loss": 1.2937309265136718,
"step": 130
},
{
"epoch": 0.022357520710649767,
"grad_norm": 0.058983951807022095,
"learning_rate": 9.266666666666667e-06,
"loss": 1.2116679191589355,
"step": 140
},
{
"epoch": 0.023954486475696177,
"grad_norm": 0.06733114272356033,
"learning_rate": 9.933333333333334e-06,
"loss": 1.1680314064025878,
"step": 150
},
{
"epoch": 0.025551452240742588,
"grad_norm": 0.045156873762607574,
"learning_rate": 1.0600000000000002e-05,
"loss": 1.1280742645263673,
"step": 160
},
{
"epoch": 0.027148418005789002,
"grad_norm": 0.0415460579097271,
"learning_rate": 1.1266666666666668e-05,
"loss": 1.0879782676696776,
"step": 170
},
{
"epoch": 0.028745383770835412,
"grad_norm": 0.05221620574593544,
"learning_rate": 1.1933333333333335e-05,
"loss": 1.0641801834106446,
"step": 180
},
{
"epoch": 0.030342349535881826,
"grad_norm": 0.036492571234703064,
"learning_rate": 1.2600000000000001e-05,
"loss": 1.0208759307861328,
"step": 190
},
{
"epoch": 0.03193931530092824,
"grad_norm": 0.03153146430850029,
"learning_rate": 1.3266666666666668e-05,
"loss": 1.0204400062561034,
"step": 200
},
{
"epoch": 0.03353628106597465,
"grad_norm": 0.03343074768781662,
"learning_rate": 1.3933333333333334e-05,
"loss": 0.9532640457153321,
"step": 210
},
{
"epoch": 0.03513324683102106,
"grad_norm": 0.034289468079805374,
"learning_rate": 1.46e-05,
"loss": 0.9065071105957031,
"step": 220
},
{
"epoch": 0.03673021259606747,
"grad_norm": 0.040245093405246735,
"learning_rate": 1.5266666666666667e-05,
"loss": 0.895290470123291,
"step": 230
},
{
"epoch": 0.038327178361113885,
"grad_norm": 0.03090902790427208,
"learning_rate": 1.5933333333333336e-05,
"loss": 0.89828462600708,
"step": 240
},
{
"epoch": 0.03992414412616029,
"grad_norm": 0.023231355473399162,
"learning_rate": 1.66e-05,
"loss": 0.867582893371582,
"step": 250
},
{
"epoch": 0.041521109891206706,
"grad_norm": 0.022748615592718124,
"learning_rate": 1.726666666666667e-05,
"loss": 0.8476214408874512,
"step": 260
},
{
"epoch": 0.04311807565625312,
"grad_norm": 0.025588329881429672,
"learning_rate": 1.7933333333333333e-05,
"loss": 0.8639037132263183,
"step": 270
},
{
"epoch": 0.044715041421299534,
"grad_norm": 0.029987547546625137,
"learning_rate": 1.86e-05,
"loss": 0.8353777885437011,
"step": 280
},
{
"epoch": 0.04631200718634594,
"grad_norm": 0.020994428545236588,
"learning_rate": 1.926666666666667e-05,
"loss": 0.8189143180847168,
"step": 290
},
{
"epoch": 0.047908972951392355,
"grad_norm": 0.023545585572719574,
"learning_rate": 1.9933333333333334e-05,
"loss": 0.7846664905548095,
"step": 300
},
{
"epoch": 0.04950593871643877,
"grad_norm": 0.025134805589914322,
"learning_rate": 1.999988754726792e-05,
"loss": 0.7751604557037354,
"step": 310
},
{
"epoch": 0.051102904481485176,
"grad_norm": 0.021275486797094345,
"learning_rate": 1.9999498825021314e-05,
"loss": 0.7967638969421387,
"step": 320
},
{
"epoch": 0.05269987024653159,
"grad_norm": 0.019746815785765648,
"learning_rate": 1.9998832455745586e-05,
"loss": 0.8192337036132813,
"step": 330
},
{
"epoch": 0.054296836011578004,
"grad_norm": 0.02567191608250141,
"learning_rate": 1.9997888457943197e-05,
"loss": 0.7604565620422363,
"step": 340
},
{
"epoch": 0.05589380177662441,
"grad_norm": 0.024295911192893982,
"learning_rate": 1.9996666857825287e-05,
"loss": 0.7550592422485352,
"step": 350
},
{
"epoch": 0.057490767541670824,
"grad_norm": 0.02220323495566845,
"learning_rate": 1.9995167689310917e-05,
"loss": 0.7514551639556885,
"step": 360
},
{
"epoch": 0.05908773330671724,
"grad_norm": 0.024802017956972122,
"learning_rate": 1.9993390994026144e-05,
"loss": 0.7904987812042237,
"step": 370
},
{
"epoch": 0.06068469907176365,
"grad_norm": 0.027685435488820076,
"learning_rate": 1.9991336821302856e-05,
"loss": 0.7770297050476074,
"step": 380
},
{
"epoch": 0.06228166483681006,
"grad_norm": 0.019939841702580452,
"learning_rate": 1.9989005228177406e-05,
"loss": 0.7634893417358398,
"step": 390
},
{
"epoch": 0.06387863060185647,
"grad_norm": 0.028217511251568794,
"learning_rate": 1.9986396279389028e-05,
"loss": 0.8059945106506348,
"step": 400
},
{
"epoch": 0.06547559636690288,
"grad_norm": 0.019439322873950005,
"learning_rate": 1.9983510047378046e-05,
"loss": 0.7787755489349365,
"step": 410
},
{
"epoch": 0.0670725621319493,
"grad_norm": 0.02309611812233925,
"learning_rate": 1.9980346612283842e-05,
"loss": 0.7745323657989502,
"step": 420
},
{
"epoch": 0.06866952789699571,
"grad_norm": 0.020378535613417625,
"learning_rate": 1.997690606194266e-05,
"loss": 0.7243520736694335,
"step": 430
},
{
"epoch": 0.07026649366204211,
"grad_norm": 0.021776653826236725,
"learning_rate": 1.9973188491885146e-05,
"loss": 0.7424108028411865,
"step": 440
},
{
"epoch": 0.07186345942708854,
"grad_norm": 0.03435182571411133,
"learning_rate": 1.9969194005333697e-05,
"loss": 0.7804229736328125,
"step": 450
},
{
"epoch": 0.07346042519213494,
"grad_norm": 0.026880595833063126,
"learning_rate": 1.9964922713199613e-05,
"loss": 0.7963083267211915,
"step": 460
},
{
"epoch": 0.07505739095718135,
"grad_norm": 0.02643360197544098,
"learning_rate": 1.9960374734079984e-05,
"loss": 0.7517959117889405,
"step": 470
},
{
"epoch": 0.07665435672222777,
"grad_norm": 0.036808405071496964,
"learning_rate": 1.9955550194254436e-05,
"loss": 0.7511186122894287,
"step": 480
},
{
"epoch": 0.07825132248727418,
"grad_norm": 0.03115280158817768,
"learning_rate": 1.995044922768159e-05,
"loss": 0.765751314163208,
"step": 490
},
{
"epoch": 0.07984828825232058,
"grad_norm": 0.025536708533763885,
"learning_rate": 1.994507197599537e-05,
"loss": 0.7346895217895508,
"step": 500
},
{
"epoch": 0.07984828825232058,
"eval_loss": 0.7535409331321716,
"eval_runtime": 1705.9703,
"eval_samples_per_second": 3.263,
"eval_steps_per_second": 1.631,
"step": 500
},
{
"epoch": 0.081445254017367,
"grad_norm": 0.03875249624252319,
"learning_rate": 1.993941858850106e-05,
"loss": 0.7961405277252197,
"step": 510
},
{
"epoch": 0.08304221978241341,
"grad_norm": 0.027557438239455223,
"learning_rate": 1.993348922217114e-05,
"loss": 0.7667338371276855,
"step": 520
},
{
"epoch": 0.08463918554745983,
"grad_norm": 0.026045992970466614,
"learning_rate": 1.9927284041640967e-05,
"loss": 0.7275139808654785,
"step": 530
},
{
"epoch": 0.08623615131250624,
"grad_norm": 0.026213862001895905,
"learning_rate": 1.992080321920416e-05,
"loss": 0.7866159439086914,
"step": 540
},
{
"epoch": 0.08783311707755265,
"grad_norm": 0.028538137674331665,
"learning_rate": 1.9914046934807853e-05,
"loss": 0.7298189640045166,
"step": 550
},
{
"epoch": 0.08943008284259907,
"grad_norm": 0.028126219287514687,
"learning_rate": 1.9907015376047675e-05,
"loss": 0.749655294418335,
"step": 560
},
{
"epoch": 0.09102704860764547,
"grad_norm": 0.027085473760962486,
"learning_rate": 1.9899708738162553e-05,
"loss": 0.8086295127868652,
"step": 570
},
{
"epoch": 0.09262401437269188,
"grad_norm": 0.034640420228242874,
"learning_rate": 1.989212722402928e-05,
"loss": 0.7653923511505127,
"step": 580
},
{
"epoch": 0.0942209801377383,
"grad_norm": 0.03858828917145729,
"learning_rate": 1.98842710441569e-05,
"loss": 0.8174265861511231,
"step": 590
},
{
"epoch": 0.09581794590278471,
"grad_norm": 0.04248794540762901,
"learning_rate": 1.987614041668084e-05,
"loss": 0.7357111930847168,
"step": 600
},
{
"epoch": 0.09741491166783112,
"grad_norm": 0.03530848026275635,
"learning_rate": 1.9867735567356876e-05,
"loss": 0.7747371196746826,
"step": 610
},
{
"epoch": 0.09901187743287754,
"grad_norm": 0.027734428644180298,
"learning_rate": 1.9859056729554845e-05,
"loss": 0.73280348777771,
"step": 620
},
{
"epoch": 0.10060884319792394,
"grad_norm": 0.02657356671988964,
"learning_rate": 1.9850104144252177e-05,
"loss": 0.749216365814209,
"step": 630
},
{
"epoch": 0.10220580896297035,
"grad_norm": 0.021906374022364616,
"learning_rate": 1.98408780600272e-05,
"loss": 0.7534349441528321,
"step": 640
},
{
"epoch": 0.10380277472801677,
"grad_norm": 0.025684406980872154,
"learning_rate": 1.9831378733052244e-05,
"loss": 0.7199561595916748,
"step": 650
},
{
"epoch": 0.10539974049306318,
"grad_norm": 0.025311095640063286,
"learning_rate": 1.982160642708652e-05,
"loss": 0.7382417678833008,
"step": 660
},
{
"epoch": 0.10699670625810959,
"grad_norm": 0.035563357174396515,
"learning_rate": 1.9811561413468794e-05,
"loss": 0.7394683837890625,
"step": 670
},
{
"epoch": 0.10859367202315601,
"grad_norm": 0.023977380245923996,
"learning_rate": 1.9801243971109868e-05,
"loss": 0.7267738819122315,
"step": 680
},
{
"epoch": 0.11019063778820241,
"grad_norm": 0.02275015600025654,
"learning_rate": 1.9790654386484818e-05,
"loss": 0.7240358829498291,
"step": 690
},
{
"epoch": 0.11178760355324882,
"grad_norm": 0.036198344081640244,
"learning_rate": 1.9779792953625052e-05,
"loss": 0.7299670696258544,
"step": 700
},
{
"epoch": 0.11338456931829524,
"grad_norm": 0.027184955775737762,
"learning_rate": 1.976865997411014e-05,
"loss": 0.695775318145752,
"step": 710
},
{
"epoch": 0.11498153508334165,
"grad_norm": 0.023749997839331627,
"learning_rate": 1.9757255757059446e-05,
"loss": 0.7071991920471191,
"step": 720
},
{
"epoch": 0.11657850084838806,
"grad_norm": 0.027117466554045677,
"learning_rate": 1.9745580619123535e-05,
"loss": 0.7466438293457032,
"step": 730
},
{
"epoch": 0.11817546661343448,
"grad_norm": 0.027798349037766457,
"learning_rate": 1.9733634884475395e-05,
"loss": 0.7503840923309326,
"step": 740
},
{
"epoch": 0.11977243237848088,
"grad_norm": 0.026327304542064667,
"learning_rate": 1.9721418884801414e-05,
"loss": 0.724392032623291,
"step": 750
},
{
"epoch": 0.1213693981435273,
"grad_norm": 0.029995381832122803,
"learning_rate": 1.97089329592922e-05,
"loss": 0.7100958824157715,
"step": 760
},
{
"epoch": 0.12296636390857371,
"grad_norm": 0.03063913807272911,
"learning_rate": 1.969617745463314e-05,
"loss": 0.7130911827087403,
"step": 770
},
{
"epoch": 0.12456332967362012,
"grad_norm": 0.021976860240101814,
"learning_rate": 1.968315272499478e-05,
"loss": 0.7231676578521729,
"step": 780
},
{
"epoch": 0.12616029543866653,
"grad_norm": 0.02538118325173855,
"learning_rate": 1.9669859132022994e-05,
"loss": 0.7253612995147705,
"step": 790
},
{
"epoch": 0.12775726120371295,
"grad_norm": 0.02788228541612625,
"learning_rate": 1.9656297044828943e-05,
"loss": 0.7297886848449707,
"step": 800
},
{
"epoch": 0.12935422696875937,
"grad_norm": 0.028525004163384438,
"learning_rate": 1.9642466839978814e-05,
"loss": 0.7109212875366211,
"step": 810
},
{
"epoch": 0.13095119273380576,
"grad_norm": 0.028368208557367325,
"learning_rate": 1.962836890148339e-05,
"loss": 0.7552286624908447,
"step": 820
},
{
"epoch": 0.13254815849885218,
"grad_norm": 0.03732667118310928,
"learning_rate": 1.9614003620787358e-05,
"loss": 0.74849853515625,
"step": 830
},
{
"epoch": 0.1341451242638986,
"grad_norm": 0.026267215609550476,
"learning_rate": 1.9599371396758457e-05,
"loss": 0.7323933124542237,
"step": 840
},
{
"epoch": 0.135742090028945,
"grad_norm": 0.031789544969797134,
"learning_rate": 1.958447263567641e-05,
"loss": 0.7394798755645752,
"step": 850
},
{
"epoch": 0.13733905579399142,
"grad_norm": 0.03209487721323967,
"learning_rate": 1.956930775122162e-05,
"loss": 0.7479897499084472,
"step": 860
},
{
"epoch": 0.13893602155903784,
"grad_norm": 0.026272661983966827,
"learning_rate": 1.9553877164463698e-05,
"loss": 0.7194801807403565,
"step": 870
},
{
"epoch": 0.14053298732408423,
"grad_norm": 0.02558542974293232,
"learning_rate": 1.953818130384978e-05,
"loss": 0.6967973709106445,
"step": 880
},
{
"epoch": 0.14212995308913065,
"grad_norm": 0.03761237859725952,
"learning_rate": 1.9522220605192615e-05,
"loss": 0.6678271770477295,
"step": 890
},
{
"epoch": 0.14372691885417707,
"grad_norm": 0.029605882242321968,
"learning_rate": 1.9505995511658464e-05,
"loss": 0.6922338008880615,
"step": 900
},
{
"epoch": 0.14532388461922346,
"grad_norm": 0.03535737469792366,
"learning_rate": 1.948950647375481e-05,
"loss": 0.7159334182739258,
"step": 910
},
{
"epoch": 0.14692085038426989,
"grad_norm": 0.030855044722557068,
"learning_rate": 1.9472753949317843e-05,
"loss": 0.7333884716033936,
"step": 920
},
{
"epoch": 0.1485178161493163,
"grad_norm": 0.05045896768569946,
"learning_rate": 1.9455738403499728e-05,
"loss": 0.6935329914093018,
"step": 930
},
{
"epoch": 0.1501147819143627,
"grad_norm": 0.033187806606292725,
"learning_rate": 1.9438460308755724e-05,
"loss": 0.7046589851379395,
"step": 940
},
{
"epoch": 0.15171174767940912,
"grad_norm": 0.03167016804218292,
"learning_rate": 1.9420920144831044e-05,
"loss": 0.710863447189331,
"step": 950
},
{
"epoch": 0.15330871344445554,
"grad_norm": 0.025539802387356758,
"learning_rate": 1.9403118398747533e-05,
"loss": 0.6936647891998291,
"step": 960
},
{
"epoch": 0.15490567920950193,
"grad_norm": 0.04223870858550072,
"learning_rate": 1.9385055564790157e-05,
"loss": 0.6985628128051757,
"step": 970
},
{
"epoch": 0.15650264497454835,
"grad_norm": 0.030605314299464226,
"learning_rate": 1.9366732144493266e-05,
"loss": 0.7294198989868164,
"step": 980
},
{
"epoch": 0.15809961073959478,
"grad_norm": 0.03095085918903351,
"learning_rate": 1.9348148646626687e-05,
"loss": 0.7009531021118164,
"step": 990
},
{
"epoch": 0.15969657650464117,
"grad_norm": 0.02993757091462612,
"learning_rate": 1.9329305587181574e-05,
"loss": 0.7266313552856445,
"step": 1000
},
{
"epoch": 0.15969657650464117,
"eval_loss": 0.7159722447395325,
"eval_runtime": 1703.1621,
"eval_samples_per_second": 3.268,
"eval_steps_per_second": 1.634,
"step": 1000
},
{
"epoch": 0.1612935422696876,
"grad_norm": 0.035543542355298996,
"learning_rate": 1.9310203489356092e-05,
"loss": 0.74082350730896,
"step": 1010
},
{
"epoch": 0.162890508034734,
"grad_norm": 0.03636915981769562,
"learning_rate": 1.9290842883540897e-05,
"loss": 0.717669153213501,
"step": 1020
},
{
"epoch": 0.1644874737997804,
"grad_norm": 0.04510757327079773,
"learning_rate": 1.92712243073044e-05,
"loss": 0.7365827560424805,
"step": 1030
},
{
"epoch": 0.16608443956482682,
"grad_norm": 0.03563699871301651,
"learning_rate": 1.925134830537784e-05,
"loss": 0.6981801986694336,
"step": 1040
},
{
"epoch": 0.16768140532987325,
"grad_norm": 0.04541337490081787,
"learning_rate": 1.9231215429640167e-05,
"loss": 0.754232931137085,
"step": 1050
},
{
"epoch": 0.16927837109491967,
"grad_norm": 0.033323634415864944,
"learning_rate": 1.921082623910271e-05,
"loss": 0.691849946975708,
"step": 1060
},
{
"epoch": 0.17087533685996606,
"grad_norm": 0.03559419885277748,
"learning_rate": 1.919018129989366e-05,
"loss": 0.6994197845458985,
"step": 1070
},
{
"epoch": 0.17247230262501248,
"grad_norm": 0.03182852268218994,
"learning_rate": 1.916928118524235e-05,
"loss": 0.6645867824554443,
"step": 1080
},
{
"epoch": 0.1740692683900589,
"grad_norm": 0.02616371586918831,
"learning_rate": 1.9148126475463336e-05,
"loss": 0.7270137786865234,
"step": 1090
},
{
"epoch": 0.1756662341551053,
"grad_norm": 0.03250862658023834,
"learning_rate": 1.9126717757940288e-05,
"loss": 0.7272531509399414,
"step": 1100
},
{
"epoch": 0.17726319992015172,
"grad_norm": 0.036741774529218674,
"learning_rate": 1.9105055627109683e-05,
"loss": 0.7251851558685303,
"step": 1110
},
{
"epoch": 0.17886016568519814,
"grad_norm": 0.03240974619984627,
"learning_rate": 1.908314068444429e-05,
"loss": 0.7145021915435791,
"step": 1120
},
{
"epoch": 0.18045713145024453,
"grad_norm": 0.0324835442006588,
"learning_rate": 1.9060973538436478e-05,
"loss": 0.7045553684234619,
"step": 1130
},
{
"epoch": 0.18205409721529095,
"grad_norm": 0.029804598540067673,
"learning_rate": 1.9038554804581318e-05,
"loss": 0.7342820644378663,
"step": 1140
},
{
"epoch": 0.18365106298033737,
"grad_norm": 0.03631270304322243,
"learning_rate": 1.9015885105359492e-05,
"loss": 0.7435333728790283,
"step": 1150
},
{
"epoch": 0.18524802874538376,
"grad_norm": 0.031312599778175354,
"learning_rate": 1.8992965070220007e-05,
"loss": 0.7185348033905029,
"step": 1160
},
{
"epoch": 0.18684499451043018,
"grad_norm": 0.03553950414061546,
"learning_rate": 1.896979533556273e-05,
"loss": 0.7191666603088379,
"step": 1170
},
{
"epoch": 0.1884419602754766,
"grad_norm": 0.024733861908316612,
"learning_rate": 1.8946376544720698e-05,
"loss": 0.7352997779846191,
"step": 1180
},
{
"epoch": 0.190038926040523,
"grad_norm": 0.030409259721636772,
"learning_rate": 1.8922709347942275e-05,
"loss": 0.7265621185302734,
"step": 1190
},
{
"epoch": 0.19163589180556942,
"grad_norm": 0.030736226588487625,
"learning_rate": 1.8898794402373077e-05,
"loss": 0.7150910377502442,
"step": 1200
},
{
"epoch": 0.19323285757061584,
"grad_norm": 0.038398947566747665,
"learning_rate": 1.887463237203775e-05,
"loss": 0.7055376052856446,
"step": 1210
},
{
"epoch": 0.19482982333566223,
"grad_norm": 0.03485625982284546,
"learning_rate": 1.88502239278215e-05,
"loss": 0.6555115699768066,
"step": 1220
},
{
"epoch": 0.19642678910070865,
"grad_norm": 0.03220400586724281,
"learning_rate": 1.8825569747451505e-05,
"loss": 0.6946470737457275,
"step": 1230
},
{
"epoch": 0.19802375486575508,
"grad_norm": 0.03333678096532822,
"learning_rate": 1.880067051547806e-05,
"loss": 0.7297664642333984,
"step": 1240
},
{
"epoch": 0.19962072063080147,
"grad_norm": 0.0367787703871727,
"learning_rate": 1.8775526923255597e-05,
"loss": 0.7051557064056396,
"step": 1250
},
{
"epoch": 0.2012176863958479,
"grad_norm": 0.03103001043200493,
"learning_rate": 1.8750139668923472e-05,
"loss": 0.7219597339630127,
"step": 1260
},
{
"epoch": 0.2028146521608943,
"grad_norm": 0.04533790051937103,
"learning_rate": 1.872450945738659e-05,
"loss": 0.6916751384735107,
"step": 1270
},
{
"epoch": 0.2044116179259407,
"grad_norm": 0.03510045260190964,
"learning_rate": 1.8698637000295816e-05,
"loss": 0.7119457721710205,
"step": 1280
},
{
"epoch": 0.20600858369098712,
"grad_norm": 0.029134295880794525,
"learning_rate": 1.867252301602825e-05,
"loss": 0.6870355129241943,
"step": 1290
},
{
"epoch": 0.20760554945603354,
"grad_norm": 0.03194071725010872,
"learning_rate": 1.8646168229667238e-05,
"loss": 0.734464693069458,
"step": 1300
},
{
"epoch": 0.20920251522107994,
"grad_norm": 0.026471436023712158,
"learning_rate": 1.861957337298227e-05,
"loss": 0.7240866184234619,
"step": 1310
},
{
"epoch": 0.21079948098612636,
"grad_norm": 0.0347883440554142,
"learning_rate": 1.8592739184408657e-05,
"loss": 0.6553101062774658,
"step": 1320
},
{
"epoch": 0.21239644675117278,
"grad_norm": 0.026658106595277786,
"learning_rate": 1.8565666409027004e-05,
"loss": 0.7519384384155273,
"step": 1330
},
{
"epoch": 0.21399341251621917,
"grad_norm": 0.030675504356622696,
"learning_rate": 1.8538355798542556e-05,
"loss": 0.6963082790374756,
"step": 1340
},
{
"epoch": 0.2155903782812656,
"grad_norm": 0.04593832045793533,
"learning_rate": 1.85108081112643e-05,
"loss": 0.7145741939544678,
"step": 1350
},
{
"epoch": 0.21718734404631201,
"grad_norm": 0.04163511469960213,
"learning_rate": 1.8483024112083928e-05,
"loss": 0.7342512130737304,
"step": 1360
},
{
"epoch": 0.2187843098113584,
"grad_norm": 0.025683345273137093,
"learning_rate": 1.8455004572454583e-05,
"loss": 0.7134137630462647,
"step": 1370
},
{
"epoch": 0.22038127557640483,
"grad_norm": 0.026864832267165184,
"learning_rate": 1.8426750270369452e-05,
"loss": 0.6854844570159913,
"step": 1380
},
{
"epoch": 0.22197824134145125,
"grad_norm": 0.03378361091017723,
"learning_rate": 1.839826199034015e-05,
"loss": 0.6904460430145264,
"step": 1390
},
{
"epoch": 0.22357520710649764,
"grad_norm": 0.03360961750149727,
"learning_rate": 1.8369540523374963e-05,
"loss": 0.7089653491973877,
"step": 1400
},
{
"epoch": 0.22517217287154406,
"grad_norm": 0.033883776515722275,
"learning_rate": 1.8340586666956846e-05,
"loss": 0.7053616523742676,
"step": 1410
},
{
"epoch": 0.22676913863659048,
"grad_norm": 0.03643488511443138,
"learning_rate": 1.8311401225021318e-05,
"loss": 0.7411230087280274,
"step": 1420
},
{
"epoch": 0.22836610440163688,
"grad_norm": 0.030240802094340324,
"learning_rate": 1.8281985007934115e-05,
"loss": 0.7020374298095703,
"step": 1430
},
{
"epoch": 0.2299630701666833,
"grad_norm": 0.03315526619553566,
"learning_rate": 1.8252338832468702e-05,
"loss": 0.7227590084075928,
"step": 1440
},
{
"epoch": 0.23156003593172972,
"grad_norm": 0.04955840855836868,
"learning_rate": 1.8222463521783584e-05,
"loss": 0.7004672527313233,
"step": 1450
},
{
"epoch": 0.2331570016967761,
"grad_norm": 0.03338323533535004,
"learning_rate": 1.819235990539946e-05,
"loss": 0.7032230377197266,
"step": 1460
},
{
"epoch": 0.23475396746182253,
"grad_norm": 0.03659350797533989,
"learning_rate": 1.8162028819176192e-05,
"loss": 0.7022134780883789,
"step": 1470
},
{
"epoch": 0.23635093322686895,
"grad_norm": 0.029659852385520935,
"learning_rate": 1.813147110528958e-05,
"loss": 0.7326688289642334,
"step": 1480
},
{
"epoch": 0.23794789899191537,
"grad_norm": 0.03314507007598877,
"learning_rate": 1.8100687612208e-05,
"loss": 0.7101527690887451,
"step": 1490
},
{
"epoch": 0.23954486475696177,
"grad_norm": 0.048770975321531296,
"learning_rate": 1.806967919466883e-05,
"loss": 0.7013855457305909,
"step": 1500
},
{
"epoch": 0.23954486475696177,
"eval_loss": 0.7003746032714844,
"eval_runtime": 1694.4125,
"eval_samples_per_second": 3.285,
"eval_steps_per_second": 1.642,
"step": 1500
},
{
"epoch": 0.2411418305220082,
"grad_norm": 0.031342763453722,
"learning_rate": 1.803844671365471e-05,
"loss": 0.6915247917175293,
"step": 1510
},
{
"epoch": 0.2427387962870546,
"grad_norm": 0.029329324141144753,
"learning_rate": 1.800699103636967e-05,
"loss": 0.721204423904419,
"step": 1520
},
{
"epoch": 0.244335762052101,
"grad_norm": 0.03263983875513077,
"learning_rate": 1.7975313036215015e-05,
"loss": 0.7468688011169433,
"step": 1530
},
{
"epoch": 0.24593272781714742,
"grad_norm": 0.035935308784246445,
"learning_rate": 1.794341359276509e-05,
"loss": 0.6986902713775635,
"step": 1540
},
{
"epoch": 0.24752969358219384,
"grad_norm": 0.03797990456223488,
"learning_rate": 1.7911293591742855e-05,
"loss": 0.7199019908905029,
"step": 1550
},
{
"epoch": 0.24912665934724024,
"grad_norm": 0.024390801787376404,
"learning_rate": 1.787895392499529e-05,
"loss": 0.6761057853698731,
"step": 1560
},
{
"epoch": 0.25072362511228663,
"grad_norm": 0.0313715860247612,
"learning_rate": 1.7846395490468643e-05,
"loss": 0.6731560707092286,
"step": 1570
},
{
"epoch": 0.25232059087733305,
"grad_norm": 0.0433608703315258,
"learning_rate": 1.781361919218348e-05,
"loss": 0.685046911239624,
"step": 1580
},
{
"epoch": 0.25391755664237947,
"grad_norm": 0.050231028348207474,
"learning_rate": 1.7780625940209596e-05,
"loss": 0.7280925273895263,
"step": 1590
},
{
"epoch": 0.2555145224074259,
"grad_norm": 0.0476137213408947,
"learning_rate": 1.774741665064074e-05,
"loss": 0.728914451599121,
"step": 1600
},
{
"epoch": 0.2571114881724723,
"grad_norm": 0.03540361300110817,
"learning_rate": 1.771399224556919e-05,
"loss": 0.6937174320220947,
"step": 1610
},
{
"epoch": 0.25870845393751873,
"grad_norm": 0.038516197353601456,
"learning_rate": 1.7680353653060135e-05,
"loss": 0.6788946151733398,
"step": 1620
},
{
"epoch": 0.2603054197025651,
"grad_norm": 0.0472414456307888,
"learning_rate": 1.7646501807125905e-05,
"loss": 0.7246061325073242,
"step": 1630
},
{
"epoch": 0.2619023854676115,
"grad_norm": 0.026418814435601234,
"learning_rate": 1.7612437647700056e-05,
"loss": 0.6792353630065918,
"step": 1640
},
{
"epoch": 0.26349935123265794,
"grad_norm": 0.025900904089212418,
"learning_rate": 1.757816212061126e-05,
"loss": 0.6991750240325928,
"step": 1650
},
{
"epoch": 0.26509631699770436,
"grad_norm": 0.036637961864471436,
"learning_rate": 1.7543676177557042e-05,
"loss": 0.7213243007659912,
"step": 1660
},
{
"epoch": 0.2666932827627508,
"grad_norm": 0.031415775418281555,
"learning_rate": 1.750898077607735e-05,
"loss": 0.6988609790802002,
"step": 1670
},
{
"epoch": 0.2682902485277972,
"grad_norm": 0.036175280809402466,
"learning_rate": 1.7474076879527977e-05,
"loss": 0.6812397003173828,
"step": 1680
},
{
"epoch": 0.2698872142928436,
"grad_norm": 0.03462570160627365,
"learning_rate": 1.743896545705382e-05,
"loss": 0.6791361331939697,
"step": 1690
},
{
"epoch": 0.27148418005789,
"grad_norm": 0.03334948793053627,
"learning_rate": 1.740364748356195e-05,
"loss": 0.6668567180633544,
"step": 1700
},
{
"epoch": 0.2730811458229364,
"grad_norm": 0.03270823508501053,
"learning_rate": 1.7368123939694554e-05,
"loss": 0.6887404441833496,
"step": 1710
},
{
"epoch": 0.27467811158798283,
"grad_norm": 0.03120650351047516,
"learning_rate": 1.7332395811801706e-05,
"loss": 0.6659372806549072,
"step": 1720
},
{
"epoch": 0.27627507735302925,
"grad_norm": 0.042766936123371124,
"learning_rate": 1.7296464091913986e-05,
"loss": 0.7515771389007568,
"step": 1730
},
{
"epoch": 0.2778720431180757,
"grad_norm": 0.027079230174422264,
"learning_rate": 1.7260329777714923e-05,
"loss": 0.7227123260498047,
"step": 1740
},
{
"epoch": 0.2794690088831221,
"grad_norm": 0.03032068908214569,
"learning_rate": 1.722399387251329e-05,
"loss": 0.695145559310913,
"step": 1750
},
{
"epoch": 0.28106597464816846,
"grad_norm": 0.03420598804950714,
"learning_rate": 1.7187457385215274e-05,
"loss": 0.6715566158294678,
"step": 1760
},
{
"epoch": 0.2826629404132149,
"grad_norm": 0.037743836641311646,
"learning_rate": 1.7150721330296428e-05,
"loss": 0.6833428382873535,
"step": 1770
},
{
"epoch": 0.2842599061782613,
"grad_norm": 0.035041823983192444,
"learning_rate": 1.7113786727773528e-05,
"loss": 0.7029437065124512,
"step": 1780
},
{
"epoch": 0.2858568719433077,
"grad_norm": 0.03538225591182709,
"learning_rate": 1.7076654603176234e-05,
"loss": 0.7168496131896973,
"step": 1790
},
{
"epoch": 0.28745383770835414,
"grad_norm": 0.03331568092107773,
"learning_rate": 1.7039325987518623e-05,
"loss": 0.7009388923645019,
"step": 1800
},
{
"epoch": 0.28905080347340056,
"grad_norm": 0.033853549510240555,
"learning_rate": 1.700180191727057e-05,
"loss": 0.7036603450775146,
"step": 1810
},
{
"epoch": 0.29064776923844693,
"grad_norm": 0.03764677420258522,
"learning_rate": 1.696408343432895e-05,
"loss": 0.6595663547515869,
"step": 1820
},
{
"epoch": 0.29224473500349335,
"grad_norm": 0.028797749429941177,
"learning_rate": 1.6926171585988728e-05,
"loss": 0.6624193668365479,
"step": 1830
},
{
"epoch": 0.29384170076853977,
"grad_norm": 0.03950833901762962,
"learning_rate": 1.6888067424913863e-05,
"loss": 0.7282033920288086,
"step": 1840
},
{
"epoch": 0.2954386665335862,
"grad_norm": 0.032418642193078995,
"learning_rate": 1.6849772009108094e-05,
"loss": 0.6922025680541992,
"step": 1850
},
{
"epoch": 0.2970356322986326,
"grad_norm": 0.04753576219081879,
"learning_rate": 1.6811286401885554e-05,
"loss": 0.6543939590454102,
"step": 1860
},
{
"epoch": 0.29863259806367903,
"grad_norm": 0.03233656659722328,
"learning_rate": 1.677261167184125e-05,
"loss": 0.7344549655914306,
"step": 1870
},
{
"epoch": 0.3002295638287254,
"grad_norm": 0.030969245359301567,
"learning_rate": 1.673374889282139e-05,
"loss": 0.6734820365905761,
"step": 1880
},
{
"epoch": 0.3018265295937718,
"grad_norm": 0.03292040154337883,
"learning_rate": 1.6694699143893566e-05,
"loss": 0.6728582382202148,
"step": 1890
},
{
"epoch": 0.30342349535881824,
"grad_norm": 0.05033507198095322,
"learning_rate": 1.6655463509316797e-05,
"loss": 0.7132375240325928,
"step": 1900
},
{
"epoch": 0.30502046112386466,
"grad_norm": 0.03168455511331558,
"learning_rate": 1.6616043078511425e-05,
"loss": 0.7131926536560058,
"step": 1910
},
{
"epoch": 0.3066174268889111,
"grad_norm": 0.03114555962383747,
"learning_rate": 1.657643894602885e-05,
"loss": 0.7184583187103272,
"step": 1920
},
{
"epoch": 0.3082143926539575,
"grad_norm": 0.031201248988509178,
"learning_rate": 1.6536652211521155e-05,
"loss": 0.6606316566467285,
"step": 1930
},
{
"epoch": 0.30981135841900387,
"grad_norm": 0.035202883183956146,
"learning_rate": 1.6496683979710576e-05,
"loss": 0.7029526233673096,
"step": 1940
},
{
"epoch": 0.3114083241840503,
"grad_norm": 0.03515475615859032,
"learning_rate": 1.6456535360358807e-05,
"loss": 0.6629764080047608,
"step": 1950
},
{
"epoch": 0.3130052899490967,
"grad_norm": 0.03606207296252251,
"learning_rate": 1.6416207468236208e-05,
"loss": 0.6876577377319336,
"step": 1960
},
{
"epoch": 0.31460225571414313,
"grad_norm": 0.03580459579825401,
"learning_rate": 1.6375701423090846e-05,
"loss": 0.6879127979278564,
"step": 1970
},
{
"epoch": 0.31619922147918955,
"grad_norm": 0.048258859664201736,
"learning_rate": 1.6335018349617394e-05,
"loss": 0.7255002498626709,
"step": 1980
},
{
"epoch": 0.317796187244236,
"grad_norm": 0.046948857605457306,
"learning_rate": 1.629415937742591e-05,
"loss": 0.6919849395751954,
"step": 1990
},
{
"epoch": 0.31939315300928234,
"grad_norm": 0.03351585939526558,
"learning_rate": 1.625312564101049e-05,
"loss": 0.6886905193328857,
"step": 2000
},
{
"epoch": 0.31939315300928234,
"eval_loss": 0.6917140483856201,
"eval_runtime": 1706.5142,
"eval_samples_per_second": 3.262,
"eval_steps_per_second": 1.631,
"step": 2000
},
{
"epoch": 0.32099011877432876,
"grad_norm": 0.028385179117321968,
"learning_rate": 1.621191827971772e-05,
"loss": 0.7103724956512452,
"step": 2010
},
{
"epoch": 0.3225870845393752,
"grad_norm": 0.02977878600358963,
"learning_rate": 1.617053843771509e-05,
"loss": 0.7012845516204834,
"step": 2020
},
{
"epoch": 0.3241840503044216,
"grad_norm": 0.03950609639286995,
"learning_rate": 1.6128987263959212e-05,
"loss": 0.670585298538208,
"step": 2030
},
{
"epoch": 0.325781016069468,
"grad_norm": 0.03531676530838013,
"learning_rate": 1.6087265912163898e-05,
"loss": 0.6601489543914795,
"step": 2040
},
{
"epoch": 0.32737798183451444,
"grad_norm": 0.030362443998456,
"learning_rate": 1.6045375540768136e-05,
"loss": 0.7151473522186279,
"step": 2050
},
{
"epoch": 0.3289749475995608,
"grad_norm": 0.03205285966396332,
"learning_rate": 1.600331731290395e-05,
"loss": 0.6645094394683838,
"step": 2060
},
{
"epoch": 0.33057191336460723,
"grad_norm": 0.028163282200694084,
"learning_rate": 1.5961092396364053e-05,
"loss": 0.6868838310241699,
"step": 2070
},
{
"epoch": 0.33216887912965365,
"grad_norm": 0.03369533643126488,
"learning_rate": 1.5918701963569475e-05,
"loss": 0.7029123783111573,
"step": 2080
},
{
"epoch": 0.33376584489470007,
"grad_norm": 0.0379788838326931,
"learning_rate": 1.5876147191536983e-05,
"loss": 0.7173356056213379,
"step": 2090
},
{
"epoch": 0.3353628106597465,
"grad_norm": 0.0344555526971817,
"learning_rate": 1.583342926184639e-05,
"loss": 0.7149466037750244,
"step": 2100
},
{
"epoch": 0.3369597764247929,
"grad_norm": 0.03237446770071983,
"learning_rate": 1.5790549360607772e-05,
"loss": 0.6467396259307862,
"step": 2110
},
{
"epoch": 0.33855674218983933,
"grad_norm": 0.032042454928159714,
"learning_rate": 1.5747508678428515e-05,
"loss": 0.6929362297058106,
"step": 2120
},
{
"epoch": 0.3401537079548857,
"grad_norm": 0.042478688061237335,
"learning_rate": 1.570430841038027e-05,
"loss": 0.6702353000640869,
"step": 2130
},
{
"epoch": 0.3417506737199321,
"grad_norm": 0.04559747502207756,
"learning_rate": 1.5660949755965757e-05,
"loss": 0.6600518226623535,
"step": 2140
},
{
"epoch": 0.34334763948497854,
"grad_norm": 0.03681457042694092,
"learning_rate": 1.5617433919085474e-05,
"loss": 0.6836549758911132,
"step": 2150
},
{
"epoch": 0.34494460525002496,
"grad_norm": 0.02465015836060047,
"learning_rate": 1.5573762108004262e-05,
"loss": 0.7304904460906982,
"step": 2160
},
{
"epoch": 0.3465415710150714,
"grad_norm": 0.03969291225075722,
"learning_rate": 1.5529935535317746e-05,
"loss": 0.6897444725036621,
"step": 2170
},
{
"epoch": 0.3481385367801178,
"grad_norm": 0.03233686834573746,
"learning_rate": 1.548595541791869e-05,
"loss": 0.684897518157959,
"step": 2180
},
{
"epoch": 0.34973550254516417,
"grad_norm": 0.03138101100921631,
"learning_rate": 1.544182297696319e-05,
"loss": 0.7246172428131104,
"step": 2190
},
{
"epoch": 0.3513324683102106,
"grad_norm": 0.03732126206159592,
"learning_rate": 1.5397539437836765e-05,
"loss": 0.6686253070831298,
"step": 2200
},
{
"epoch": 0.352929434075257,
"grad_norm": 0.028484191745519638,
"learning_rate": 1.535310603012035e-05,
"loss": 0.6723896503448487,
"step": 2210
},
{
"epoch": 0.35452639984030343,
"grad_norm": 0.04804018512368202,
"learning_rate": 1.5308523987556144e-05,
"loss": 0.7210356712341308,
"step": 2220
},
{
"epoch": 0.35612336560534985,
"grad_norm": 0.035007379949092865,
"learning_rate": 1.5263794548013347e-05,
"loss": 0.6699332237243653,
"step": 2230
},
{
"epoch": 0.35772033137039627,
"grad_norm": 0.034309230744838715,
"learning_rate": 1.5218918953453808e-05,
"loss": 0.6913293838500977,
"step": 2240
},
{
"epoch": 0.35931729713544264,
"grad_norm": 0.05658178776502609,
"learning_rate": 1.5173898449897527e-05,
"loss": 0.6920335292816162,
"step": 2250
},
{
"epoch": 0.36091426290048906,
"grad_norm": 0.03614016994833946,
"learning_rate": 1.5128734287388051e-05,
"loss": 0.6855293273925781,
"step": 2260
},
{
"epoch": 0.3625112286655355,
"grad_norm": 0.0348992682993412,
"learning_rate": 1.5083427719957792e-05,
"loss": 0.7068172454833984,
"step": 2270
},
{
"epoch": 0.3641081944305819,
"grad_norm": 0.037384193390607834,
"learning_rate": 1.5037980005593178e-05,
"loss": 0.7026480197906494,
"step": 2280
},
{
"epoch": 0.3657051601956283,
"grad_norm": 0.03162362426519394,
"learning_rate": 1.4992392406199741e-05,
"loss": 0.7369038581848144,
"step": 2290
},
{
"epoch": 0.36730212596067474,
"grad_norm": 0.03644828870892525,
"learning_rate": 1.4946666187567073e-05,
"loss": 0.7133200168609619,
"step": 2300
},
{
"epoch": 0.3688990917257211,
"grad_norm": 0.05075901374220848,
"learning_rate": 1.490080261933368e-05,
"loss": 0.7146055221557617,
"step": 2310
},
{
"epoch": 0.3704960574907675,
"grad_norm": 0.029153743758797646,
"learning_rate": 1.4854802974951732e-05,
"loss": 0.6756022453308106,
"step": 2320
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.030742282047867775,
"learning_rate": 1.4808668531651698e-05,
"loss": 0.6855404376983643,
"step": 2330
},
{
"epoch": 0.37368998902086037,
"grad_norm": 0.03525373339653015,
"learning_rate": 1.4762400570406889e-05,
"loss": 0.6626357078552246,
"step": 2340
},
{
"epoch": 0.3752869547859068,
"grad_norm": 0.036246221512556076,
"learning_rate": 1.4716000375897897e-05,
"loss": 0.714638090133667,
"step": 2350
},
{
"epoch": 0.3768839205509532,
"grad_norm": 0.03430463373661041,
"learning_rate": 1.4669469236476897e-05,
"loss": 0.6736037731170654,
"step": 2360
},
{
"epoch": 0.3784808863159996,
"grad_norm": 0.035327013581991196,
"learning_rate": 1.4622808444131904e-05,
"loss": 0.6907155036926269,
"step": 2370
},
{
"epoch": 0.380077852081046,
"grad_norm": 0.039804793894290924,
"learning_rate": 1.457601929445089e-05,
"loss": 0.6726024627685547,
"step": 2380
},
{
"epoch": 0.3816748178460924,
"grad_norm": 0.03689780458807945,
"learning_rate": 1.4529103086585803e-05,
"loss": 0.6932793617248535,
"step": 2390
},
{
"epoch": 0.38327178361113884,
"grad_norm": 0.03002229705452919,
"learning_rate": 1.4482061123216507e-05,
"loss": 0.6448677062988282,
"step": 2400
},
{
"epoch": 0.38486874937618526,
"grad_norm": 0.04287157952785492,
"learning_rate": 1.4434894710514596e-05,
"loss": 0.6573331356048584,
"step": 2410
},
{
"epoch": 0.3864657151412317,
"grad_norm": 0.033608682453632355,
"learning_rate": 1.4387605158107146e-05,
"loss": 0.6761981010437011,
"step": 2420
},
{
"epoch": 0.38806268090627805,
"grad_norm": 0.0340590700507164,
"learning_rate": 1.4340193779040335e-05,
"loss": 0.7119544982910156,
"step": 2430
},
{
"epoch": 0.38965964667132447,
"grad_norm": 0.04576168581843376,
"learning_rate": 1.429266188974299e-05,
"loss": 0.722407054901123,
"step": 2440
},
{
"epoch": 0.3912566124363709,
"grad_norm": 0.030462034046649933,
"learning_rate": 1.4245010809990052e-05,
"loss": 0.6741881847381592,
"step": 2450
},
{
"epoch": 0.3928535782014173,
"grad_norm": 0.024982597678899765,
"learning_rate": 1.419724186286589e-05,
"loss": 0.6923768997192383,
"step": 2460
},
{
"epoch": 0.39445054396646373,
"grad_norm": 0.028493596240878105,
"learning_rate": 1.414935637472761e-05,
"loss": 0.7135504722595215,
"step": 2470
},
{
"epoch": 0.39604750973151015,
"grad_norm": 0.03309721499681473,
"learning_rate": 1.4101355675168197e-05,
"loss": 0.6870269298553466,
"step": 2480
},
{
"epoch": 0.3976444754965565,
"grad_norm": 0.033349502831697464,
"learning_rate": 1.4053241096979611e-05,
"loss": 0.7310259819030762,
"step": 2490
},
{
"epoch": 0.39924144126160294,
"grad_norm": 0.0456295944750309,
"learning_rate": 1.4005013976115777e-05,
"loss": 0.6850334644317627,
"step": 2500
},
{
"epoch": 0.39924144126160294,
"eval_loss": 0.6847082376480103,
"eval_runtime": 1727.6141,
"eval_samples_per_second": 3.222,
"eval_steps_per_second": 1.611,
"step": 2500
},
{
"epoch": 0.40083840702664936,
"grad_norm": 0.04150133952498436,
"learning_rate": 1.3956675651655489e-05,
"loss": 0.6748110771179199,
"step": 2510
},
{
"epoch": 0.4024353727916958,
"grad_norm": 0.05328802019357681,
"learning_rate": 1.390822746576523e-05,
"loss": 0.6966916561126709,
"step": 2520
},
{
"epoch": 0.4040323385567422,
"grad_norm": 0.04510056599974632,
"learning_rate": 1.3859670763661908e-05,
"loss": 0.6688936710357666,
"step": 2530
},
{
"epoch": 0.4056293043217886,
"grad_norm": 0.03648371994495392,
"learning_rate": 1.38110068935755e-05,
"loss": 0.6503877162933349,
"step": 2540
},
{
"epoch": 0.40722627008683504,
"grad_norm": 0.03958805277943611,
"learning_rate": 1.3762237206711624e-05,
"loss": 0.6709461212158203,
"step": 2550
},
{
"epoch": 0.4088232358518814,
"grad_norm": 0.03620138764381409,
"learning_rate": 1.3713363057214006e-05,
"loss": 0.6775450706481934,
"step": 2560
},
{
"epoch": 0.4104202016169278,
"grad_norm": 0.036233801394701004,
"learning_rate": 1.3664385802126904e-05,
"loss": 0.7039198875427246,
"step": 2570
},
{
"epoch": 0.41201716738197425,
"grad_norm": 0.034267205744981766,
"learning_rate": 1.3615306801357413e-05,
"loss": 0.6893480777740478,
"step": 2580
},
{
"epoch": 0.41361413314702067,
"grad_norm": 0.04056892171502113,
"learning_rate": 1.3566127417637698e-05,
"loss": 0.6852858543395997,
"step": 2590
},
{
"epoch": 0.4152110989120671,
"grad_norm": 0.04654235392808914,
"learning_rate": 1.351684901648718e-05,
"loss": 0.6932817459106445,
"step": 2600
},
{
"epoch": 0.4168080646771135,
"grad_norm": 0.05177690088748932,
"learning_rate": 1.3467472966174595e-05,
"loss": 0.663325309753418,
"step": 2610
},
{
"epoch": 0.4184050304421599,
"grad_norm": 0.04320165514945984,
"learning_rate": 1.3418000637680026e-05,
"loss": 0.6278695106506348,
"step": 2620
},
{
"epoch": 0.4200019962072063,
"grad_norm": 0.0365922674536705,
"learning_rate": 1.336843340465682e-05,
"loss": 0.6674781799316406,
"step": 2630
},
{
"epoch": 0.4215989619722527,
"grad_norm": 0.03406934812664986,
"learning_rate": 1.3318772643393447e-05,
"loss": 0.6559438705444336,
"step": 2640
},
{
"epoch": 0.42319592773729914,
"grad_norm": 0.030784646049141884,
"learning_rate": 1.3269019732775304e-05,
"loss": 0.7225832462310791,
"step": 2650
},
{
"epoch": 0.42479289350234556,
"grad_norm": 0.0360528789460659,
"learning_rate": 1.3219176054246404e-05,
"loss": 0.7170959949493408,
"step": 2660
},
{
"epoch": 0.426389859267392,
"grad_norm": 0.0355147160589695,
"learning_rate": 1.316924299177103e-05,
"loss": 0.683486795425415,
"step": 2670
},
{
"epoch": 0.42798682503243834,
"grad_norm": 0.046759720891714096,
"learning_rate": 1.3119221931795321e-05,
"loss": 0.7131591320037842,
"step": 2680
},
{
"epoch": 0.42958379079748477,
"grad_norm": 0.03554374352097511,
"learning_rate": 1.3069114263208746e-05,
"loss": 0.6715425491333008,
"step": 2690
},
{
"epoch": 0.4311807565625312,
"grad_norm": 0.04263368248939514,
"learning_rate": 1.3018921377305574e-05,
"loss": 0.6871604442596435,
"step": 2700
},
{
"epoch": 0.4327777223275776,
"grad_norm": 0.04496333748102188,
"learning_rate": 1.2968644667746207e-05,
"loss": 0.6588253021240235,
"step": 2710
},
{
"epoch": 0.43437468809262403,
"grad_norm": 0.036086685955524445,
"learning_rate": 1.2918285530518519e-05,
"loss": 0.6757514953613282,
"step": 2720
},
{
"epoch": 0.43597165385767045,
"grad_norm": 0.03308691456913948,
"learning_rate": 1.2867845363899075e-05,
"loss": 0.6658079624176025,
"step": 2730
},
{
"epoch": 0.4375686196227168,
"grad_norm": 0.04500787332653999,
"learning_rate": 1.2817325568414299e-05,
"loss": 0.6812032222747803,
"step": 2740
},
{
"epoch": 0.43916558538776324,
"grad_norm": 0.03841916844248772,
"learning_rate": 1.2766727546801612e-05,
"loss": 0.6864322662353516,
"step": 2750
},
{
"epoch": 0.44076255115280966,
"grad_norm": 0.037914253771305084,
"learning_rate": 1.2716052703970459e-05,
"loss": 0.7053091526031494,
"step": 2760
},
{
"epoch": 0.4423595169178561,
"grad_norm": 0.03686266764998436,
"learning_rate": 1.2665302446963312e-05,
"loss": 0.7154440879821777,
"step": 2770
},
{
"epoch": 0.4439564826829025,
"grad_norm": 0.04227980226278305,
"learning_rate": 1.261447818491661e-05,
"loss": 0.716032886505127,
"step": 2780
},
{
"epoch": 0.4455534484479489,
"grad_norm": 0.04683419317007065,
"learning_rate": 1.2563581329021608e-05,
"loss": 0.6738180637359619,
"step": 2790
},
{
"epoch": 0.4471504142129953,
"grad_norm": 0.05264711007475853,
"learning_rate": 1.2512613292485218e-05,
"loss": 0.7129797458648681,
"step": 2800
},
{
"epoch": 0.4487473799780417,
"grad_norm": 0.04030587151646614,
"learning_rate": 1.2461575490490753e-05,
"loss": 0.727331018447876,
"step": 2810
},
{
"epoch": 0.4503443457430881,
"grad_norm": 0.04141924902796745,
"learning_rate": 1.2410469340158655e-05,
"loss": 0.676334810256958,
"step": 2820
},
{
"epoch": 0.45194131150813455,
"grad_norm": 0.04476945474743843,
"learning_rate": 1.2359296260507117e-05,
"loss": 0.6823254108428956,
"step": 2830
},
{
"epoch": 0.45353827727318097,
"grad_norm": 0.03869614750146866,
"learning_rate": 1.23080576724127e-05,
"loss": 0.6882941722869873,
"step": 2840
},
{
"epoch": 0.4551352430382274,
"grad_norm": 0.031092161312699318,
"learning_rate": 1.2256754998570887e-05,
"loss": 0.6744899272918701,
"step": 2850
},
{
"epoch": 0.45673220880327375,
"grad_norm": 0.03507015109062195,
"learning_rate": 1.2205389663456566e-05,
"loss": 0.7103989124298096,
"step": 2860
},
{
"epoch": 0.4583291745683202,
"grad_norm": 0.032851576805114746,
"learning_rate": 1.2153963093284483e-05,
"loss": 0.7116181373596191,
"step": 2870
},
{
"epoch": 0.4599261403333666,
"grad_norm": 0.05045435577630997,
"learning_rate": 1.210247671596965e-05,
"loss": 0.6489015579223633,
"step": 2880
},
{
"epoch": 0.461523106098413,
"grad_norm": 0.032028377056121826,
"learning_rate": 1.2050931961087673e-05,
"loss": 0.6941986560821534,
"step": 2890
},
{
"epoch": 0.46312007186345944,
"grad_norm": 0.038908179849386215,
"learning_rate": 1.1999330259835096e-05,
"loss": 0.6908363342285156,
"step": 2900
},
{
"epoch": 0.46471703762850586,
"grad_norm": 0.03464297205209732,
"learning_rate": 1.1947673044989627e-05,
"loss": 0.7050665378570556,
"step": 2910
},
{
"epoch": 0.4663140033935522,
"grad_norm": 0.028774775564670563,
"learning_rate": 1.1895961750870375e-05,
"loss": 0.6700276851654052,
"step": 2920
},
{
"epoch": 0.46791096915859864,
"grad_norm": 0.03465382754802704,
"learning_rate": 1.1844197813298018e-05,
"loss": 0.6821481227874756,
"step": 2930
},
{
"epoch": 0.46950793492364506,
"grad_norm": 0.03162831813097,
"learning_rate": 1.1792382669554938e-05,
"loss": 0.6927790164947509,
"step": 2940
},
{
"epoch": 0.4711049006886915,
"grad_norm": 0.03354581072926521,
"learning_rate": 1.1740517758345312e-05,
"loss": 0.7100222110748291,
"step": 2950
},
{
"epoch": 0.4727018664537379,
"grad_norm": 0.033408552408218384,
"learning_rate": 1.168860451975516e-05,
"loss": 0.6876598358154297,
"step": 2960
},
{
"epoch": 0.4742988322187843,
"grad_norm": 0.04334475100040436,
"learning_rate": 1.1636644395212375e-05,
"loss": 0.6866564273834228,
"step": 2970
},
{
"epoch": 0.47589579798383075,
"grad_norm": 0.040464501827955246,
"learning_rate": 1.1584638827446684e-05,
"loss": 0.6588007926940918,
"step": 2980
},
{
"epoch": 0.4774927637488771,
"grad_norm": 0.04957546666264534,
"learning_rate": 1.1532589260449589e-05,
"loss": 0.7070925235748291,
"step": 2990
},
{
"epoch": 0.47908972951392353,
"grad_norm": 0.038624707609415054,
"learning_rate": 1.1480497139434294e-05,
"loss": 0.6919936180114746,
"step": 3000
},
{
"epoch": 0.47908972951392353,
"eval_loss": 0.6803271770477295,
"eval_runtime": 1741.9445,
"eval_samples_per_second": 3.195,
"eval_steps_per_second": 1.598,
"step": 3000
},
{
"epoch": 0.48068669527896996,
"grad_norm": 0.04100416228175163,
"learning_rate": 1.1428363910795547e-05,
"loss": 0.6852492809295654,
"step": 3010
},
{
"epoch": 0.4822836610440164,
"grad_norm": 0.033158283680677414,
"learning_rate": 1.1376191022069501e-05,
"loss": 0.667516565322876,
"step": 3020
},
{
"epoch": 0.4838806268090628,
"grad_norm": 0.03429755941033363,
"learning_rate": 1.1323979921893514e-05,
"loss": 0.6956333160400391,
"step": 3030
},
{
"epoch": 0.4854775925741092,
"grad_norm": 0.038232944905757904,
"learning_rate": 1.1271732059965925e-05,
"loss": 0.6716940402984619,
"step": 3040
},
{
"epoch": 0.4870745583391556,
"grad_norm": 0.039248283952474594,
"learning_rate": 1.1219448887005805e-05,
"loss": 0.6984889507293701,
"step": 3050
},
{
"epoch": 0.488671524104202,
"grad_norm": 0.03670873865485191,
"learning_rate": 1.1167131854712676e-05,
"loss": 0.6709868907928467,
"step": 3060
},
{
"epoch": 0.4902684898692484,
"grad_norm": 0.038310978561639786,
"learning_rate": 1.11147824157262e-05,
"loss": 0.6672838211059571,
"step": 3070
},
{
"epoch": 0.49186545563429485,
"grad_norm": 0.049189358949661255,
"learning_rate": 1.1062402023585846e-05,
"loss": 0.6705825805664063,
"step": 3080
},
{
"epoch": 0.49346242139934127,
"grad_norm": 0.0355689600110054,
"learning_rate": 1.1009992132690523e-05,
"loss": 0.6964725017547607,
"step": 3090
},
{
"epoch": 0.4950593871643877,
"grad_norm": 0.03348139300942421,
"learning_rate": 1.0957554198258225e-05,
"loss": 0.6852751255035401,
"step": 3100
},
{
"epoch": 0.49665635292943405,
"grad_norm": 0.03575340285897255,
"learning_rate": 1.0905089676285588e-05,
"loss": 0.670794153213501,
"step": 3110
},
{
"epoch": 0.4982533186944805,
"grad_norm": 0.04119575023651123,
"learning_rate": 1.0852600023507482e-05,
"loss": 0.6651177406311035,
"step": 3120
},
{
"epoch": 0.4998502844595269,
"grad_norm": 0.04122074693441391,
"learning_rate": 1.0800086697356576e-05,
"loss": 0.6976897239685058,
"step": 3130
},
{
"epoch": 0.5014472502245733,
"grad_norm": 0.034223176538944244,
"learning_rate": 1.0747551155922836e-05,
"loss": 0.6890422821044921,
"step": 3140
},
{
"epoch": 0.5030442159896197,
"grad_norm": 0.03588424623012543,
"learning_rate": 1.069499485791307e-05,
"loss": 0.702085018157959,
"step": 3150
},
{
"epoch": 0.5046411817546661,
"grad_norm": 0.03808250650763512,
"learning_rate": 1.0642419262610417e-05,
"loss": 0.6910345554351807,
"step": 3160
},
{
"epoch": 0.5062381475197125,
"grad_norm": 0.03852913901209831,
"learning_rate": 1.0589825829833825e-05,
"loss": 0.6685952186584473,
"step": 3170
},
{
"epoch": 0.5078351132847589,
"grad_norm": 0.04528075084090233,
"learning_rate": 1.0537216019897514e-05,
"loss": 0.6971333980560303,
"step": 3180
},
{
"epoch": 0.5094320790498054,
"grad_norm": 0.048296891152858734,
"learning_rate": 1.0484591293570437e-05,
"loss": 0.6506364345550537,
"step": 3190
},
{
"epoch": 0.5110290448148518,
"grad_norm": 0.03280564025044441,
"learning_rate": 1.0431953112035722e-05,
"loss": 0.7106664657592774,
"step": 3200
},
{
"epoch": 0.5126260105798982,
"grad_norm": 0.03674250841140747,
"learning_rate": 1.0379302936850083e-05,
"loss": 0.6798049449920655,
"step": 3210
},
{
"epoch": 0.5142229763449446,
"grad_norm": 0.02851015329360962,
"learning_rate": 1.0326642229903265e-05,
"loss": 0.6674058914184571,
"step": 3220
},
{
"epoch": 0.515819942109991,
"grad_norm": 0.04876931011676788,
"learning_rate": 1.0273972453377433e-05,
"loss": 0.6956850051879883,
"step": 3230
},
{
"epoch": 0.5174169078750375,
"grad_norm": 0.04633413255214691,
"learning_rate": 1.022129506970657e-05,
"loss": 0.6859352111816406,
"step": 3240
},
{
"epoch": 0.5190138736400839,
"grad_norm": 0.03311692923307419,
"learning_rate": 1.0168611541535898e-05,
"loss": 0.7103584289550782,
"step": 3250
},
{
"epoch": 0.5206108394051302,
"grad_norm": 0.047599345445632935,
"learning_rate": 1.0115923331681231e-05,
"loss": 0.6435458183288574,
"step": 3260
},
{
"epoch": 0.5222078051701766,
"grad_norm": 0.033676277846097946,
"learning_rate": 1.0063231903088384e-05,
"loss": 0.6581607341766358,
"step": 3270
},
{
"epoch": 0.523804770935223,
"grad_norm": 0.04461989924311638,
"learning_rate": 1.0010538718792544e-05,
"loss": 0.7152835369110108,
"step": 3280
},
{
"epoch": 0.5254017367002695,
"grad_norm": 0.04492766410112381,
"learning_rate": 9.957845241877639e-06,
"loss": 0.7056149482727051,
"step": 3290
},
{
"epoch": 0.5269987024653159,
"grad_norm": 0.02856948785483837,
"learning_rate": 9.905152935435733e-06,
"loss": 0.6655986785888672,
"step": 3300
},
{
"epoch": 0.5285956682303623,
"grad_norm": 0.0438731387257576,
"learning_rate": 9.852463262526383e-06,
"loss": 0.7045872688293457,
"step": 3310
},
{
"epoch": 0.5301926339954087,
"grad_norm": 0.04025017097592354,
"learning_rate": 9.799777686136025e-06,
"loss": 0.6844919204711915,
"step": 3320
},
{
"epoch": 0.5317895997604551,
"grad_norm": 0.04453453794121742,
"learning_rate": 9.747097669137352e-06,
"loss": 0.6857856750488281,
"step": 3330
},
{
"epoch": 0.5333865655255016,
"grad_norm": 0.034065935760736465,
"learning_rate": 9.694424674248687e-06,
"loss": 0.6782355785369873,
"step": 3340
},
{
"epoch": 0.534983531290548,
"grad_norm": 0.03808142989873886,
"learning_rate": 9.641760163993394e-06,
"loss": 0.7040555000305175,
"step": 3350
},
{
"epoch": 0.5365804970555944,
"grad_norm": 0.03482348844408989,
"learning_rate": 9.58910560065923e-06,
"loss": 0.6730469226837158,
"step": 3360
},
{
"epoch": 0.5381774628206408,
"grad_norm": 0.04541369527578354,
"learning_rate": 9.536462446257777e-06,
"loss": 0.7102465152740478,
"step": 3370
},
{
"epoch": 0.5397744285856872,
"grad_norm": 0.043802615255117416,
"learning_rate": 9.483832162483841e-06,
"loss": 0.6941490173339844,
"step": 3380
},
{
"epoch": 0.5413713943507336,
"grad_norm": 0.03652375563979149,
"learning_rate": 9.431216210674849e-06,
"loss": 0.704205322265625,
"step": 3390
},
{
"epoch": 0.54296836011578,
"grad_norm": 0.03886988013982773,
"learning_rate": 9.378616051770287e-06,
"loss": 0.6582465171813965,
"step": 3400
},
{
"epoch": 0.5445653258808264,
"grad_norm": 0.044551681727170944,
"learning_rate": 9.326033146271144e-06,
"loss": 0.7078223705291748,
"step": 3410
},
{
"epoch": 0.5461622916458728,
"grad_norm": 0.03762778267264366,
"learning_rate": 9.273468954199333e-06,
"loss": 0.6956658840179444,
"step": 3420
},
{
"epoch": 0.5477592574109192,
"grad_norm": 0.03794670104980469,
"learning_rate": 9.220924935057184e-06,
"loss": 0.6778861522674561,
"step": 3430
},
{
"epoch": 0.5493562231759657,
"grad_norm": 0.03766616806387901,
"learning_rate": 9.168402547786897e-06,
"loss": 0.6759885311126709,
"step": 3440
},
{
"epoch": 0.5509531889410121,
"grad_norm": 0.04172630235552788,
"learning_rate": 9.115903250730035e-06,
"loss": 0.6739662170410157,
"step": 3450
},
{
"epoch": 0.5525501547060585,
"grad_norm": 0.04936975613236427,
"learning_rate": 9.063428501587043e-06,
"loss": 0.6581857681274415,
"step": 3460
},
{
"epoch": 0.5541471204711049,
"grad_norm": 0.029507510364055634,
"learning_rate": 9.010979757376766e-06,
"loss": 0.6508080005645752,
"step": 3470
},
{
"epoch": 0.5557440862361513,
"grad_norm": 0.03853528946638107,
"learning_rate": 8.958558474395987e-06,
"loss": 0.6520394325256348,
"step": 3480
},
{
"epoch": 0.5573410520011978,
"grad_norm": 0.04168470576405525,
"learning_rate": 8.906166108178999e-06,
"loss": 0.6652052402496338,
"step": 3490
},
{
"epoch": 0.5589380177662442,
"grad_norm": 0.05398240312933922,
"learning_rate": 8.853804113457203e-06,
"loss": 0.6777332782745361,
"step": 3500
},
{
"epoch": 0.5589380177662442,
"eval_loss": 0.677163302898407,
"eval_runtime": 1733.3138,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 1.606,
"step": 3500
},
{
"epoch": 0.5605349835312905,
"grad_norm": 0.03181201219558716,
"learning_rate": 8.801473944118683e-06,
"loss": 0.687204122543335,
"step": 3510
},
{
"epoch": 0.5621319492963369,
"grad_norm": 0.03610098734498024,
"learning_rate": 8.749177053167865e-06,
"loss": 0.7028386116027832,
"step": 3520
},
{
"epoch": 0.5637289150613833,
"grad_norm": 0.0450638048350811,
"learning_rate": 8.696914892685172e-06,
"loss": 0.6494070529937744,
"step": 3530
},
{
"epoch": 0.5653258808264298,
"grad_norm": 0.053478166460990906,
"learning_rate": 8.644688913786678e-06,
"loss": 0.6962610721588135,
"step": 3540
},
{
"epoch": 0.5669228465914762,
"grad_norm": 0.03924020007252693,
"learning_rate": 8.592500566583856e-06,
"loss": 0.6941705703735351,
"step": 3550
},
{
"epoch": 0.5685198123565226,
"grad_norm": 0.037506818771362305,
"learning_rate": 8.540351300143284e-06,
"loss": 0.6824637413024902,
"step": 3560
},
{
"epoch": 0.570116778121569,
"grad_norm": 0.03883660212159157,
"learning_rate": 8.488242562446416e-06,
"loss": 0.7028771877288819,
"step": 3570
},
{
"epoch": 0.5717137438866154,
"grad_norm": 0.047728635370731354,
"learning_rate": 8.436175800349393e-06,
"loss": 0.7102296829223633,
"step": 3580
},
{
"epoch": 0.5733107096516619,
"grad_norm": 0.0423489511013031,
"learning_rate": 8.384152459542849e-06,
"loss": 0.7113842487335205,
"step": 3590
},
{
"epoch": 0.5749076754167083,
"grad_norm": 0.034012071788311005,
"learning_rate": 8.332173984511774e-06,
"loss": 0.6995905876159668,
"step": 3600
},
{
"epoch": 0.5765046411817547,
"grad_norm": 0.02976168505847454,
"learning_rate": 8.280241818495423e-06,
"loss": 0.7066914558410644,
"step": 3610
},
{
"epoch": 0.5781016069468011,
"grad_norm": 0.04333082213997841,
"learning_rate": 8.228357403447225e-06,
"loss": 0.6559367656707764,
"step": 3620
},
{
"epoch": 0.5796985727118474,
"grad_norm": 0.0726407989859581,
"learning_rate": 8.176522179994744e-06,
"loss": 0.691002082824707,
"step": 3630
},
{
"epoch": 0.5812955384768939,
"grad_norm": 0.035537876188755035,
"learning_rate": 8.124737587399697e-06,
"loss": 0.7173010349273682,
"step": 3640
},
{
"epoch": 0.5828925042419403,
"grad_norm": 0.036615390330553055,
"learning_rate": 8.073005063517973e-06,
"loss": 0.712274169921875,
"step": 3650
},
{
"epoch": 0.5844894700069867,
"grad_norm": 0.02762191742658615,
"learning_rate": 8.021326044759718e-06,
"loss": 0.7043851852416992,
"step": 3660
},
{
"epoch": 0.5860864357720331,
"grad_norm": 0.033354733139276505,
"learning_rate": 7.969701966049447e-06,
"loss": 0.6736726284027099,
"step": 3670
},
{
"epoch": 0.5876834015370795,
"grad_norm": 0.04251250997185707,
"learning_rate": 7.918134260786214e-06,
"loss": 0.679281759262085,
"step": 3680
},
{
"epoch": 0.589280367302126,
"grad_norm": 0.03284266218543053,
"learning_rate": 7.866624360803786e-06,
"loss": 0.6851751804351807,
"step": 3690
},
{
"epoch": 0.5908773330671724,
"grad_norm": 0.050126783549785614,
"learning_rate": 7.81517369633092e-06,
"loss": 0.6820148468017578,
"step": 3700
},
{
"epoch": 0.5924742988322188,
"grad_norm": 0.042198847979307175,
"learning_rate": 7.763783695951623e-06,
"loss": 0.6824104309082031,
"step": 3710
},
{
"epoch": 0.5940712645972652,
"grad_norm": 0.03443041443824768,
"learning_rate": 7.7124557865655e-06,
"loss": 0.670203685760498,
"step": 3720
},
{
"epoch": 0.5956682303623116,
"grad_norm": 0.036490943282842636,
"learning_rate": 7.661191393348136e-06,
"loss": 0.7197192192077637,
"step": 3730
},
{
"epoch": 0.5972651961273581,
"grad_norm": 0.03636472299695015,
"learning_rate": 7.609991939711517e-06,
"loss": 0.6888130187988282,
"step": 3740
},
{
"epoch": 0.5988621618924044,
"grad_norm": 0.040862370282411575,
"learning_rate": 7.558858847264502e-06,
"loss": 0.6817411422729492,
"step": 3750
},
{
"epoch": 0.6004591276574508,
"grad_norm": 0.036771345883607864,
"learning_rate": 7.507793535773377e-06,
"loss": 0.7020951271057129,
"step": 3760
},
{
"epoch": 0.6020560934224972,
"grad_norm": 0.036549679934978485,
"learning_rate": 7.456797423122401e-06,
"loss": 0.6926814079284668,
"step": 3770
},
{
"epoch": 0.6036530591875436,
"grad_norm": 0.03661128878593445,
"learning_rate": 7.405871925274457e-06,
"loss": 0.7070647716522217,
"step": 3780
},
{
"epoch": 0.6052500249525901,
"grad_norm": 0.05932987853884697,
"learning_rate": 7.3550184562317295e-06,
"loss": 0.7125431060791015,
"step": 3790
},
{
"epoch": 0.6068469907176365,
"grad_norm": 0.03799287602305412,
"learning_rate": 7.3042384279964394e-06,
"loss": 0.6607150554656982,
"step": 3800
},
{
"epoch": 0.6084439564826829,
"grad_norm": 0.031429167836904526,
"learning_rate": 7.253533250531656e-06,
"loss": 0.6806385517120361,
"step": 3810
},
{
"epoch": 0.6100409222477293,
"grad_norm": 0.05243317410349846,
"learning_rate": 7.202904331722127e-06,
"loss": 0.6834061145782471,
"step": 3820
},
{
"epoch": 0.6116378880127757,
"grad_norm": 0.0379512719810009,
"learning_rate": 7.152353077335189e-06,
"loss": 0.665333604812622,
"step": 3830
},
{
"epoch": 0.6132348537778222,
"grad_norm": 0.03895451873540878,
"learning_rate": 7.101880890981751e-06,
"loss": 0.6566737651824951,
"step": 3840
},
{
"epoch": 0.6148318195428686,
"grad_norm": 0.03916362673044205,
"learning_rate": 7.0514891740773105e-06,
"loss": 0.7027400970458985,
"step": 3850
},
{
"epoch": 0.616428785307915,
"grad_norm": 0.03747180104255676,
"learning_rate": 7.001179325803037e-06,
"loss": 0.6675007820129395,
"step": 3860
},
{
"epoch": 0.6180257510729614,
"grad_norm": 0.04694506898522377,
"learning_rate": 6.95095274306693e-06,
"loss": 0.6993120193481446,
"step": 3870
},
{
"epoch": 0.6196227168380077,
"grad_norm": 0.035580579191446304,
"learning_rate": 6.900810820465036e-06,
"loss": 0.7056044578552246,
"step": 3880
},
{
"epoch": 0.6212196826030542,
"grad_norm": 0.03560471907258034,
"learning_rate": 6.8507549502427105e-06,
"loss": 0.7067145824432373,
"step": 3890
},
{
"epoch": 0.6228166483681006,
"grad_norm": 0.03395050764083862,
"learning_rate": 6.800786522255974e-06,
"loss": 0.6799113750457764,
"step": 3900
},
{
"epoch": 0.624413614133147,
"grad_norm": 0.04874083772301674,
"learning_rate": 6.7509069239329295e-06,
"loss": 0.6687004089355468,
"step": 3910
},
{
"epoch": 0.6260105798981934,
"grad_norm": 0.03185804560780525,
"learning_rate": 6.701117540235204e-06,
"loss": 0.6309501171112061,
"step": 3920
},
{
"epoch": 0.6276075456632398,
"grad_norm": 0.04211097210645676,
"learning_rate": 6.651419753619535e-06,
"loss": 0.7135450839996338,
"step": 3930
},
{
"epoch": 0.6292045114282863,
"grad_norm": 0.03911832720041275,
"learning_rate": 6.601814943999363e-06,
"loss": 0.7103962421417236,
"step": 3940
},
{
"epoch": 0.6308014771933327,
"grad_norm": 0.033491455018520355,
"learning_rate": 6.552304488706512e-06,
"loss": 0.7135549068450928,
"step": 3950
},
{
"epoch": 0.6323984429583791,
"grad_norm": 0.031158218160271645,
"learning_rate": 6.502889762452969e-06,
"loss": 0.6667858600616455,
"step": 3960
},
{
"epoch": 0.6339954087234255,
"grad_norm": 0.037267692387104034,
"learning_rate": 6.453572137292689e-06,
"loss": 0.7077006816864013,
"step": 3970
},
{
"epoch": 0.635592374488472,
"grad_norm": 0.03646966442465782,
"learning_rate": 6.404352982583511e-06,
"loss": 0.663360595703125,
"step": 3980
},
{
"epoch": 0.6371893402535184,
"grad_norm": 0.0357484444975853,
"learning_rate": 6.355233664949132e-06,
"loss": 0.6508517742156983,
"step": 3990
},
{
"epoch": 0.6387863060185647,
"grad_norm": 0.03508693352341652,
"learning_rate": 6.306215548241174e-06,
"loss": 0.669168758392334,
"step": 4000
},
{
"epoch": 0.6387863060185647,
"eval_loss": 0.6750220060348511,
"eval_runtime": 1753.5267,
"eval_samples_per_second": 3.174,
"eval_steps_per_second": 1.587,
"step": 4000
},
{
"epoch": 0.6403832717836111,
"grad_norm": 0.03497602418065071,
"learning_rate": 6.257299993501289e-06,
"loss": 0.6791082859039307,
"step": 4010
},
{
"epoch": 0.6419802375486575,
"grad_norm": 0.04195614159107208,
"learning_rate": 6.208488358923393e-06,
"loss": 0.6831603050231934,
"step": 4020
},
{
"epoch": 0.6435772033137039,
"grad_norm": 0.049627698957920074,
"learning_rate": 6.1597819998159506e-06,
"loss": 0.7127910137176514,
"step": 4030
},
{
"epoch": 0.6451741690787504,
"grad_norm": 0.03807014226913452,
"learning_rate": 6.111182268564322e-06,
"loss": 0.6687079429626465,
"step": 4040
},
{
"epoch": 0.6467711348437968,
"grad_norm": 0.0430082343518734,
"learning_rate": 6.062690514593237e-06,
"loss": 0.6742999076843261,
"step": 4050
},
{
"epoch": 0.6483681006088432,
"grad_norm": 0.032011304050683975,
"learning_rate": 6.014308084329327e-06,
"loss": 0.693959903717041,
"step": 4060
},
{
"epoch": 0.6499650663738896,
"grad_norm": 0.0382765531539917,
"learning_rate": 5.966036321163709e-06,
"loss": 0.6869026184082031,
"step": 4070
},
{
"epoch": 0.651562032138936,
"grad_norm": 0.038456711918115616,
"learning_rate": 5.917876565414729e-06,
"loss": 0.6802726268768311,
"step": 4080
},
{
"epoch": 0.6531589979039825,
"grad_norm": 0.05505223199725151,
"learning_rate": 5.8698301542907144e-06,
"loss": 0.6540178775787353,
"step": 4090
},
{
"epoch": 0.6547559636690289,
"grad_norm": 0.06022993102669716,
"learning_rate": 5.8218984218528514e-06,
"loss": 0.6731237888336181,
"step": 4100
},
{
"epoch": 0.6563529294340753,
"grad_norm": 0.04032977297902107,
"learning_rate": 5.774082698978154e-06,
"loss": 0.7120064258575439,
"step": 4110
},
{
"epoch": 0.6579498951991216,
"grad_norm": 0.04511059820652008,
"learning_rate": 5.726384313322504e-06,
"loss": 0.6774604797363282,
"step": 4120
},
{
"epoch": 0.659546860964168,
"grad_norm": 0.04290354251861572,
"learning_rate": 5.678804589283777e-06,
"loss": 0.6660148143768311,
"step": 4130
},
{
"epoch": 0.6611438267292145,
"grad_norm": 0.03994971513748169,
"learning_rate": 5.6313448479650944e-06,
"loss": 0.6876091957092285,
"step": 4140
},
{
"epoch": 0.6627407924942609,
"grad_norm": 0.03699030727148056,
"learning_rate": 5.584006407138114e-06,
"loss": 0.6590198516845703,
"step": 4150
},
{
"epoch": 0.6643377582593073,
"grad_norm": 0.041105858981609344,
"learning_rate": 5.536790581206451e-06,
"loss": 0.6922510147094727,
"step": 4160
},
{
"epoch": 0.6659347240243537,
"grad_norm": 0.036437924951314926,
"learning_rate": 5.489698681169196e-06,
"loss": 0.7056612968444824,
"step": 4170
},
{
"epoch": 0.6675316897894001,
"grad_norm": 0.03376461938023567,
"learning_rate": 5.442732014584495e-06,
"loss": 0.6548415184020996,
"step": 4180
},
{
"epoch": 0.6691286555544466,
"grad_norm": 0.042223311960697174,
"learning_rate": 5.395891885533244e-06,
"loss": 0.6739361763000489,
"step": 4190
},
{
"epoch": 0.670725621319493,
"grad_norm": 0.03633170202374458,
"learning_rate": 5.349179594582898e-06,
"loss": 0.7088988304138184,
"step": 4200
},
{
"epoch": 0.6723225870845394,
"grad_norm": 0.037932299077510834,
"learning_rate": 5.302596438751339e-06,
"loss": 0.693552827835083,
"step": 4210
},
{
"epoch": 0.6739195528495858,
"grad_norm": 0.03990362584590912,
"learning_rate": 5.256143711470869e-06,
"loss": 0.6921378135681152,
"step": 4220
},
{
"epoch": 0.6755165186146322,
"grad_norm": 0.037905026227235794,
"learning_rate": 5.20982270255231e-06,
"loss": 0.6850985050201416,
"step": 4230
},
{
"epoch": 0.6771134843796787,
"grad_norm": 0.0379810705780983,
"learning_rate": 5.163634698149163e-06,
"loss": 0.6650650978088379,
"step": 4240
},
{
"epoch": 0.678710450144725,
"grad_norm": 0.02918444201350212,
"learning_rate": 5.11758098072193e-06,
"loss": 0.6373987674713135,
"step": 4250
},
{
"epoch": 0.6803074159097714,
"grad_norm": 0.03856576979160309,
"learning_rate": 5.071662829002477e-06,
"loss": 0.7125391483306884,
"step": 4260
},
{
"epoch": 0.6819043816748178,
"grad_norm": 0.03352683037519455,
"learning_rate": 5.02588151795854e-06,
"loss": 0.6727629661560058,
"step": 4270
},
{
"epoch": 0.6835013474398642,
"grad_norm": 0.04891199618577957,
"learning_rate": 4.98023831875833e-06,
"loss": 0.722879695892334,
"step": 4280
},
{
"epoch": 0.6850983132049107,
"grad_norm": 0.04167689383029938,
"learning_rate": 4.9347344987352305e-06,
"loss": 0.6740260124206543,
"step": 4290
},
{
"epoch": 0.6866952789699571,
"grad_norm": 0.03634532913565636,
"learning_rate": 4.889371321352607e-06,
"loss": 0.649868106842041,
"step": 4300
},
{
"epoch": 0.6882922447350035,
"grad_norm": 0.04563901200890541,
"learning_rate": 4.844150046168722e-06,
"loss": 0.6745593547821045,
"step": 4310
},
{
"epoch": 0.6898892105000499,
"grad_norm": 0.04015154391527176,
"learning_rate": 4.799071928801784e-06,
"loss": 0.7058079719543457,
"step": 4320
},
{
"epoch": 0.6914861762650963,
"grad_norm": 0.03595186024904251,
"learning_rate": 4.7541382208950505e-06,
"loss": 0.6532906532287598,
"step": 4330
},
{
"epoch": 0.6930831420301428,
"grad_norm": 0.030780136585235596,
"learning_rate": 4.709350170082103e-06,
"loss": 0.6842369079589844,
"step": 4340
},
{
"epoch": 0.6946801077951892,
"grad_norm": 0.04692146182060242,
"learning_rate": 4.6647090199522025e-06,
"loss": 0.7000434398651123,
"step": 4350
},
{
"epoch": 0.6962770735602356,
"grad_norm": 0.0310053788125515,
"learning_rate": 4.620216010015725e-06,
"loss": 0.6727443218231202,
"step": 4360
},
{
"epoch": 0.6978740393252819,
"grad_norm": 0.03317731246352196,
"learning_rate": 4.575872375669793e-06,
"loss": 0.6653543949127197,
"step": 4370
},
{
"epoch": 0.6994710050903283,
"grad_norm": 0.05431032553315163,
"learning_rate": 4.531679348163955e-06,
"loss": 0.691961145401001,
"step": 4380
},
{
"epoch": 0.7010679708553748,
"grad_norm": 0.037718210369348526,
"learning_rate": 4.487638154565978e-06,
"loss": 0.6595858573913574,
"step": 4390
},
{
"epoch": 0.7026649366204212,
"grad_norm": 0.03125704079866409,
"learning_rate": 4.443750017727814e-06,
"loss": 0.7066734790802002,
"step": 4400
},
{
"epoch": 0.7042619023854676,
"grad_norm": 0.050476983189582825,
"learning_rate": 4.400016156251611e-06,
"loss": 0.6906495094299316,
"step": 4410
},
{
"epoch": 0.705858868150514,
"grad_norm": 0.034239206463098526,
"learning_rate": 4.356437784455896e-06,
"loss": 0.698778486251831,
"step": 4420
},
{
"epoch": 0.7074558339155604,
"grad_norm": 0.03537076339125633,
"learning_rate": 4.313016112341861e-06,
"loss": 0.6730549335479736,
"step": 4430
},
{
"epoch": 0.7090527996806069,
"grad_norm": 0.0366840697824955,
"learning_rate": 4.269752345559761e-06,
"loss": 0.6689012527465821,
"step": 4440
},
{
"epoch": 0.7106497654456533,
"grad_norm": 0.04745423048734665,
"learning_rate": 4.226647685375428e-06,
"loss": 0.6668778419494629,
"step": 4450
},
{
"epoch": 0.7122467312106997,
"grad_norm": 0.043418820947408676,
"learning_rate": 4.183703328636924e-06,
"loss": 0.6641871929168701,
"step": 4460
},
{
"epoch": 0.7138436969757461,
"grad_norm": 0.0369136743247509,
"learning_rate": 4.140920467741325e-06,
"loss": 0.6592461585998535,
"step": 4470
},
{
"epoch": 0.7154406627407925,
"grad_norm": 0.034368593245744705,
"learning_rate": 4.098300290601581e-06,
"loss": 0.6683683395385742,
"step": 4480
},
{
"epoch": 0.7170376285058389,
"grad_norm": 0.042594823986291885,
"learning_rate": 4.055843980613561e-06,
"loss": 0.6916409015655518,
"step": 4490
},
{
"epoch": 0.7186345942708853,
"grad_norm": 0.042385537177324295,
"learning_rate": 4.013552716623185e-06,
"loss": 0.6435957431793213,
"step": 4500
},
{
"epoch": 0.7186345942708853,
"eval_loss": 0.6736165881156921,
"eval_runtime": 1722.8539,
"eval_samples_per_second": 3.231,
"eval_steps_per_second": 1.615,
"step": 4500
},
{
"epoch": 0.7202315600359317,
"grad_norm": 0.04700292646884918,
"learning_rate": 3.971427672893672e-06,
"loss": 0.688172435760498,
"step": 4510
},
{
"epoch": 0.7218285258009781,
"grad_norm": 0.040195874869823456,
"learning_rate": 3.929470019072972e-06,
"loss": 0.6703804016113282,
"step": 4520
},
{
"epoch": 0.7234254915660245,
"grad_norm": 0.036519281566143036,
"learning_rate": 3.8876809201612695e-06,
"loss": 0.6743984222412109,
"step": 4530
},
{
"epoch": 0.725022457331071,
"grad_norm": 0.03687750920653343,
"learning_rate": 3.846061536478626e-06,
"loss": 0.6919309139251709,
"step": 4540
},
{
"epoch": 0.7266194230961174,
"grad_norm": 0.034661829471588135,
"learning_rate": 3.804613023632788e-06,
"loss": 0.6852948665618896,
"step": 4550
},
{
"epoch": 0.7282163888611638,
"grad_norm": 0.04090991988778114,
"learning_rate": 3.763336532487076e-06,
"loss": 0.6639271259307862,
"step": 4560
},
{
"epoch": 0.7298133546262102,
"grad_norm": 0.053740449249744415,
"learning_rate": 3.7222332091284384e-06,
"loss": 0.7086254119873047,
"step": 4570
},
{
"epoch": 0.7314103203912566,
"grad_norm": 0.05317610129714012,
"learning_rate": 3.6813041948356408e-06,
"loss": 0.688640546798706,
"step": 4580
},
{
"epoch": 0.7330072861563031,
"grad_norm": 0.04328848421573639,
"learning_rate": 3.640550626047562e-06,
"loss": 0.6996944904327392,
"step": 4590
},
{
"epoch": 0.7346042519213495,
"grad_norm": 0.03426145389676094,
"learning_rate": 3.5999736343316406e-06,
"loss": 0.7036723613739013,
"step": 4600
},
{
"epoch": 0.7362012176863958,
"grad_norm": 0.04445657134056091,
"learning_rate": 3.559574346352459e-06,
"loss": 0.6727498054504395,
"step": 4610
},
{
"epoch": 0.7377981834514422,
"grad_norm": 0.034136440604925156,
"learning_rate": 3.5193538838404716e-06,
"loss": 0.6709301471710205,
"step": 4620
},
{
"epoch": 0.7393951492164886,
"grad_norm": 0.03670594096183777,
"learning_rate": 3.4793133635608334e-06,
"loss": 0.7098387241363525,
"step": 4630
},
{
"epoch": 0.740992114981535,
"grad_norm": 0.04140735790133476,
"learning_rate": 3.4394538972824167e-06,
"loss": 0.6899067401885987,
"step": 4640
},
{
"epoch": 0.7425890807465815,
"grad_norm": 0.03637846186757088,
"learning_rate": 3.3997765917469348e-06,
"loss": 0.6745570659637451,
"step": 4650
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.03964189440011978,
"learning_rate": 3.3602825486381886e-06,
"loss": 0.6715593338012695,
"step": 4660
},
{
"epoch": 0.7457830122766743,
"grad_norm": 0.036266524344682693,
"learning_rate": 3.3209728645515204e-06,
"loss": 0.6992376804351806,
"step": 4670
},
{
"epoch": 0.7473799780417207,
"grad_norm": 0.050657473504543304,
"learning_rate": 3.281848630963327e-06,
"loss": 0.689194917678833,
"step": 4680
},
{
"epoch": 0.7489769438067672,
"grad_norm": 0.03800279274582863,
"learning_rate": 3.2429109342007747e-06,
"loss": 0.7068089962005615,
"step": 4690
},
{
"epoch": 0.7505739095718136,
"grad_norm": 0.034397054463624954,
"learning_rate": 3.204160855411638e-06,
"loss": 0.7055879592895508,
"step": 4700
},
{
"epoch": 0.75217087533686,
"grad_norm": 0.03663492202758789,
"learning_rate": 3.1655994705342595e-06,
"loss": 0.6516903400421142,
"step": 4710
},
{
"epoch": 0.7537678411019064,
"grad_norm": 0.10431419312953949,
"learning_rate": 3.1272278502676932e-06,
"loss": 0.6833277702331543,
"step": 4720
},
{
"epoch": 0.7553648068669528,
"grad_norm": 0.04803668335080147,
"learning_rate": 3.0890470600419785e-06,
"loss": 0.7252533435821533,
"step": 4730
},
{
"epoch": 0.7569617726319992,
"grad_norm": 0.052451301366090775,
"learning_rate": 3.051058159988539e-06,
"loss": 0.6383802413940429,
"step": 4740
},
{
"epoch": 0.7585587383970456,
"grad_norm": 0.04214588552713394,
"learning_rate": 3.0132622049107672e-06,
"loss": 0.6385763645172119,
"step": 4750
},
{
"epoch": 0.760155704162092,
"grad_norm": 0.04353920742869377,
"learning_rate": 2.975660244254721e-06,
"loss": 0.6771360874176026,
"step": 4760
},
{
"epoch": 0.7617526699271384,
"grad_norm": 0.034283965826034546,
"learning_rate": 2.9382533220799893e-06,
"loss": 0.6694541931152344,
"step": 4770
},
{
"epoch": 0.7633496356921848,
"grad_norm": 0.04156842082738876,
"learning_rate": 2.901042477030709e-06,
"loss": 0.719163179397583,
"step": 4780
},
{
"epoch": 0.7649466014572313,
"grad_norm": 0.038868360221385956,
"learning_rate": 2.864028742306725e-06,
"loss": 0.6658177852630616,
"step": 4790
},
{
"epoch": 0.7665435672222777,
"grad_norm": 0.03701276332139969,
"learning_rate": 2.827213145634887e-06,
"loss": 0.6825634956359863,
"step": 4800
},
{
"epoch": 0.7681405329873241,
"grad_norm": 0.043009012937545776,
"learning_rate": 2.7905967092405305e-06,
"loss": 0.6813971519470214,
"step": 4810
},
{
"epoch": 0.7697374987523705,
"grad_norm": 0.03427146002650261,
"learning_rate": 2.754180449819092e-06,
"loss": 0.6715961933135987,
"step": 4820
},
{
"epoch": 0.7713344645174169,
"grad_norm": 0.04296226054430008,
"learning_rate": 2.717965378507864e-06,
"loss": 0.6857268810272217,
"step": 4830
},
{
"epoch": 0.7729314302824634,
"grad_norm": 0.0429498665034771,
"learning_rate": 2.681952500857944e-06,
"loss": 0.6708025932312012,
"step": 4840
},
{
"epoch": 0.7745283960475098,
"grad_norm": 0.03906711935997009,
"learning_rate": 2.6461428168062973e-06,
"loss": 0.6925381660461426,
"step": 4850
},
{
"epoch": 0.7761253618125561,
"grad_norm": 0.04651563614606857,
"learning_rate": 2.6105373206479913e-06,
"loss": 0.6757769107818603,
"step": 4860
},
{
"epoch": 0.7777223275776025,
"grad_norm": 0.04317731410264969,
"learning_rate": 2.575137001008592e-06,
"loss": 0.665839433670044,
"step": 4870
},
{
"epoch": 0.7793192933426489,
"grad_norm": 0.03744199126958847,
"learning_rate": 2.539942840816727e-06,
"loss": 0.7045305252075196,
"step": 4880
},
{
"epoch": 0.7809162591076954,
"grad_norm": 0.04245394840836525,
"learning_rate": 2.5049558172767675e-06,
"loss": 0.6957573890686035,
"step": 4890
},
{
"epoch": 0.7825132248727418,
"grad_norm": 0.03942755609750748,
"learning_rate": 2.47017690184172e-06,
"loss": 0.6565013408660889,
"step": 4900
},
{
"epoch": 0.7841101906377882,
"grad_norm": 0.04654339328408241,
"learning_rate": 2.4356070601862327e-06,
"loss": 0.6680932998657226,
"step": 4910
},
{
"epoch": 0.7857071564028346,
"grad_norm": 0.05248269438743591,
"learning_rate": 2.4012472521797923e-06,
"loss": 0.6607920169830322,
"step": 4920
},
{
"epoch": 0.787304122167881,
"grad_norm": 0.04624265059828758,
"learning_rate": 2.3670984318600786e-06,
"loss": 0.6683568954467773,
"step": 4930
},
{
"epoch": 0.7889010879329275,
"grad_norm": 0.04104055091738701,
"learning_rate": 2.333161547406464e-06,
"loss": 0.6837501049041748,
"step": 4940
},
{
"epoch": 0.7904980536979739,
"grad_norm": 0.039029769599437714,
"learning_rate": 2.2994375411136884e-06,
"loss": 0.6702389240264892,
"step": 4950
},
{
"epoch": 0.7920950194630203,
"grad_norm": 0.03849470987915993,
"learning_rate": 2.265927349365691e-06,
"loss": 0.6521383285522461,
"step": 4960
},
{
"epoch": 0.7936919852280667,
"grad_norm": 0.05400224030017853,
"learning_rate": 2.232631902609629e-06,
"loss": 0.6533032894134522,
"step": 4970
},
{
"epoch": 0.795288950993113,
"grad_norm": 0.043887361884117126,
"learning_rate": 2.1995521253300167e-06,
"loss": 0.6794767379760742,
"step": 4980
},
{
"epoch": 0.7968859167581595,
"grad_norm": 0.04501941800117493,
"learning_rate": 2.166688936023078e-06,
"loss": 0.6698923587799073,
"step": 4990
},
{
"epoch": 0.7984828825232059,
"grad_norm": 0.045139558613300323,
"learning_rate": 2.1340432471712367e-06,
"loss": 0.661639404296875,
"step": 5000
},
{
"epoch": 0.7984828825232059,
"eval_loss": 0.6726189255714417,
"eval_runtime": 1709.319,
"eval_samples_per_second": 3.256,
"eval_steps_per_second": 1.628,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 6262,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0986566032425984e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}