Extractor_Adaptor_Qwen3_0.6b / trainer_state.json
abdo-Mansour's picture
End of training
9e7f7c8 verified
Invalid JSON: Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_global_step": 950,
"best_metric": 0.05095840245485306,
"best_model_checkpoint": "/kaggle/working/Llama-Factory-out/checkpoint-700",
"epoch": 4.0,
"eval_steps": 50,
"global_step": 1912,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010473946059177796,
"grad_norm": 21.572948455810547,
"learning_rate": 2.0887728459530028e-07,
"loss": 1.3683,
"step": 5
},
{
"epoch": 0.020947892118355592,
"grad_norm": 28.60955810546875,
"learning_rate": 4.6997389033942563e-07,
"loss": 1.5151,
"step": 10
},
{
"epoch": 0.031421838177533384,
"grad_norm": 23.589828491210938,
"learning_rate": 7.31070496083551e-07,
"loss": 1.5732,
"step": 15
},
{
"epoch": 0.041895784236711184,
"grad_norm": NaN,
"learning_rate": 9.921671018276763e-07,
"loss": 1.9134,
"step": 20
},
{
"epoch": 0.052369730295888976,
"grad_norm": 17.49181365966797,
"learning_rate": 1.2532637075718015e-06,
"loss": 1.4103,
"step": 25
},
{
"epoch": 0.06284367635506677,
"grad_norm": 24.726316452026367,
"learning_rate": 1.5143603133159272e-06,
"loss": 1.1474,
"step": 30
},
{
"epoch": 0.07331762241424457,
"grad_norm": 16.106216430664062,
"learning_rate": 1.7754569190600524e-06,
"loss": 1.1247,
"step": 35
},
{
"epoch": 0.08379156847342237,
"grad_norm": 15.223337173461914,
"learning_rate": 2.036553524804178e-06,
"loss": 0.7444,
"step": 40
},
{
"epoch": 0.09426551453260015,
"grad_norm": 16.522626876831055,
"learning_rate": 2.2976501305483033e-06,
"loss": 0.5635,
"step": 45
},
{
"epoch": 0.10473946059177795,
"grad_norm": 17.7125244140625,
"learning_rate": 2.5587467362924283e-06,
"loss": 0.4239,
"step": 50
},
{
"epoch": 0.10473946059177795,
"eval_loss": 0.36827990412712097,
"eval_runtime": 181.7089,
"eval_samples_per_second": 10.506,
"eval_steps_per_second": 2.631,
"step": 50
},
{
"epoch": 0.11521340665095575,
"grad_norm": 6.594980716705322,
"learning_rate": 2.819843342036554e-06,
"loss": 0.2571,
"step": 55
},
{
"epoch": 0.12568735271013354,
"grad_norm": 5.109836578369141,
"learning_rate": 3.080939947780679e-06,
"loss": 0.1924,
"step": 60
},
{
"epoch": 0.13616129876931135,
"grad_norm": NaN,
"learning_rate": 3.3420365535248045e-06,
"loss": 0.1311,
"step": 65
},
{
"epoch": 0.14663524482848914,
"grad_norm": 5.705305099487305,
"learning_rate": 3.60313315926893e-06,
"loss": 0.1412,
"step": 70
},
{
"epoch": 0.15710919088766692,
"grad_norm": 4.945490837097168,
"learning_rate": 3.864229765013055e-06,
"loss": 0.1438,
"step": 75
},
{
"epoch": 0.16758313694684474,
"grad_norm": 3.300361156463623,
"learning_rate": 4.12532637075718e-06,
"loss": 0.1146,
"step": 80
},
{
"epoch": 0.17805708300602252,
"grad_norm": 4.6746649742126465,
"learning_rate": 4.386422976501306e-06,
"loss": 0.1094,
"step": 85
},
{
"epoch": 0.1885310290652003,
"grad_norm": 3.8035058975219727,
"learning_rate": 4.647519582245431e-06,
"loss": 0.0945,
"step": 90
},
{
"epoch": 0.19900497512437812,
"grad_norm": 3.3794517517089844,
"learning_rate": 4.908616187989557e-06,
"loss": 0.1162,
"step": 95
},
{
"epoch": 0.2094789211835559,
"grad_norm": 3.917008638381958,
"learning_rate": 5.169712793733682e-06,
"loss": 0.1005,
"step": 100
},
{
"epoch": 0.2094789211835559,
"eval_loss": 0.11476331204175949,
"eval_runtime": 181.6618,
"eval_samples_per_second": 10.509,
"eval_steps_per_second": 2.631,
"step": 100
},
{
"epoch": 0.2199528672427337,
"grad_norm": 2.673529863357544,
"learning_rate": 5.4308093994778075e-06,
"loss": 0.077,
"step": 105
},
{
"epoch": 0.2304268133019115,
"grad_norm": 3.3407351970672607,
"learning_rate": 5.691906005221932e-06,
"loss": 0.0906,
"step": 110
},
{
"epoch": 0.2409007593610893,
"grad_norm": 3.2136423587799072,
"learning_rate": 5.9530026109660575e-06,
"loss": 0.1303,
"step": 115
},
{
"epoch": 0.2513747054202671,
"grad_norm": 2.6382622718811035,
"learning_rate": 6.214099216710183e-06,
"loss": 0.0806,
"step": 120
},
{
"epoch": 0.26184865147944486,
"grad_norm": 3.269986867904663,
"learning_rate": 6.475195822454308e-06,
"loss": 0.0827,
"step": 125
},
{
"epoch": 0.2723225975386227,
"grad_norm": 3.785564661026001,
"learning_rate": 6.736292428198435e-06,
"loss": 0.1256,
"step": 130
},
{
"epoch": 0.2827965435978005,
"grad_norm": 2.529221534729004,
"learning_rate": 6.99738903394256e-06,
"loss": 0.1075,
"step": 135
},
{
"epoch": 0.2932704896569783,
"grad_norm": 4.19661283493042,
"learning_rate": 7.258485639686685e-06,
"loss": 0.1118,
"step": 140
},
{
"epoch": 0.30374443571615606,
"grad_norm": 2.460735559463501,
"learning_rate": 7.51958224543081e-06,
"loss": 0.0738,
"step": 145
},
{
"epoch": 0.31421838177533384,
"grad_norm": 2.7781996726989746,
"learning_rate": 7.780678851174935e-06,
"loss": 0.0853,
"step": 150
},
{
"epoch": 0.31421838177533384,
"eval_loss": 0.09081956744194031,
"eval_runtime": 181.8598,
"eval_samples_per_second": 10.497,
"eval_steps_per_second": 2.628,
"step": 150
},
{
"epoch": 0.32469232783451163,
"grad_norm": 2.850130319595337,
"learning_rate": 8.04177545691906e-06,
"loss": 0.0908,
"step": 155
},
{
"epoch": 0.33516627389368947,
"grad_norm": 2.2639331817626953,
"learning_rate": 8.302872062663187e-06,
"loss": 0.0744,
"step": 160
},
{
"epoch": 0.34564021995286726,
"grad_norm": 3.059605121612549,
"learning_rate": 8.563968668407311e-06,
"loss": 0.0624,
"step": 165
},
{
"epoch": 0.35611416601204504,
"grad_norm": 1.4469069242477417,
"learning_rate": 8.825065274151436e-06,
"loss": 0.0644,
"step": 170
},
{
"epoch": 0.3665881120712228,
"grad_norm": 3.2024354934692383,
"learning_rate": 9.086161879895562e-06,
"loss": 0.0921,
"step": 175
},
{
"epoch": 0.3770620581304006,
"grad_norm": 2.3910789489746094,
"learning_rate": 9.347258485639687e-06,
"loss": 0.107,
"step": 180
},
{
"epoch": 0.3875360041895784,
"grad_norm": 2.225024938583374,
"learning_rate": 9.608355091383813e-06,
"loss": 0.0794,
"step": 185
},
{
"epoch": 0.39800995024875624,
"grad_norm": 2.3184664249420166,
"learning_rate": 9.869451697127938e-06,
"loss": 0.0912,
"step": 190
},
{
"epoch": 0.408483896307934,
"grad_norm": 3.909691572189331,
"learning_rate": 1.0130548302872064e-05,
"loss": 0.075,
"step": 195
},
{
"epoch": 0.4189578423671118,
"grad_norm": 2.332878589630127,
"learning_rate": 1.0391644908616189e-05,
"loss": 0.0645,
"step": 200
},
{
"epoch": 0.4189578423671118,
"eval_loss": 0.08308149129152298,
"eval_runtime": 181.6397,
"eval_samples_per_second": 10.51,
"eval_steps_per_second": 2.632,
"step": 200
},
{
"epoch": 0.4294317884262896,
"grad_norm": 2.1909031867980957,
"learning_rate": 1.0652741514360314e-05,
"loss": 0.0566,
"step": 205
},
{
"epoch": 0.4399057344854674,
"grad_norm": 3.95145320892334,
"learning_rate": 1.0913838120104438e-05,
"loss": 0.0751,
"step": 210
},
{
"epoch": 0.4503796805446452,
"grad_norm": 2.0043461322784424,
"learning_rate": 1.1174934725848565e-05,
"loss": 0.0738,
"step": 215
},
{
"epoch": 0.460853626603823,
"grad_norm": 2.2231714725494385,
"learning_rate": 1.1436031331592689e-05,
"loss": 0.0679,
"step": 220
},
{
"epoch": 0.4713275726630008,
"grad_norm": 4.517533302307129,
"learning_rate": 1.1697127937336816e-05,
"loss": 0.0762,
"step": 225
},
{
"epoch": 0.4818015187221786,
"grad_norm": 5.064544677734375,
"learning_rate": 1.1958224543080942e-05,
"loss": 0.1181,
"step": 230
},
{
"epoch": 0.49227546478135636,
"grad_norm": 2.1601788997650146,
"learning_rate": 1.2219321148825067e-05,
"loss": 0.0597,
"step": 235
},
{
"epoch": 0.5027494108405341,
"grad_norm": 2.5625264644622803,
"learning_rate": 1.2480417754569192e-05,
"loss": 0.0624,
"step": 240
},
{
"epoch": 0.513223356899712,
"grad_norm": 3.436384916305542,
"learning_rate": 1.2741514360313316e-05,
"loss": 0.0715,
"step": 245
},
{
"epoch": 0.5236973029588897,
"grad_norm": 6.529380798339844,
"learning_rate": 1.3002610966057443e-05,
"loss": 0.0767,
"step": 250
},
{
"epoch": 0.5236973029588897,
"eval_loss": 0.07550048828125,
"eval_runtime": 181.4419,
"eval_samples_per_second": 10.521,
"eval_steps_per_second": 2.634,
"step": 250
},
{
"epoch": 0.5341712490180676,
"grad_norm": 2.9247043132781982,
"learning_rate": 1.3263707571801567e-05,
"loss": 0.0586,
"step": 255
},
{
"epoch": 0.5446451950772454,
"grad_norm": 4.487779140472412,
"learning_rate": 1.3524804177545694e-05,
"loss": 0.0968,
"step": 260
},
{
"epoch": 0.5551191411364231,
"grad_norm": 2.502134084701538,
"learning_rate": 1.3785900783289818e-05,
"loss": 0.0658,
"step": 265
},
{
"epoch": 0.565593087195601,
"grad_norm": 2.817639112472534,
"learning_rate": 1.4046997389033943e-05,
"loss": 0.0706,
"step": 270
},
{
"epoch": 0.5760670332547787,
"grad_norm": 2.812814235687256,
"learning_rate": 1.4308093994778069e-05,
"loss": 0.0582,
"step": 275
},
{
"epoch": 0.5865409793139565,
"grad_norm": 1.8219794034957886,
"learning_rate": 1.4569190600522194e-05,
"loss": 0.0583,
"step": 280
},
{
"epoch": 0.5970149253731343,
"grad_norm": 2.640019416809082,
"learning_rate": 1.4830287206266321e-05,
"loss": 0.0805,
"step": 285
},
{
"epoch": 0.6074888714323121,
"grad_norm": 3.00846791267395,
"learning_rate": 1.5091383812010445e-05,
"loss": 0.0587,
"step": 290
},
{
"epoch": 0.61796281749149,
"grad_norm": 2.266049861907959,
"learning_rate": 1.535248041775457e-05,
"loss": 0.0622,
"step": 295
},
{
"epoch": 0.6284367635506677,
"grad_norm": 2.43892502784729,
"learning_rate": 1.5613577023498696e-05,
"loss": 0.0826,
"step": 300
},
{
"epoch": 0.6284367635506677,
"eval_loss": 0.0705987811088562,
"eval_runtime": 181.5418,
"eval_samples_per_second": 10.515,
"eval_steps_per_second": 2.633,
"step": 300
},
{
"epoch": 0.6389107096098455,
"grad_norm": 2.757784605026245,
"learning_rate": 1.587467362924282e-05,
"loss": 0.0917,
"step": 305
},
{
"epoch": 0.6493846556690233,
"grad_norm": 3.353879690170288,
"learning_rate": 1.6135770234986947e-05,
"loss": 0.0689,
"step": 310
},
{
"epoch": 0.6598586017282011,
"grad_norm": 2.3030617237091064,
"learning_rate": 1.6396866840731072e-05,
"loss": 0.0589,
"step": 315
},
{
"epoch": 0.6703325477873789,
"grad_norm": 1.9910506010055542,
"learning_rate": 1.6657963446475198e-05,
"loss": 0.0787,
"step": 320
},
{
"epoch": 0.6808064938465567,
"grad_norm": 1.8802602291107178,
"learning_rate": 1.6919060052219323e-05,
"loss": 0.0803,
"step": 325
},
{
"epoch": 0.6912804399057345,
"grad_norm": 2.357010841369629,
"learning_rate": 1.718015665796345e-05,
"loss": 0.065,
"step": 330
},
{
"epoch": 0.7017543859649122,
"grad_norm": 3.608004331588745,
"learning_rate": 1.7441253263707574e-05,
"loss": 0.0958,
"step": 335
},
{
"epoch": 0.7122283320240901,
"grad_norm": 2.5642309188842773,
"learning_rate": 1.77023498694517e-05,
"loss": 0.0859,
"step": 340
},
{
"epoch": 0.7227022780832679,
"grad_norm": 2.9146134853363037,
"learning_rate": 1.7963446475195825e-05,
"loss": 0.0802,
"step": 345
},
{
"epoch": 0.7331762241424457,
"grad_norm": 2.8338112831115723,
"learning_rate": 1.822454308093995e-05,
"loss": 0.0882,
"step": 350
},
{
"epoch": 0.7331762241424457,
"eval_loss": 0.06691395491361618,
"eval_runtime": 181.5091,
"eval_samples_per_second": 10.517,
"eval_steps_per_second": 2.633,
"step": 350
},
{
"epoch": 0.7436501702016235,
"grad_norm": 2.904839277267456,
"learning_rate": 1.8485639686684072e-05,
"loss": 0.0623,
"step": 355
},
{
"epoch": 0.7541241162608012,
"grad_norm": 2.482553243637085,
"learning_rate": 1.87467362924282e-05,
"loss": 0.0611,
"step": 360
},
{
"epoch": 0.7645980623199791,
"grad_norm": 2.968573570251465,
"learning_rate": 1.9007832898172326e-05,
"loss": 0.0829,
"step": 365
},
{
"epoch": 0.7750720083791568,
"grad_norm": 2.859727144241333,
"learning_rate": 1.9268929503916452e-05,
"loss": 0.0555,
"step": 370
},
{
"epoch": 0.7855459544383346,
"grad_norm": 1.7544801235198975,
"learning_rate": 1.9530026109660577e-05,
"loss": 0.0722,
"step": 375
},
{
"epoch": 0.7960199004975125,
"grad_norm": 2.506270408630371,
"learning_rate": 1.97911227154047e-05,
"loss": 0.0706,
"step": 380
},
{
"epoch": 0.8064938465566902,
"grad_norm": 2.7544281482696533,
"learning_rate": 1.9999978891633502e-05,
"loss": 0.0561,
"step": 385
},
{
"epoch": 0.816967792615868,
"grad_norm": 1.2377090454101562,
"learning_rate": 1.9999240108162817e-05,
"loss": 0.0682,
"step": 390
},
{
"epoch": 0.8274417386750458,
"grad_norm": 3.0974531173706055,
"learning_rate": 1.999744599547812e-05,
"loss": 0.0804,
"step": 395
},
{
"epoch": 0.8379156847342236,
"grad_norm": 2.9139633178710938,
"learning_rate": 1.9994596742931747e-05,
"loss": 0.0726,
"step": 400
},
{
"epoch": 0.8379156847342236,
"eval_loss": 0.06348562985658646,
"eval_runtime": 181.4276,
"eval_samples_per_second": 10.522,
"eval_steps_per_second": 2.635,
"step": 400
},
{
"epoch": 0.8483896307934015,
"grad_norm": 3.329805850982666,
"learning_rate": 1.9990692651236494e-05,
"loss": 0.0636,
"step": 405
},
{
"epoch": 0.8588635768525792,
"grad_norm": 1.405851125717163,
"learning_rate": 1.9985734132433876e-05,
"loss": 0.0483,
"step": 410
},
{
"epoch": 0.869337522911757,
"grad_norm": 2.3531923294067383,
"learning_rate": 1.9979721709850634e-05,
"loss": 0.0709,
"step": 415
},
{
"epoch": 0.8798114689709348,
"grad_norm": 1.4560775756835938,
"learning_rate": 1.9972656018043505e-05,
"loss": 0.0576,
"step": 420
},
{
"epoch": 0.8902854150301126,
"grad_norm": 2.4551849365234375,
"learning_rate": 1.996453780273226e-05,
"loss": 0.0861,
"step": 425
},
{
"epoch": 0.9007593610892904,
"grad_norm": 4.548062801361084,
"learning_rate": 1.9955367920720977e-05,
"loss": 0.1325,
"step": 430
},
{
"epoch": 0.9112333071484682,
"grad_norm": 1.5118955373764038,
"learning_rate": 1.9945147339807645e-05,
"loss": 0.06,
"step": 435
},
{
"epoch": 0.921707253207646,
"grad_norm": 2.8457553386688232,
"learning_rate": 1.993387713868199e-05,
"loss": 0.0496,
"step": 440
},
{
"epoch": 0.9321811992668237,
"grad_norm": 2.279599666595459,
"learning_rate": 1.9921558506811648e-05,
"loss": 0.0541,
"step": 445
},
{
"epoch": 0.9426551453260016,
"grad_norm": 1.4517545700073242,
"learning_rate": 1.990819274431662e-05,
"loss": 0.0711,
"step": 450
},
{
"epoch": 0.9426551453260016,
"eval_loss": 0.06195152550935745,
"eval_runtime": 181.5787,
"eval_samples_per_second": 10.513,
"eval_steps_per_second": 2.632,
"step": 450
},
{
"epoch": 0.9531290913851793,
"grad_norm": 2.7663371562957764,
"learning_rate": 1.989378126183207e-05,
"loss": 0.0707,
"step": 455
},
{
"epoch": 0.9636030374443572,
"grad_norm": 2.230884552001953,
"learning_rate": 1.987832558035942e-05,
"loss": 0.0554,
"step": 460
},
{
"epoch": 0.974076983503535,
"grad_norm": 2.8206303119659424,
"learning_rate": 1.9861827331105844e-05,
"loss": 0.0658,
"step": 465
},
{
"epoch": 0.9845509295627127,
"grad_norm": 1.7690904140472412,
"learning_rate": 1.9844288255312098e-05,
"loss": 0.0546,
"step": 470
},
{
"epoch": 0.9950248756218906,
"grad_norm": 2.402695417404175,
"learning_rate": 1.982571020406875e-05,
"loss": 0.0725,
"step": 475
},
{
"epoch": 1.0041895784236712,
"grad_norm": 0.8933857083320618,
"learning_rate": 1.9806095138120824e-05,
"loss": 0.0363,
"step": 480
},
{
"epoch": 1.014663524482849,
"grad_norm": 1.5981252193450928,
"learning_rate": 1.978544512766084e-05,
"loss": 0.0454,
"step": 485
},
{
"epoch": 1.0251374705420266,
"grad_norm": 2.3014566898345947,
"learning_rate": 1.9763762352110344e-05,
"loss": 0.0455,
"step": 490
},
{
"epoch": 1.0356114166012045,
"grad_norm": 2.267174243927002,
"learning_rate": 1.9741049099889874e-05,
"loss": 0.0428,
"step": 495
},
{
"epoch": 1.0460853626603823,
"grad_norm": 2.398452043533325,
"learning_rate": 1.9717307768177457e-05,
"loss": 0.0433,
"step": 500
},
{
"epoch": 1.0460853626603823,
"eval_loss": 0.062211424112319946,
"eval_runtime": 181.7607,
"eval_samples_per_second": 10.503,
"eval_steps_per_second": 2.63,
"step": 500
},
{
"epoch": 1.0565593087195602,
"grad_norm": 2.4606473445892334,
"learning_rate": 1.9692540862655587e-05,
"loss": 0.0563,
"step": 505
},
{
"epoch": 1.067033254778738,
"grad_norm": 0.9938412308692932,
"learning_rate": 1.9666750997246793e-05,
"loss": 0.0429,
"step": 510
},
{
"epoch": 1.0775072008379156,
"grad_norm": 2.087348461151123,
"learning_rate": 1.963994089383774e-05,
"loss": 0.0609,
"step": 515
},
{
"epoch": 1.0879811468970935,
"grad_norm": 1.5083081722259521,
"learning_rate": 1.9612113381991985e-05,
"loss": 0.0538,
"step": 520
},
{
"epoch": 1.0984550929562713,
"grad_norm": 1.1394294500350952,
"learning_rate": 1.9583271398651327e-05,
"loss": 0.0432,
"step": 525
},
{
"epoch": 1.1089290390154491,
"grad_norm": 1.6931722164154053,
"learning_rate": 1.9553417987825837e-05,
"loss": 0.036,
"step": 530
},
{
"epoch": 1.1194029850746268,
"grad_norm": 2.196749687194824,
"learning_rate": 1.952255630027259e-05,
"loss": 0.0504,
"step": 535
},
{
"epoch": 1.1298769311338046,
"grad_norm": 1.8391106128692627,
"learning_rate": 1.949068959316315e-05,
"loss": 0.0391,
"step": 540
},
{
"epoch": 1.1403508771929824,
"grad_norm": 2.4160068035125732,
"learning_rate": 1.9457821229739783e-05,
"loss": 0.0486,
"step": 545
},
{
"epoch": 1.1508248232521603,
"grad_norm": 1.0730011463165283,
"learning_rate": 1.9423954678960502e-05,
"loss": 0.0488,
"step": 550
},
{
"epoch": 1.1508248232521603,
"eval_loss": 0.05938513204455376,
"eval_runtime": 181.8553,
"eval_samples_per_second": 10.497,
"eval_steps_per_second": 2.628,
"step": 550
},
{
"epoch": 1.1612987693113381,
"grad_norm": 1.80950927734375,
"learning_rate": 1.9389093515132965e-05,
"loss": 0.0435,
"step": 555
},
{
"epoch": 1.1717727153705157,
"grad_norm": 2.7154200077056885,
"learning_rate": 1.9353241417537216e-05,
"loss": 0.0611,
"step": 560
},
{
"epoch": 1.1822466614296936,
"grad_norm": 1.1030880212783813,
"learning_rate": 1.9316402170037377e-05,
"loss": 0.0531,
"step": 565
},
{
"epoch": 1.1927206074888714,
"grad_norm": 2.1434154510498047,
"learning_rate": 1.927857966068232e-05,
"loss": 0.0733,
"step": 570
},
{
"epoch": 1.2031945535480493,
"grad_norm": 0.8784016370773315,
"learning_rate": 1.923977788129528e-05,
"loss": 0.0339,
"step": 575
},
{
"epoch": 1.2136684996072271,
"grad_norm": 1.4416366815567017,
"learning_rate": 1.9200000927052586e-05,
"loss": 0.0453,
"step": 580
},
{
"epoch": 1.2241424456664047,
"grad_norm": 0.9367201924324036,
"learning_rate": 1.9159252996051433e-05,
"loss": 0.0442,
"step": 585
},
{
"epoch": 1.2346163917255826,
"grad_norm": 3.147280216217041,
"learning_rate": 1.911753838886681e-05,
"loss": 0.0429,
"step": 590
},
{
"epoch": 1.2450903377847604,
"grad_norm": 2.891639232635498,
"learning_rate": 1.907486150809764e-05,
"loss": 0.0341,
"step": 595
},
{
"epoch": 1.2555642838439383,
"grad_norm": 1.8960820436477661,
"learning_rate": 1.9031226857902087e-05,
"loss": 0.0347,
"step": 600
},
{
"epoch": 1.2555642838439383,
"eval_loss": 0.05871057137846947,
"eval_runtime": 181.3499,
"eval_samples_per_second": 10.527,
"eval_steps_per_second": 2.636,
"step": 600
},
{
"epoch": 1.266038229903116,
"grad_norm": 1.8320516347885132,
"learning_rate": 1.898663904352221e-05,
"loss": 0.0384,
"step": 605
},
{
"epoch": 1.2765121759622937,
"grad_norm": 2.077674150466919,
"learning_rate": 1.894110277079791e-05,
"loss": 0.0845,
"step": 610
},
{
"epoch": 1.2869861220214716,
"grad_norm": 1.9369480609893799,
"learning_rate": 1.8894622845670282e-05,
"loss": 0.0418,
"step": 615
},
{
"epoch": 1.2974600680806494,
"grad_norm": 3.845341682434082,
"learning_rate": 1.8847204173674378e-05,
"loss": 0.0488,
"step": 620
},
{
"epoch": 1.3079340141398272,
"grad_norm": 1.5000770092010498,
"learning_rate": 1.8798851759421473e-05,
"loss": 0.0553,
"step": 625
},
{
"epoch": 1.3184079601990049,
"grad_norm": 1.5684770345687866,
"learning_rate": 1.8749570706070895e-05,
"loss": 0.0492,
"step": 630
},
{
"epoch": 1.3288819062581827,
"grad_norm": 2.115903377532959,
"learning_rate": 1.8699366214791394e-05,
"loss": 0.0424,
"step": 635
},
{
"epoch": 1.3393558523173605,
"grad_norm": 1.7767939567565918,
"learning_rate": 1.8648243584212254e-05,
"loss": 0.0234,
"step": 640
},
{
"epoch": 1.3498297983765384,
"grad_norm": 1.7302303314208984,
"learning_rate": 1.8596208209864022e-05,
"loss": 0.0482,
"step": 645
},
{
"epoch": 1.3603037444357162,
"grad_norm": 1.750826358795166,
"learning_rate": 1.8543265583609096e-05,
"loss": 0.0475,
"step": 650
},
{
"epoch": 1.3603037444357162,
"eval_loss": 0.05913909152150154,
"eval_runtime": 181.2221,
"eval_samples_per_second": 10.534,
"eval_steps_per_second": 2.638,
"step": 650
},
{
"epoch": 1.370777690494894,
"grad_norm": 2.049710512161255,
"learning_rate": 1.8489421293062087e-05,
"loss": 0.044,
"step": 655
},
{
"epoch": 1.3812516365540717,
"grad_norm": 1.9173017740249634,
"learning_rate": 1.8434681021000108e-05,
"loss": 0.0391,
"step": 660
},
{
"epoch": 1.3917255826132495,
"grad_norm": 2.223348379135132,
"learning_rate": 1.8379050544763004e-05,
"loss": 0.0393,
"step": 665
},
{
"epoch": 1.4021995286724274,
"grad_norm": 3.047008752822876,
"learning_rate": 1.8322535735643604e-05,
"loss": 0.044,
"step": 670
},
{
"epoch": 1.4126734747316052,
"grad_norm": 1.5292298793792725,
"learning_rate": 1.8265142558268066e-05,
"loss": 0.0672,
"step": 675
},
{
"epoch": 1.4231474207907828,
"grad_norm": 1.8190603256225586,
"learning_rate": 1.820687706996636e-05,
"loss": 0.0458,
"step": 680
},
{
"epoch": 1.4336213668499607,
"grad_norm": 2.0858137607574463,
"learning_rate": 1.8147745420132965e-05,
"loss": 0.042,
"step": 685
},
{
"epoch": 1.4440953129091385,
"grad_norm": 4.506059646606445,
"learning_rate": 1.8087753849577876e-05,
"loss": 0.0629,
"step": 690
},
{
"epoch": 1.4545692589683163,
"grad_norm": 2.2428197860717773,
"learning_rate": 1.802690868986792e-05,
"loss": 0.0486,
"step": 695
},
{
"epoch": 1.4650432050274942,
"grad_norm": 1.942474365234375,
"learning_rate": 1.7965216362658528e-05,
"loss": 0.0485,
"step": 700
},
{
"epoch": 1.4650432050274942,
"eval_loss": 0.055441830307245255,
"eval_runtime": 182.1133,
"eval_samples_per_second": 10.482,
"eval_steps_per_second": 2.625,
"step": 700
},
{
"epoch": 1.475517151086672,
"grad_norm": 1.306942343711853,
"learning_rate": 1.7902683379015996e-05,
"loss": 0.0518,
"step": 705
},
{
"epoch": 1.4859910971458496,
"grad_norm": 1.9224140644073486,
"learning_rate": 1.7839316338730282e-05,
"loss": 0.0579,
"step": 710
},
{
"epoch": 1.4964650432050275,
"grad_norm": 1.8800877332687378,
"learning_rate": 1.7775121929618462e-05,
"loss": 0.0514,
"step": 715
},
{
"epoch": 1.5069389892642053,
"grad_norm": 1.8557875156402588,
"learning_rate": 1.771010692681892e-05,
"loss": 0.0535,
"step": 720
},
{
"epoch": 1.517412935323383,
"grad_norm": 1.4152109622955322,
"learning_rate": 1.764427819207624e-05,
"loss": 0.0693,
"step": 725
},
{
"epoch": 1.5278868813825608,
"grad_norm": 3.057999849319458,
"learning_rate": 1.7577642673017063e-05,
"loss": 0.0429,
"step": 730
},
{
"epoch": 1.5383608274417386,
"grad_norm": 2.492802619934082,
"learning_rate": 1.7510207402416798e-05,
"loss": 0.04,
"step": 735
},
{
"epoch": 1.5488347735009165,
"grad_norm": 4.143369674682617,
"learning_rate": 1.7441979497457384e-05,
"loss": 0.058,
"step": 740
},
{
"epoch": 1.5593087195600943,
"grad_norm": 1.7152019739151,
"learning_rate": 1.7372966158976143e-05,
"loss": 0.0713,
"step": 745
},
{
"epoch": 1.5697826656192722,
"grad_norm": 2.2295591831207275,
"learning_rate": 1.7303174670705783e-05,
"loss": 0.0421,
"step": 750
},
{
"epoch": 1.5697826656192722,
"eval_loss": 0.05413464084267616,
"eval_runtime": 181.9914,
"eval_samples_per_second": 10.49,
"eval_steps_per_second": 2.626,
"step": 750
},
{
"epoch": 1.58025661167845,
"grad_norm": 1.414204478263855,
"learning_rate": 1.7232612398505676e-05,
"loss": 0.0499,
"step": 755
},
{
"epoch": 1.5907305577376276,
"grad_norm": 2.8413901329040527,
"learning_rate": 1.716128678958445e-05,
"loss": 0.0496,
"step": 760
},
{
"epoch": 1.6012045037968055,
"grad_norm": 1.3030387163162231,
"learning_rate": 1.708920537171402e-05,
"loss": 0.0376,
"step": 765
},
{
"epoch": 1.611678449855983,
"grad_norm": 0.9149934649467468,
"learning_rate": 1.7016375752435088e-05,
"loss": 0.0313,
"step": 770
},
{
"epoch": 1.622152395915161,
"grad_norm": 2.623652935028076,
"learning_rate": 1.694280561825422e-05,
"loss": 0.0612,
"step": 775
},
{
"epoch": 1.6326263419743388,
"grad_norm": 1.9939152002334595,
"learning_rate": 1.6868502733832647e-05,
"loss": 0.0398,
"step": 780
},
{
"epoch": 1.6431002880335166,
"grad_norm": 3.7116594314575195,
"learning_rate": 1.679347494116673e-05,
"loss": 0.0419,
"step": 785
},
{
"epoch": 1.6535742340926944,
"grad_norm": 1.6450990438461304,
"learning_rate": 1.6717730158760334e-05,
"loss": 0.0387,
"step": 790
},
{
"epoch": 1.6640481801518723,
"grad_norm": 1.863366723060608,
"learning_rate": 1.6641276380789107e-05,
"loss": 0.0529,
"step": 795
},
{
"epoch": 1.6745221262110501,
"grad_norm": 1.3787758350372314,
"learning_rate": 1.656412167625674e-05,
"loss": 0.0395,
"step": 800
},
{
"epoch": 1.6745221262110501,
"eval_loss": 0.05458131060004234,
"eval_runtime": 181.5768,
"eval_samples_per_second": 10.513,
"eval_steps_per_second": 2.632,
"step": 800
},
{
"epoch": 1.684996072270228,
"grad_norm": 1.3715674877166748,
"learning_rate": 1.6486274188143386e-05,
"loss": 0.0335,
"step": 805
},
{
"epoch": 1.6954700183294056,
"grad_norm": 1.6836681365966797,
"learning_rate": 1.6407742132546216e-05,
"loss": 0.042,
"step": 810
},
{
"epoch": 1.7059439643885834,
"grad_norm": 2.448378324508667,
"learning_rate": 1.6328533797812315e-05,
"loss": 0.0419,
"step": 815
},
{
"epoch": 1.716417910447761,
"grad_norm": 1.39069664478302,
"learning_rate": 1.6248657543663887e-05,
"loss": 0.0371,
"step": 820
},
{
"epoch": 1.7268918565069389,
"grad_norm": 2.460034132003784,
"learning_rate": 1.6168121800315993e-05,
"loss": 0.0481,
"step": 825
},
{
"epoch": 1.7373658025661167,
"grad_norm": 2.401494026184082,
"learning_rate": 1.60869350675868e-05,
"loss": 0.0769,
"step": 830
},
{
"epoch": 1.7478397486252946,
"grad_norm": 2.444972038269043,
"learning_rate": 1.6005105914000508e-05,
"loss": 0.0403,
"step": 835
},
{
"epoch": 1.7583136946844724,
"grad_norm": 1.6803293228149414,
"learning_rate": 1.5922642975883014e-05,
"loss": 0.0433,
"step": 840
},
{
"epoch": 1.7687876407436502,
"grad_norm": 1.660657286643982,
"learning_rate": 1.5839554956450435e-05,
"loss": 0.043,
"step": 845
},
{
"epoch": 1.779261586802828,
"grad_norm": 1.6761749982833862,
"learning_rate": 1.5755850624890563e-05,
"loss": 0.0483,
"step": 850
},
{
"epoch": 1.779261586802828,
"eval_loss": 0.05199718102812767,
"eval_runtime": 181.8182,
"eval_samples_per_second": 10.499,
"eval_steps_per_second": 2.629,
"step": 850
},
{
"epoch": 1.7897355328620057,
"grad_norm": 1.660902738571167,
"learning_rate": 1.5671538815437346e-05,
"loss": 0.0451,
"step": 855
},
{
"epoch": 1.8002094789211835,
"grad_norm": 2.425180673599243,
"learning_rate": 1.558662842643852e-05,
"loss": 0.0514,
"step": 860
},
{
"epoch": 1.8106834249803614,
"grad_norm": 1.8615056276321411,
"learning_rate": 1.5501128419416475e-05,
"loss": 0.0951,
"step": 865
},
{
"epoch": 1.821157371039539,
"grad_norm": 2.117887258529663,
"learning_rate": 1.541504781812244e-05,
"loss": 0.0443,
"step": 870
},
{
"epoch": 1.8316313170987168,
"grad_norm": 1.9007426500320435,
"learning_rate": 1.532839570758411e-05,
"loss": 0.0539,
"step": 875
},
{
"epoch": 1.8421052631578947,
"grad_norm": 1.0283795595169067,
"learning_rate": 1.5241181233146798e-05,
"loss": 0.0439,
"step": 880
},
{
"epoch": 1.8525792092170725,
"grad_norm": 1.4137732982635498,
"learning_rate": 1.5153413599508241e-05,
"loss": 0.0454,
"step": 885
},
{
"epoch": 1.8630531552762504,
"grad_norm": 1.5199006795883179,
"learning_rate": 1.5065102069747117e-05,
"loss": 0.0521,
"step": 890
},
{
"epoch": 1.8735271013354282,
"grad_norm": 1.8887778520584106,
"learning_rate": 1.4976255964345407e-05,
"loss": 0.0379,
"step": 895
},
{
"epoch": 1.884001047394606,
"grad_norm": 0.687090277671814,
"learning_rate": 1.488688466020471e-05,
"loss": 0.0421,
"step": 900
},
{
"epoch": 1.884001047394606,
"eval_loss": 0.055315304547548294,
"eval_runtime": 181.7379,
"eval_samples_per_second": 10.504,
"eval_steps_per_second": 2.63,
"step": 900
},
{
"epoch": 1.8944749934537837,
"grad_norm": 3.8431344032287598,
"learning_rate": 1.4796997589656605e-05,
"loss": 0.0493,
"step": 905
},
{
"epoch": 1.9049489395129615,
"grad_norm": 3.010768413543701,
"learning_rate": 1.470660423946713e-05,
"loss": 0.0429,
"step": 910
},
{
"epoch": 1.9154228855721394,
"grad_norm": 1.5146229267120361,
"learning_rate": 1.4615714149835557e-05,
"loss": 0.0349,
"step": 915
},
{
"epoch": 1.925896831631317,
"grad_norm": 1.2837982177734375,
"learning_rate": 1.4524336913387509e-05,
"loss": 0.031,
"step": 920
},
{
"epoch": 1.9363707776904948,
"grad_norm": 1.4640088081359863,
"learning_rate": 1.4432482174162539e-05,
"loss": 0.0433,
"step": 925
},
{
"epoch": 1.9468447237496727,
"grad_norm": 2.3506968021392822,
"learning_rate": 1.4340159626596295e-05,
"loss": 0.0344,
"step": 930
},
{
"epoch": 1.9573186698088505,
"grad_norm": 1.7294262647628784,
"learning_rate": 1.4247379014497356e-05,
"loss": 0.0448,
"step": 935
},
{
"epoch": 1.9677926158680283,
"grad_norm": 2.0124881267547607,
"learning_rate": 1.4154150130018867e-05,
"loss": 0.0531,
"step": 940
},
{
"epoch": 1.9782665619272062,
"grad_norm": 1.9695724248886108,
"learning_rate": 1.4060482812625055e-05,
"loss": 0.0509,
"step": 945
},
{
"epoch": 1.988740507986384,
"grad_norm": 5.048811435699463,
"learning_rate": 1.3966386948052777e-05,
"loss": 0.0735,
"step": 950
},
{
"epoch": 1.988740507986384,
"eval_loss": 0.05095840245485306,
"eval_runtime": 181.2754,
"eval_samples_per_second": 10.531,
"eval_steps_per_second": 2.637,
"step": 950
},
{
"epoch": 1.9992144540455616,
"grad_norm": 2.204068899154663,
"learning_rate": 1.3871872467268155e-05,
"loss": 0.0462,
"step": 955
},
{
"epoch": 2.0083791568473424,
"grad_norm": 1.12019681930542,
"learning_rate": 1.3776949345418466e-05,
"loss": 0.0368,
"step": 960
},
{
"epoch": 2.01885310290652,
"grad_norm": 0.8073732256889343,
"learning_rate": 1.3681627600779353e-05,
"loss": 0.0284,
"step": 965
},
{
"epoch": 2.029327048965698,
"grad_norm": 1.6881890296936035,
"learning_rate": 1.3585917293697473e-05,
"loss": 0.025,
"step": 970
},
{
"epoch": 2.0398009950248754,
"grad_norm": 2.6855087280273438,
"learning_rate": 1.3489828525528732e-05,
"loss": 0.0447,
"step": 975
},
{
"epoch": 2.0502749410840533,
"grad_norm": 2.1619064807891846,
"learning_rate": 1.3393371437572183e-05,
"loss": 0.0254,
"step": 980
},
{
"epoch": 2.060748887143231,
"grad_norm": 2.9052109718322754,
"learning_rate": 1.329655620999969e-05,
"loss": 0.0427,
"step": 985
},
{
"epoch": 2.071222833202409,
"grad_norm": 1.7290070056915283,
"learning_rate": 1.3199393060781507e-05,
"loss": 0.0315,
"step": 990
},
{
"epoch": 2.081696779261587,
"grad_norm": 1.7127286195755005,
"learning_rate": 1.3101892244607872e-05,
"loss": 0.0256,
"step": 995
},
{
"epoch": 2.0921707253207646,
"grad_norm": 1.0866358280181885,
"learning_rate": 1.3004064051806712e-05,
"loss": 0.0233,
"step": 1000
},
{
"epoch": 2.0921707253207646,
"eval_loss": 0.05503799021244049,
"eval_runtime": 181.6508,
"eval_samples_per_second": 10.509,
"eval_steps_per_second": 2.631,
"step": 1000
},
{
"epoch": 2.1026446713799425,
"grad_norm": 2.119222402572632,
"learning_rate": 1.2905918807257578e-05,
"loss": 0.0234,
"step": 1005
},
{
"epoch": 2.1131186174391203,
"grad_norm": 2.4023685455322266,
"learning_rate": 1.2807466869301978e-05,
"loss": 0.0284,
"step": 1010
},
{
"epoch": 2.123592563498298,
"grad_norm": 1.3008413314819336,
"learning_rate": 1.2708718628650125e-05,
"loss": 0.0245,
"step": 1015
},
{
"epoch": 2.134066509557476,
"grad_norm": 1.8224750757217407,
"learning_rate": 1.260968450728429e-05,
"loss": 0.0439,
"step": 1020
},
{
"epoch": 2.1445404556166534,
"grad_norm": 1.3979074954986572,
"learning_rate": 1.2510374957358877e-05,
"loss": 0.0272,
"step": 1025
},
{
"epoch": 2.1550144016758312,
"grad_norm": 1.3777137994766235,
"learning_rate": 1.2410800460097265e-05,
"loss": 0.0158,
"step": 1030
},
{
"epoch": 2.165488347735009,
"grad_norm": 1.4102022647857666,
"learning_rate": 1.2310971524685638e-05,
"loss": 0.0236,
"step": 1035
},
{
"epoch": 2.175962293794187,
"grad_norm": 1.0941966772079468,
"learning_rate": 1.2210898687163808e-05,
"loss": 0.03,
"step": 1040
},
{
"epoch": 2.1864362398533648,
"grad_norm": 1.8256818056106567,
"learning_rate": 1.2110592509313261e-05,
"loss": 0.0387,
"step": 1045
},
{
"epoch": 2.1969101859125426,
"grad_norm": 1.2805190086364746,
"learning_rate": 1.201006357754243e-05,
"loss": 0.027,
"step": 1050
},
{
"epoch": 2.1969101859125426,
"eval_loss": 0.054396990686655045,
"eval_runtime": 181.6161,
"eval_samples_per_second": 10.511,
"eval_steps_per_second": 2.632,
"step": 1050
},
{
"epoch": 2.2073841319717205,
"grad_norm": 1.5364525318145752,
"learning_rate": 1.1909322501769407e-05,
"loss": 0.0205,
"step": 1055
},
{
"epoch": 2.2178580780308983,
"grad_norm": 2.694061040878296,
"learning_rate": 1.1808379914302166e-05,
"loss": 0.0347,
"step": 1060
},
{
"epoch": 2.228332024090076,
"grad_norm": 1.2438369989395142,
"learning_rate": 1.1707246468716411e-05,
"loss": 0.0503,
"step": 1065
},
{
"epoch": 2.2388059701492535,
"grad_norm": 1.5222554206848145,
"learning_rate": 1.1605932838731194e-05,
"loss": 0.0438,
"step": 1070
},
{
"epoch": 2.2492799162084314,
"grad_norm": 1.7822566032409668,
"learning_rate": 1.15044497170824e-05,
"loss": 0.0345,
"step": 1075
},
{
"epoch": 2.259753862267609,
"grad_norm": 1.48551607131958,
"learning_rate": 1.1402807814394216e-05,
"loss": 0.0342,
"step": 1080
},
{
"epoch": 2.270227808326787,
"grad_norm": 2.0183334350585938,
"learning_rate": 1.130101785804874e-05,
"loss": 0.0277,
"step": 1085
},
{
"epoch": 2.280701754385965,
"grad_norm": 1.0673748254776,
"learning_rate": 1.1199090591053784e-05,
"loss": 0.0237,
"step": 1090
},
{
"epoch": 2.2911757004451427,
"grad_norm": 1.9523701667785645,
"learning_rate": 1.1097036770909055e-05,
"loss": 0.0403,
"step": 1095
},
{
"epoch": 2.3016496465043206,
"grad_norm": 0.7670222520828247,
"learning_rate": 1.0994867168470806e-05,
"loss": 0.0213,
"step": 1100
},
{
"epoch": 2.3016496465043206,
"eval_loss": 0.05162982642650604,
"eval_runtime": 182.2699,
"eval_samples_per_second": 10.473,
"eval_steps_per_second": 2.622,
"step": 1100
},
{
"epoch": 2.3121235925634984,
"grad_norm": 1.686271071434021,
"learning_rate": 1.0892592566815061e-05,
"loss": 0.0303,
"step": 1105
},
{
"epoch": 2.3225975386226763,
"grad_norm": 1.5811524391174316,
"learning_rate": 1.079022376009955e-05,
"loss": 0.0193,
"step": 1110
},
{
"epoch": 2.333071484681854,
"grad_norm": 1.9558700323104858,
"learning_rate": 1.0687771552424504e-05,
"loss": 0.0269,
"step": 1115
},
{
"epoch": 2.3435454307410315,
"grad_norm": 1.3908772468566895,
"learning_rate": 1.0585246756692366e-05,
"loss": 0.0307,
"step": 1120
},
{
"epoch": 2.3540193768002093,
"grad_norm": 1.5732723474502563,
"learning_rate": 1.0482660193466594e-05,
"loss": 0.0184,
"step": 1125
},
{
"epoch": 2.364493322859387,
"grad_norm": 1.5866297483444214,
"learning_rate": 1.0380022689829638e-05,
"loss": 0.0263,
"step": 1130
},
{
"epoch": 2.374967268918565,
"grad_norm": 0.7292336821556091,
"learning_rate": 1.0277345078240258e-05,
"loss": 0.0465,
"step": 1135
},
{
"epoch": 2.385441214977743,
"grad_norm": 1.587586522102356,
"learning_rate": 1.0174638195390235e-05,
"loss": 0.0402,
"step": 1140
},
{
"epoch": 2.3959151610369207,
"grad_norm": 1.3230594396591187,
"learning_rate": 1.0071912881060668e-05,
"loss": 0.0274,
"step": 1145
},
{
"epoch": 2.4063891070960985,
"grad_norm": 1.5415374040603638,
"learning_rate": 9.969179976977939e-06,
"loss": 0.0284,
"step": 1150
},
{
"epoch": 2.4063891070960985,
"eval_loss": 0.052570246160030365,
"eval_runtime": 181.6844,
"eval_samples_per_second": 10.507,
"eval_steps_per_second": 2.631,
"step": 1150
},
{
"epoch": 2.4168630531552764,
"grad_norm": 0.8958898782730103,
"learning_rate": 9.866450325669456e-06,
"loss": 0.0231,
"step": 1155
},
{
"epoch": 2.4273369992144542,
"grad_norm": 2.100008487701416,
"learning_rate": 9.763734769319317e-06,
"loss": 0.0357,
"step": 1160
},
{
"epoch": 2.4378109452736316,
"grad_norm": 1.323148488998413,
"learning_rate": 9.661044148624038e-06,
"loss": 0.0237,
"step": 1165
},
{
"epoch": 2.4482848913328095,
"grad_norm": 2.1606085300445557,
"learning_rate": 9.5583893016484e-06,
"loss": 0.0279,
"step": 1170
},
{
"epoch": 2.4587588373919873,
"grad_norm": 1.4878783226013184,
"learning_rate": 9.455781062681583e-06,
"loss": 0.025,
"step": 1175
},
{
"epoch": 2.469232783451165,
"grad_norm": 0.9704115986824036,
"learning_rate": 9.353230261093723e-06,
"loss": 0.0177,
"step": 1180
},
{
"epoch": 2.479706729510343,
"grad_norm": 3.0599184036254883,
"learning_rate": 9.250747720192961e-06,
"loss": 0.0339,
"step": 1185
},
{
"epoch": 2.490180675569521,
"grad_norm": 1.2243415117263794,
"learning_rate": 9.148344256083131e-06,
"loss": 0.0327,
"step": 1190
},
{
"epoch": 2.5006546216286987,
"grad_norm": 0.6634637117385864,
"learning_rate": 9.046030676522242e-06,
"loss": 0.027,
"step": 1195
},
{
"epoch": 2.5111285676878765,
"grad_norm": 0.6147317290306091,
"learning_rate": 8.943817779781788e-06,
"loss": 0.0175,
"step": 1200
},
{
"epoch": 2.5111285676878765,
"eval_loss": 0.05241983383893967,
"eval_runtime": 181.61,
"eval_samples_per_second": 10.512,
"eval_steps_per_second": 2.632,
"step": 1200
},
{
"epoch": 2.5216025137470544,
"grad_norm": 1.175798773765564,
"learning_rate": 8.841716353507118e-06,
"loss": 0.036,
"step": 1205
},
{
"epoch": 2.532076459806232,
"grad_norm": 3.135117292404175,
"learning_rate": 8.739737173578875e-06,
"loss": 0.039,
"step": 1210
},
{
"epoch": 2.54255040586541,
"grad_norm": 1.2280455827713013,
"learning_rate": 8.637891002975708e-06,
"loss": 0.0242,
"step": 1215
},
{
"epoch": 2.5530243519245874,
"grad_norm": 1.851010799407959,
"learning_rate": 8.536188590638334e-06,
"loss": 0.027,
"step": 1220
},
{
"epoch": 2.5634982979837653,
"grad_norm": 1.7395970821380615,
"learning_rate": 8.4346406703351e-06,
"loss": 0.0241,
"step": 1225
},
{
"epoch": 2.573972244042943,
"grad_norm": 1.3405005931854248,
"learning_rate": 8.3332579595291e-06,
"loss": 0.0321,
"step": 1230
},
{
"epoch": 2.584446190102121,
"grad_norm": 2.150904417037964,
"learning_rate": 8.232051158247074e-06,
"loss": 0.0325,
"step": 1235
},
{
"epoch": 2.594920136161299,
"grad_norm": 1.6793160438537598,
"learning_rate": 8.131030947950109e-06,
"loss": 0.0351,
"step": 1240
},
{
"epoch": 2.6053940822204766,
"grad_norm": 1.7281907796859741,
"learning_rate": 8.030207990406286e-06,
"loss": 0.0485,
"step": 1245
},
{
"epoch": 2.6158680282796545,
"grad_norm": 1.0809645652770996,
"learning_rate": 7.929592926565468e-06,
"loss": 0.0218,
"step": 1250
},
{
"epoch": 2.6158680282796545,
"eval_loss": 0.05264349281787872,
"eval_runtime": 181.5098,
"eval_samples_per_second": 10.517,
"eval_steps_per_second": 2.633,
"step": 1250
},
{
"epoch": 2.6263419743388323,
"grad_norm": 1.1241612434387207,
"learning_rate": 7.829196375436197e-06,
"loss": 0.029,
"step": 1255
},
{
"epoch": 2.6368159203980097,
"grad_norm": 1.4399851560592651,
"learning_rate": 7.729028932964995e-06,
"loss": 0.0337,
"step": 1260
},
{
"epoch": 2.6472898664571876,
"grad_norm": 2.769148588180542,
"learning_rate": 7.629101170918041e-06,
"loss": 0.0398,
"step": 1265
},
{
"epoch": 2.6577638125163654,
"grad_norm": 1.6929821968078613,
"learning_rate": 7.529423635765401e-06,
"loss": 0.0182,
"step": 1270
},
{
"epoch": 2.6682377585755432,
"grad_norm": 1.0924474000930786,
"learning_rate": 7.430006847567972e-06,
"loss": 0.0385,
"step": 1275
},
{
"epoch": 2.678711704634721,
"grad_norm": 1.542842984199524,
"learning_rate": 7.330861298867173e-06,
"loss": 0.0311,
"step": 1280
},
{
"epoch": 2.689185650693899,
"grad_norm": 1.0925610065460205,
"learning_rate": 7.2319974535775405e-06,
"loss": 0.0309,
"step": 1285
},
{
"epoch": 2.6996595967530768,
"grad_norm": 1.2981770038604736,
"learning_rate": 7.133425745882375e-06,
"loss": 0.0392,
"step": 1290
},
{
"epoch": 2.7101335428122546,
"grad_norm": 1.56510329246521,
"learning_rate": 7.035156579132506e-06,
"loss": 0.0279,
"step": 1295
},
{
"epoch": 2.7206074888714324,
"grad_norm": 1.6105190515518188,
"learning_rate": 6.93720032474829e-06,
"loss": 0.0253,
"step": 1300
},
{
"epoch": 2.7206074888714324,
"eval_loss": 0.051075223833322525,
"eval_runtime": 181.7152,
"eval_samples_per_second": 10.505,
"eval_steps_per_second": 2.63,
"step": 1300
},
{
"epoch": 2.7310814349306103,
"grad_norm": 2.5957469940185547,
"learning_rate": 6.839567321125035e-06,
"loss": 0.019,
"step": 1305
},
{
"epoch": 2.741555380989788,
"grad_norm": 1.354457974433899,
"learning_rate": 6.74226787254185e-06,
"loss": 0.0274,
"step": 1310
},
{
"epoch": 2.752029327048966,
"grad_norm": 1.0121866464614868,
"learning_rate": 6.645312248074132e-06,
"loss": 0.0193,
"step": 1315
},
{
"epoch": 2.7625032731081434,
"grad_norm": 1.7300618886947632,
"learning_rate": 6.54871068050976e-06,
"loss": 0.0208,
"step": 1320
},
{
"epoch": 2.772977219167321,
"grad_norm": 1.365108609199524,
"learning_rate": 6.452473365269115e-06,
"loss": 0.0267,
"step": 1325
},
{
"epoch": 2.783451165226499,
"grad_norm": 2.3114993572235107,
"learning_rate": 6.356610459329038e-06,
"loss": 0.028,
"step": 1330
},
{
"epoch": 2.793925111285677,
"grad_norm": 1.1482765674591064,
"learning_rate": 6.261132080150868e-06,
"loss": 0.0304,
"step": 1335
},
{
"epoch": 2.8043990573448547,
"grad_norm": 1.3784815073013306,
"learning_rate": 6.166048304612624e-06,
"loss": 0.0245,
"step": 1340
},
{
"epoch": 2.8148730034040326,
"grad_norm": 1.6406880617141724,
"learning_rate": 6.071369167945482e-06,
"loss": 0.027,
"step": 1345
},
{
"epoch": 2.8253469494632104,
"grad_norm": 1.8636596202850342,
"learning_rate": 5.9771046626746585e-06,
"loss": 0.0227,
"step": 1350
},
{
"epoch": 2.8253469494632104,
"eval_loss": 0.05176674574613571,
"eval_runtime": 181.8891,
"eval_samples_per_second": 10.495,
"eval_steps_per_second": 2.628,
"step": 1350
},
{
"epoch": 2.835820895522388,
"grad_norm": 1.8853999376296997,
"learning_rate": 5.883264737564776e-06,
"loss": 0.0326,
"step": 1355
},
{
"epoch": 2.8462948415815656,
"grad_norm": 1.3684381246566772,
"learning_rate": 5.789859296569871e-06,
"loss": 0.018,
"step": 1360
},
{
"epoch": 2.8567687876407435,
"grad_norm": 1.627061367034912,
"learning_rate": 5.696898197788108e-06,
"loss": 0.0293,
"step": 1365
},
{
"epoch": 2.8672427336999213,
"grad_norm": 2.071784496307373,
"learning_rate": 5.6043912524213685e-06,
"loss": 0.0246,
"step": 1370
},
{
"epoch": 2.877716679759099,
"grad_norm": 1.5565595626831055,
"learning_rate": 5.512348223739754e-06,
"loss": 0.0163,
"step": 1375
},
{
"epoch": 2.888190625818277,
"grad_norm": 2.5211095809936523,
"learning_rate": 5.4207788260511505e-06,
"loss": 0.0386,
"step": 1380
},
{
"epoch": 2.898664571877455,
"grad_norm": 1.5942156314849854,
"learning_rate": 5.329692723675994e-06,
"loss": 0.0302,
"step": 1385
},
{
"epoch": 2.9091385179366327,
"grad_norm": 1.4718657732009888,
"learning_rate": 5.239099529927281e-06,
"loss": 0.0318,
"step": 1390
},
{
"epoch": 2.9196124639958105,
"grad_norm": 0.7544646859169006,
"learning_rate": 5.1490088060959495e-06,
"loss": 0.0162,
"step": 1395
},
{
"epoch": 2.9300864100549884,
"grad_norm": 1.2517889738082886,
"learning_rate": 5.0594300604418086e-06,
"loss": 0.0304,
"step": 1400
},
{
"epoch": 2.9300864100549884,
"eval_loss": 0.05129832401871681,
"eval_runtime": 181.4791,
"eval_samples_per_second": 10.519,
"eval_steps_per_second": 2.634,
"step": 1400
},
{
"epoch": 2.940560356114166,
"grad_norm": 0.8101089000701904,
"learning_rate": 4.970372747190006e-06,
"loss": 0.0431,
"step": 1405
},
{
"epoch": 2.951034302173344,
"grad_norm": 1.6314613819122314,
"learning_rate": 4.881846265533209e-06,
"loss": 0.0378,
"step": 1410
},
{
"epoch": 2.9615082482325215,
"grad_norm": 1.186647891998291,
"learning_rate": 4.793859958639635e-06,
"loss": 0.0281,
"step": 1415
},
{
"epoch": 2.9719821942916993,
"grad_norm": 2.1646673679351807,
"learning_rate": 4.7064231126669355e-06,
"loss": 0.0343,
"step": 1420
},
{
"epoch": 2.982456140350877,
"grad_norm": 1.3391481637954712,
"learning_rate": 4.6195449557821495e-06,
"loss": 0.0197,
"step": 1425
},
{
"epoch": 2.992930086410055,
"grad_norm": 2.9808108806610107,
"learning_rate": 4.5332346571877405e-06,
"loss": 0.0302,
"step": 1430
},
{
"epoch": 3.0020947892118355,
"grad_norm": 0.9604336619377136,
"learning_rate": 4.447501326153865e-06,
"loss": 0.0252,
"step": 1435
},
{
"epoch": 3.0125687352710133,
"grad_norm": 1.2666419744491577,
"learning_rate": 4.3623540110569935e-06,
"loss": 0.0179,
"step": 1440
},
{
"epoch": 3.023042681330191,
"grad_norm": 1.4494256973266602,
"learning_rate": 4.277801698424918e-06,
"loss": 0.0218,
"step": 1445
},
{
"epoch": 3.033516627389369,
"grad_norm": 1.1630330085754395,
"learning_rate": 4.1938533119883014e-06,
"loss": 0.018,
"step": 1450
},
{
"epoch": 3.033516627389369,
"eval_loss": 0.05160898342728615,
"eval_runtime": 182.0457,
"eval_samples_per_second": 10.486,
"eval_steps_per_second": 2.626,
"step": 1450
},
{
"epoch": 3.043990573448547,
"grad_norm": 2.2805240154266357,
"learning_rate": 4.110517711738881e-06,
"loss": 0.027,
"step": 1455
},
{
"epoch": 3.0544645195077247,
"grad_norm": 0.7012156248092651,
"learning_rate": 4.0278036929943574e-06,
"loss": 0.0225,
"step": 1460
},
{
"epoch": 3.0649384655669025,
"grad_norm": 1.6349064111709595,
"learning_rate": 3.945719985470128e-06,
"loss": 0.0171,
"step": 1465
},
{
"epoch": 3.07541241162608,
"grad_norm": 1.5148468017578125,
"learning_rate": 3.8642752523579595e-06,
"loss": 0.014,
"step": 1470
},
{
"epoch": 3.0858863576852578,
"grad_norm": 0.9480647444725037,
"learning_rate": 3.7834780894116575e-06,
"loss": 0.0152,
"step": 1475
},
{
"epoch": 3.0963603037444356,
"grad_norm": 2.8382086753845215,
"learning_rate": 3.7033370240398527e-06,
"loss": 0.0239,
"step": 1480
},
{
"epoch": 3.1068342498036134,
"grad_norm": 2.1970698833465576,
"learning_rate": 3.6238605144060314e-06,
"loss": 0.0261,
"step": 1485
},
{
"epoch": 3.1173081958627913,
"grad_norm": 1.1678617000579834,
"learning_rate": 3.545056948535839e-06,
"loss": 0.0158,
"step": 1490
},
{
"epoch": 3.127782141921969,
"grad_norm": 1.8681138753890991,
"learning_rate": 3.466934643431795e-06,
"loss": 0.0175,
"step": 1495
},
{
"epoch": 3.138256087981147,
"grad_norm": 1.5951310396194458,
"learning_rate": 3.389501844195525e-06,
"loss": 0.0193,
"step": 1500
},
{
"epoch": 3.138256087981147,
"eval_loss": 0.05427511781454086,
"eval_runtime": 182.0489,
"eval_samples_per_second": 10.486,
"eval_steps_per_second": 2.626,
"step": 1500
},
{
"epoch": 3.148730034040325,
"grad_norm": 1.1853766441345215,
"learning_rate": 3.3127667231575587e-06,
"loss": 0.0211,
"step": 1505
},
{
"epoch": 3.1592039800995027,
"grad_norm": 2.4959716796875,
"learning_rate": 3.2367373790147973e-06,
"loss": 0.0143,
"step": 1510
},
{
"epoch": 3.1696779261586805,
"grad_norm": 0.8805971741676331,
"learning_rate": 3.1614218359757985e-06,
"loss": 0.0185,
"step": 1515
},
{
"epoch": 3.180151872217858,
"grad_norm": 2.49381160736084,
"learning_rate": 3.0868280429138754e-06,
"loss": 0.0161,
"step": 1520
},
{
"epoch": 3.1906258182770357,
"grad_norm": 1.2514592409133911,
"learning_rate": 3.0129638725281683e-06,
"loss": 0.0198,
"step": 1525
},
{
"epoch": 3.2010997643362136,
"grad_norm": 3.421593427658081,
"learning_rate": 2.9398371205127495e-06,
"loss": 0.0203,
"step": 1530
},
{
"epoch": 3.2115737103953914,
"grad_norm": 1.6247831583023071,
"learning_rate": 2.8674555047338694e-06,
"loss": 0.0165,
"step": 1535
},
{
"epoch": 3.2220476564545693,
"grad_norm": 2.246312141418457,
"learning_rate": 2.7958266644153974e-06,
"loss": 0.0342,
"step": 1540
},
{
"epoch": 3.232521602513747,
"grad_norm": 2.949176788330078,
"learning_rate": 2.7249581593325647e-06,
"loss": 0.0252,
"step": 1545
},
{
"epoch": 3.242995548572925,
"grad_norm": 1.9428445100784302,
"learning_rate": 2.654857469014113e-06,
"loss": 0.0243,
"step": 1550
},
{
"epoch": 3.242995548572925,
"eval_loss": 0.05600380152463913,
"eval_runtime": 182.3825,
"eval_samples_per_second": 10.467,
"eval_steps_per_second": 2.621,
"step": 1550
},
{
"epoch": 3.2534694946321028,
"grad_norm": 1.8825373649597168,
"learning_rate": 2.585531991952893e-06,
"loss": 0.0167,
"step": 1555
},
{
"epoch": 3.2639434406912806,
"grad_norm": 2.28324818611145,
"learning_rate": 2.51698904482501e-06,
"loss": 0.0258,
"step": 1560
},
{
"epoch": 3.274417386750458,
"grad_norm": 1.9099152088165283,
"learning_rate": 2.44923586171763e-06,
"loss": 0.0499,
"step": 1565
},
{
"epoch": 3.284891332809636,
"grad_norm": 2.5200655460357666,
"learning_rate": 2.382279593365482e-06,
"loss": 0.021,
"step": 1570
},
{
"epoch": 3.2953652788688137,
"grad_norm": 1.6834214925765991,
"learning_rate": 2.3161273063961542e-06,
"loss": 0.0219,
"step": 1575
},
{
"epoch": 3.3058392249279915,
"grad_norm": 2.1367030143737793,
"learning_rate": 2.2507859825842883e-06,
"loss": 0.0199,
"step": 1580
},
{
"epoch": 3.3163131709871694,
"grad_norm": 0.7622693777084351,
"learning_rate": 2.1862625181147123e-06,
"loss": 0.0149,
"step": 1585
},
{
"epoch": 3.326787117046347,
"grad_norm": 1.3212164640426636,
"learning_rate": 2.122563722854604e-06,
"loss": 0.0165,
"step": 1590
},
{
"epoch": 3.337261063105525,
"grad_norm": 1.5809417963027954,
"learning_rate": 2.059696319634782e-06,
"loss": 0.015,
"step": 1595
},
{
"epoch": 3.347735009164703,
"grad_norm": 1.2320683002471924,
"learning_rate": 1.9976669435401597e-06,
"loss": 0.0213,
"step": 1600
},
{
"epoch": 3.347735009164703,
"eval_loss": 0.055280230939388275,
"eval_runtime": 182.1887,
"eval_samples_per_second": 10.478,
"eval_steps_per_second": 2.624,
"step": 1600
},
{
"epoch": 3.3582089552238807,
"grad_norm": 1.0370845794677734,
"learning_rate": 1.936482141209486e-06,
"loss": 0.0237,
"step": 1605
},
{
"epoch": 3.3686829012830586,
"grad_norm": 1.2540106773376465,
"learning_rate": 1.8761483701443984e-06,
"loss": 0.0214,
"step": 1610
},
{
"epoch": 3.3791568473422364,
"grad_norm": 1.8267788887023926,
"learning_rate": 1.8166719980278858e-06,
"loss": 0.0202,
"step": 1615
},
{
"epoch": 3.389630793401414,
"grad_norm": 1.5350995063781738,
"learning_rate": 1.758059302052255e-06,
"loss": 0.0206,
"step": 1620
},
{
"epoch": 3.4001047394605917,
"grad_norm": 1.1958850622177124,
"learning_rate": 1.7003164682566165e-06,
"loss": 0.0139,
"step": 1625
},
{
"epoch": 3.4105786855197695,
"grad_norm": 2.2496140003204346,
"learning_rate": 1.6434495908740022e-06,
"loss": 0.0153,
"step": 1630
},
{
"epoch": 3.4210526315789473,
"grad_norm": 0.9056265950202942,
"learning_rate": 1.587464671688187e-06,
"loss": 0.0178,
"step": 1635
},
{
"epoch": 3.431526577638125,
"grad_norm": 0.940555989742279,
"learning_rate": 1.5323676194002456e-06,
"loss": 0.0159,
"step": 1640
},
{
"epoch": 3.442000523697303,
"grad_norm": 1.699397087097168,
"learning_rate": 1.4781642490049398e-06,
"loss": 0.0188,
"step": 1645
},
{
"epoch": 3.452474469756481,
"grad_norm": 1.2530186176300049,
"learning_rate": 1.4248602811770108e-06,
"loss": 0.0157,
"step": 1650
},
{
"epoch": 3.452474469756481,
"eval_loss": 0.055322494357824326,
"eval_runtime": 183.0468,
"eval_samples_per_second": 10.429,
"eval_steps_per_second": 2.611,
"step": 1650
},
{
"epoch": 3.4629484158156587,
"grad_norm": 1.4462732076644897,
"learning_rate": 1.372461341667396e-06,
"loss": 0.026,
"step": 1655
},
{
"epoch": 3.473422361874836,
"grad_norm": 0.42883485555648804,
"learning_rate": 1.3209729607095022e-06,
"loss": 0.0144,
"step": 1660
},
{
"epoch": 3.483896307934014,
"grad_norm": 1.2245005369186401,
"learning_rate": 1.2704005724355273e-06,
"loss": 0.0108,
"step": 1665
},
{
"epoch": 3.494370253993192,
"grad_norm": 1.7988877296447754,
"learning_rate": 1.2207495143029325e-06,
"loss": 0.0228,
"step": 1670
},
{
"epoch": 3.5048442000523696,
"grad_norm": 1.7349547147750854,
"learning_rate": 1.172025026531135e-06,
"loss": 0.0216,
"step": 1675
},
{
"epoch": 3.5153181461115475,
"grad_norm": 0.9366742968559265,
"learning_rate": 1.124232251548445e-06,
"loss": 0.0145,
"step": 1680
},
{
"epoch": 3.5257920921707253,
"grad_norm": 1.6843370199203491,
"learning_rate": 1.0773762334493198e-06,
"loss": 0.0311,
"step": 1685
},
{
"epoch": 3.536266038229903,
"grad_norm": 2.704352855682373,
"learning_rate": 1.0314619174620211e-06,
"loss": 0.0526,
"step": 1690
},
{
"epoch": 3.546739984289081,
"grad_norm": 1.5389641523361206,
"learning_rate": 9.86494149426682e-07,
"loss": 0.0153,
"step": 1695
},
{
"epoch": 3.557213930348259,
"grad_norm": 1.7506754398345947,
"learning_rate": 9.424776752838705e-07,
"loss": 0.0264,
"step": 1700
},
{
"epoch": 3.557213930348259,
"eval_loss": 0.05507681146264076,
"eval_runtime": 183.3526,
"eval_samples_per_second": 10.412,
"eval_steps_per_second": 2.607,
"step": 1700
},
{
"epoch": 3.5676878764074367,
"grad_norm": 2.2783095836639404,
"learning_rate": 8.994171405737051e-07,
"loss": 0.0181,
"step": 1705
},
{
"epoch": 3.5781618224666145,
"grad_norm": 1.6380702257156372,
"learning_rate": 8.573170899455529e-07,
"loss": 0.0241,
"step": 1710
},
{
"epoch": 3.5886357685257924,
"grad_norm": 1.6343145370483398,
"learning_rate": 8.161819666783888e-07,
"loss": 0.0193,
"step": 1715
},
{
"epoch": 3.5991097145849698,
"grad_norm": 2.3693206310272217,
"learning_rate": 7.760161122118493e-07,
"loss": 0.0368,
"step": 1720
},
{
"epoch": 3.6095836606441476,
"grad_norm": 1.108860969543457,
"learning_rate": 7.368237656880217e-07,
"loss": 0.0101,
"step": 1725
},
{
"epoch": 3.6200576067033254,
"grad_norm": 1.584486722946167,
"learning_rate": 6.986090635040555e-07,
"loss": 0.0216,
"step": 1730
},
{
"epoch": 3.6305315527625033,
"grad_norm": 0.9664100408554077,
"learning_rate": 6.61376038875593e-07,
"loss": 0.0112,
"step": 1735
},
{
"epoch": 3.641005498821681,
"grad_norm": 1.3716723918914795,
"learning_rate": 6.251286214111018e-07,
"loss": 0.0221,
"step": 1740
},
{
"epoch": 3.651479444880859,
"grad_norm": 1.3973896503448486,
"learning_rate": 5.898706366971451e-07,
"loss": 0.0383,
"step": 1745
},
{
"epoch": 3.661953390940037,
"grad_norm": 2.2058684825897217,
"learning_rate": 5.556058058946212e-07,
"loss": 0.0439,
"step": 1750
},
{
"epoch": 3.661953390940037,
"eval_loss": 0.05486290529370308,
"eval_runtime": 182.894,
"eval_samples_per_second": 10.438,
"eval_steps_per_second": 2.614,
"step": 1750
},
{
"epoch": 3.672427336999214,
"grad_norm": 0.8177819848060608,
"learning_rate": 5.223377453460266e-07,
"loss": 0.0135,
"step": 1755
},
{
"epoch": 3.682901283058392,
"grad_norm": 1.7943897247314453,
"learning_rate": 4.900699661937914e-07,
"loss": 0.0154,
"step": 1760
},
{
"epoch": 3.69337522911757,
"grad_norm": 1.8057630062103271,
"learning_rate": 4.588058740097012e-07,
"loss": 0.0249,
"step": 1765
},
{
"epoch": 3.7038491751767477,
"grad_norm": 1.58455491065979,
"learning_rate": 4.285487684354772e-07,
"loss": 0.0156,
"step": 1770
},
{
"epoch": 3.7143231212359256,
"grad_norm": 2.5056676864624023,
"learning_rate": 3.9930184283452634e-07,
"loss": 0.0214,
"step": 1775
},
{
"epoch": 3.7247970672951034,
"grad_norm": 0.33922508358955383,
"learning_rate": 3.7106818395490685e-07,
"loss": 0.0096,
"step": 1780
},
{
"epoch": 3.7352710133542812,
"grad_norm": 1.9061384201049805,
"learning_rate": 3.438507716035555e-07,
"loss": 0.016,
"step": 1785
},
{
"epoch": 3.745744959413459,
"grad_norm": 2.3094871044158936,
"learning_rate": 3.176524783317947e-07,
"loss": 0.0204,
"step": 1790
},
{
"epoch": 3.756218905472637,
"grad_norm": 0.8052126169204712,
"learning_rate": 2.924760691321571e-07,
"loss": 0.0182,
"step": 1795
},
{
"epoch": 3.7666928515318148,
"grad_norm": 1.3129606246948242,
"learning_rate": 2.683242011465703e-07,
"loss": 0.0164,
"step": 1800
},
{
"epoch": 3.7666928515318148,
"eval_loss": 0.05502132698893547,
"eval_runtime": 182.2185,
"eval_samples_per_second": 10.476,
"eval_steps_per_second": 2.623,
"step": 1800
},
{
"epoch": 3.7771667975909926,
"grad_norm": 1.7071999311447144,
"learning_rate": 2.45199423385919e-07,
"loss": 0.0214,
"step": 1805
},
{
"epoch": 3.7876407436501704,
"grad_norm": 0.963501513004303,
"learning_rate": 2.2310417646101535e-07,
"loss": 0.0176,
"step": 1810
},
{
"epoch": 3.798114689709348,
"grad_norm": 1.2818574905395508,
"learning_rate": 2.0204079232502006e-07,
"loss": 0.0204,
"step": 1815
},
{
"epoch": 3.8085886357685257,
"grad_norm": 1.4152429103851318,
"learning_rate": 1.8201149402732432e-07,
"loss": 0.0136,
"step": 1820
},
{
"epoch": 3.8190625818277035,
"grad_norm": 1.5160934925079346,
"learning_rate": 1.630183954789233e-07,
"loss": 0.0158,
"step": 1825
},
{
"epoch": 3.8295365278868814,
"grad_norm": 1.2240071296691895,
"learning_rate": 1.4506350122932e-07,
"loss": 0.0106,
"step": 1830
},
{
"epoch": 3.840010473946059,
"grad_norm": 1.8110445737838745,
"learning_rate": 1.2814870625495357e-07,
"loss": 0.0141,
"step": 1835
},
{
"epoch": 3.850484420005237,
"grad_norm": 0.8142175078392029,
"learning_rate": 1.1227579575921022e-07,
"loss": 0.0103,
"step": 1840
},
{
"epoch": 3.860958366064415,
"grad_norm": 2.131216287612915,
"learning_rate": 9.744644498400513e-08,
"loss": 0.0142,
"step": 1845
},
{
"epoch": 3.8714323121235923,
"grad_norm": 1.8197873830795288,
"learning_rate": 8.366221903297944e-08,
"loss": 0.0245,
"step": 1850
},
{
"epoch": 3.8714323121235923,
"eval_loss": 0.055017318576574326,
"eval_runtime": 182.4144,
"eval_samples_per_second": 10.465,
"eval_steps_per_second": 2.62,
"step": 1850
},
{
"epoch": 3.88190625818277,
"grad_norm": 1.9622763395309448,
"learning_rate": 7.092457270631459e-08,
"loss": 0.0266,
"step": 1855
},
{
"epoch": 3.892380204241948,
"grad_norm": 0.672971785068512,
"learning_rate": 5.9234850347197335e-08,
"loss": 0.0117,
"step": 1860
},
{
"epoch": 3.902854150301126,
"grad_norm": 1.1201688051223755,
"learning_rate": 4.8594285699928854e-08,
"loss": 0.0208,
"step": 1865
},
{
"epoch": 3.9133280963603037,
"grad_norm": 0.9653613567352295,
"learning_rate": 3.900400177971775e-08,
"loss": 0.0275,
"step": 1870
},
{
"epoch": 3.9238020424194815,
"grad_norm": 1.7483731508255005,
"learning_rate": 3.04650107541582e-08,
"loss": 0.0229,
"step": 1875
},
{
"epoch": 3.9342759884786593,
"grad_norm": 0.9113327264785767,
"learning_rate": 2.2978213836400974e-08,
"loss": 0.0241,
"step": 1880
},
{
"epoch": 3.944749934537837,
"grad_norm": 0.7190056443214417,
"learning_rate": 1.6544401190040638e-08,
"loss": 0.0086,
"step": 1885
},
{
"epoch": 3.955223880597015,
"grad_norm": 0.8524140119552612,
"learning_rate": 1.1164251845718899e-08,
"loss": 0.0201,
"step": 1890
},
{
"epoch": 3.965697826656193,
"grad_norm": 1.4827734231948853,
"learning_rate": 6.838333629465288e-09,
"loss": 0.0212,
"step": 1895
},
{
"epoch": 3.9761717727153707,
"grad_norm": 1.2435967922210693,
"learning_rate": 3.5671031027595394e-09,
"loss": 0.0168,
"step": 1900
},
{
"epoch": 3.9761717727153707,
"eval_loss": 0.054999224841594696,
"eval_runtime": 182.2951,
"eval_samples_per_second": 10.472,
"eval_steps_per_second": 2.622,
"step": 1900
},
{
"epoch": 3.9866457187745485,
"grad_norm": 1.00032377243042,
"learning_rate": 1.3509055143490213e-09,
"loss": 0.0186,
"step": 1905
},
{
"epoch": 3.9971196648337264,
"grad_norm": 0.9600237011909485,
"learning_rate": 1.8997476381565905e-10,
"loss": 0.0143,
"step": 1910
},
{
"epoch": 4.0,
"step": 1912,
"total_flos": 1.5735198828619366e+17,
"train_loss": 0.07373801102690306,
"train_runtime": 28333.2715,
"train_samples_per_second": 2.156,
"train_steps_per_second": 0.067
}
],
"logging_steps": 5,
"max_steps": 1912,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5735198828619366e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}