{ "best_global_step": 950, "best_metric": 0.05095840245485306, "best_model_checkpoint": "/kaggle/working/Llama-Factory-out/checkpoint-700", "epoch": 4.0, "eval_steps": 50, "global_step": 1912, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010473946059177796, "grad_norm": 21.572948455810547, "learning_rate": 2.0887728459530028e-07, "loss": 1.3683, "step": 5 }, { "epoch": 0.020947892118355592, "grad_norm": 28.60955810546875, "learning_rate": 4.6997389033942563e-07, "loss": 1.5151, "step": 10 }, { "epoch": 0.031421838177533384, "grad_norm": 23.589828491210938, "learning_rate": 7.31070496083551e-07, "loss": 1.5732, "step": 15 }, { "epoch": 0.041895784236711184, "grad_norm": null, "learning_rate": 9.921671018276763e-07, "loss": 1.9134, "step": 20 }, { "epoch": 0.052369730295888976, "grad_norm": 17.49181365966797, "learning_rate": 1.2532637075718015e-06, "loss": 1.4103, "step": 25 }, { "epoch": 0.06284367635506677, "grad_norm": 24.726316452026367, "learning_rate": 1.5143603133159272e-06, "loss": 1.1474, "step": 30 }, { "epoch": 0.07331762241424457, "grad_norm": 16.106216430664062, "learning_rate": 1.7754569190600524e-06, "loss": 1.1247, "step": 35 }, { "epoch": 0.08379156847342237, "grad_norm": 15.223337173461914, "learning_rate": 2.036553524804178e-06, "loss": 0.7444, "step": 40 }, { "epoch": 0.09426551453260015, "grad_norm": 16.522626876831055, "learning_rate": 2.2976501305483033e-06, "loss": 0.5635, "step": 45 }, { "epoch": 0.10473946059177795, "grad_norm": 17.7125244140625, "learning_rate": 2.5587467362924283e-06, "loss": 0.4239, "step": 50 }, { "epoch": 0.10473946059177795, "eval_loss": 0.36827990412712097, "eval_runtime": 181.7089, "eval_samples_per_second": 10.506, "eval_steps_per_second": 2.631, "step": 50 }, { "epoch": 0.11521340665095575, "grad_norm": 6.594980716705322, "learning_rate": 2.819843342036554e-06, "loss": 0.2571, "step": 55 }, { "epoch": 0.12568735271013354, "grad_norm": 
5.109836578369141, "learning_rate": 3.080939947780679e-06, "loss": 0.1924, "step": 60 }, { "epoch": 0.13616129876931135, "grad_norm": null, "learning_rate": 3.3420365535248045e-06, "loss": 0.1311, "step": 65 }, { "epoch": 0.14663524482848914, "grad_norm": 5.705305099487305, "learning_rate": 3.60313315926893e-06, "loss": 0.1412, "step": 70 }, { "epoch": 0.15710919088766692, "grad_norm": 4.945490837097168, "learning_rate": 3.864229765013055e-06, "loss": 0.1438, "step": 75 }, { "epoch": 0.16758313694684474, "grad_norm": 3.300361156463623, "learning_rate": 4.12532637075718e-06, "loss": 0.1146, "step": 80 }, { "epoch": 0.17805708300602252, "grad_norm": 4.6746649742126465, "learning_rate": 4.386422976501306e-06, "loss": 0.1094, "step": 85 }, { "epoch": 0.1885310290652003, "grad_norm": 3.8035058975219727, "learning_rate": 4.647519582245431e-06, "loss": 0.0945, "step": 90 }, { "epoch": 0.19900497512437812, "grad_norm": 3.3794517517089844, "learning_rate": 4.908616187989557e-06, "loss": 0.1162, "step": 95 }, { "epoch": 0.2094789211835559, "grad_norm": 3.917008638381958, "learning_rate": 5.169712793733682e-06, "loss": 0.1005, "step": 100 }, { "epoch": 0.2094789211835559, "eval_loss": 0.11476331204175949, "eval_runtime": 181.6618, "eval_samples_per_second": 10.509, "eval_steps_per_second": 2.631, "step": 100 }, { "epoch": 0.2199528672427337, "grad_norm": 2.673529863357544, "learning_rate": 5.4308093994778075e-06, "loss": 0.077, "step": 105 }, { "epoch": 0.2304268133019115, "grad_norm": 3.3407351970672607, "learning_rate": 5.691906005221932e-06, "loss": 0.0906, "step": 110 }, { "epoch": 0.2409007593610893, "grad_norm": 3.2136423587799072, "learning_rate": 5.9530026109660575e-06, "loss": 0.1303, "step": 115 }, { "epoch": 0.2513747054202671, "grad_norm": 2.6382622718811035, "learning_rate": 6.214099216710183e-06, "loss": 0.0806, "step": 120 }, { "epoch": 0.26184865147944486, "grad_norm": 3.269986867904663, "learning_rate": 6.475195822454308e-06, "loss": 0.0827, "step": 125 }, { 
"epoch": 0.2723225975386227, "grad_norm": 3.785564661026001, "learning_rate": 6.736292428198435e-06, "loss": 0.1256, "step": 130 }, { "epoch": 0.2827965435978005, "grad_norm": 2.529221534729004, "learning_rate": 6.99738903394256e-06, "loss": 0.1075, "step": 135 }, { "epoch": 0.2932704896569783, "grad_norm": 4.19661283493042, "learning_rate": 7.258485639686685e-06, "loss": 0.1118, "step": 140 }, { "epoch": 0.30374443571615606, "grad_norm": 2.460735559463501, "learning_rate": 7.51958224543081e-06, "loss": 0.0738, "step": 145 }, { "epoch": 0.31421838177533384, "grad_norm": 2.7781996726989746, "learning_rate": 7.780678851174935e-06, "loss": 0.0853, "step": 150 }, { "epoch": 0.31421838177533384, "eval_loss": 0.09081956744194031, "eval_runtime": 181.8598, "eval_samples_per_second": 10.497, "eval_steps_per_second": 2.628, "step": 150 }, { "epoch": 0.32469232783451163, "grad_norm": 2.850130319595337, "learning_rate": 8.04177545691906e-06, "loss": 0.0908, "step": 155 }, { "epoch": 0.33516627389368947, "grad_norm": 2.2639331817626953, "learning_rate": 8.302872062663187e-06, "loss": 0.0744, "step": 160 }, { "epoch": 0.34564021995286726, "grad_norm": 3.059605121612549, "learning_rate": 8.563968668407311e-06, "loss": 0.0624, "step": 165 }, { "epoch": 0.35611416601204504, "grad_norm": 1.4469069242477417, "learning_rate": 8.825065274151436e-06, "loss": 0.0644, "step": 170 }, { "epoch": 0.3665881120712228, "grad_norm": 3.2024354934692383, "learning_rate": 9.086161879895562e-06, "loss": 0.0921, "step": 175 }, { "epoch": 0.3770620581304006, "grad_norm": 2.3910789489746094, "learning_rate": 9.347258485639687e-06, "loss": 0.107, "step": 180 }, { "epoch": 0.3875360041895784, "grad_norm": 2.225024938583374, "learning_rate": 9.608355091383813e-06, "loss": 0.0794, "step": 185 }, { "epoch": 0.39800995024875624, "grad_norm": 2.3184664249420166, "learning_rate": 9.869451697127938e-06, "loss": 0.0912, "step": 190 }, { "epoch": 0.408483896307934, "grad_norm": 3.909691572189331, 
"learning_rate": 1.0130548302872064e-05, "loss": 0.075, "step": 195 }, { "epoch": 0.4189578423671118, "grad_norm": 2.332878589630127, "learning_rate": 1.0391644908616189e-05, "loss": 0.0645, "step": 200 }, { "epoch": 0.4189578423671118, "eval_loss": 0.08308149129152298, "eval_runtime": 181.6397, "eval_samples_per_second": 10.51, "eval_steps_per_second": 2.632, "step": 200 }, { "epoch": 0.4294317884262896, "grad_norm": 2.1909031867980957, "learning_rate": 1.0652741514360314e-05, "loss": 0.0566, "step": 205 }, { "epoch": 0.4399057344854674, "grad_norm": 3.95145320892334, "learning_rate": 1.0913838120104438e-05, "loss": 0.0751, "step": 210 }, { "epoch": 0.4503796805446452, "grad_norm": 2.0043461322784424, "learning_rate": 1.1174934725848565e-05, "loss": 0.0738, "step": 215 }, { "epoch": 0.460853626603823, "grad_norm": 2.2231714725494385, "learning_rate": 1.1436031331592689e-05, "loss": 0.0679, "step": 220 }, { "epoch": 0.4713275726630008, "grad_norm": 4.517533302307129, "learning_rate": 1.1697127937336816e-05, "loss": 0.0762, "step": 225 }, { "epoch": 0.4818015187221786, "grad_norm": 5.064544677734375, "learning_rate": 1.1958224543080942e-05, "loss": 0.1181, "step": 230 }, { "epoch": 0.49227546478135636, "grad_norm": 2.1601788997650146, "learning_rate": 1.2219321148825067e-05, "loss": 0.0597, "step": 235 }, { "epoch": 0.5027494108405341, "grad_norm": 2.5625264644622803, "learning_rate": 1.2480417754569192e-05, "loss": 0.0624, "step": 240 }, { "epoch": 0.513223356899712, "grad_norm": 3.436384916305542, "learning_rate": 1.2741514360313316e-05, "loss": 0.0715, "step": 245 }, { "epoch": 0.5236973029588897, "grad_norm": 6.529380798339844, "learning_rate": 1.3002610966057443e-05, "loss": 0.0767, "step": 250 }, { "epoch": 0.5236973029588897, "eval_loss": 0.07550048828125, "eval_runtime": 181.4419, "eval_samples_per_second": 10.521, "eval_steps_per_second": 2.634, "step": 250 }, { "epoch": 0.5341712490180676, "grad_norm": 2.9247043132781982, "learning_rate": 
1.3263707571801567e-05, "loss": 0.0586, "step": 255 }, { "epoch": 0.5446451950772454, "grad_norm": 4.487779140472412, "learning_rate": 1.3524804177545694e-05, "loss": 0.0968, "step": 260 }, { "epoch": 0.5551191411364231, "grad_norm": 2.502134084701538, "learning_rate": 1.3785900783289818e-05, "loss": 0.0658, "step": 265 }, { "epoch": 0.565593087195601, "grad_norm": 2.817639112472534, "learning_rate": 1.4046997389033943e-05, "loss": 0.0706, "step": 270 }, { "epoch": 0.5760670332547787, "grad_norm": 2.812814235687256, "learning_rate": 1.4308093994778069e-05, "loss": 0.0582, "step": 275 }, { "epoch": 0.5865409793139565, "grad_norm": 1.8219794034957886, "learning_rate": 1.4569190600522194e-05, "loss": 0.0583, "step": 280 }, { "epoch": 0.5970149253731343, "grad_norm": 2.640019416809082, "learning_rate": 1.4830287206266321e-05, "loss": 0.0805, "step": 285 }, { "epoch": 0.6074888714323121, "grad_norm": 3.00846791267395, "learning_rate": 1.5091383812010445e-05, "loss": 0.0587, "step": 290 }, { "epoch": 0.61796281749149, "grad_norm": 2.266049861907959, "learning_rate": 1.535248041775457e-05, "loss": 0.0622, "step": 295 }, { "epoch": 0.6284367635506677, "grad_norm": 2.43892502784729, "learning_rate": 1.5613577023498696e-05, "loss": 0.0826, "step": 300 }, { "epoch": 0.6284367635506677, "eval_loss": 0.0705987811088562, "eval_runtime": 181.5418, "eval_samples_per_second": 10.515, "eval_steps_per_second": 2.633, "step": 300 }, { "epoch": 0.6389107096098455, "grad_norm": 2.757784605026245, "learning_rate": 1.587467362924282e-05, "loss": 0.0917, "step": 305 }, { "epoch": 0.6493846556690233, "grad_norm": 3.353879690170288, "learning_rate": 1.6135770234986947e-05, "loss": 0.0689, "step": 310 }, { "epoch": 0.6598586017282011, "grad_norm": 2.3030617237091064, "learning_rate": 1.6396866840731072e-05, "loss": 0.0589, "step": 315 }, { "epoch": 0.6703325477873789, "grad_norm": 1.9910506010055542, "learning_rate": 1.6657963446475198e-05, "loss": 0.0787, "step": 320 }, { "epoch": 
0.6808064938465567, "grad_norm": 1.8802602291107178, "learning_rate": 1.6919060052219323e-05, "loss": 0.0803, "step": 325 }, { "epoch": 0.6912804399057345, "grad_norm": 2.357010841369629, "learning_rate": 1.718015665796345e-05, "loss": 0.065, "step": 330 }, { "epoch": 0.7017543859649122, "grad_norm": 3.608004331588745, "learning_rate": 1.7441253263707574e-05, "loss": 0.0958, "step": 335 }, { "epoch": 0.7122283320240901, "grad_norm": 2.5642309188842773, "learning_rate": 1.77023498694517e-05, "loss": 0.0859, "step": 340 }, { "epoch": 0.7227022780832679, "grad_norm": 2.9146134853363037, "learning_rate": 1.7963446475195825e-05, "loss": 0.0802, "step": 345 }, { "epoch": 0.7331762241424457, "grad_norm": 2.8338112831115723, "learning_rate": 1.822454308093995e-05, "loss": 0.0882, "step": 350 }, { "epoch": 0.7331762241424457, "eval_loss": 0.06691395491361618, "eval_runtime": 181.5091, "eval_samples_per_second": 10.517, "eval_steps_per_second": 2.633, "step": 350 }, { "epoch": 0.7436501702016235, "grad_norm": 2.904839277267456, "learning_rate": 1.8485639686684072e-05, "loss": 0.0623, "step": 355 }, { "epoch": 0.7541241162608012, "grad_norm": 2.482553243637085, "learning_rate": 1.87467362924282e-05, "loss": 0.0611, "step": 360 }, { "epoch": 0.7645980623199791, "grad_norm": 2.968573570251465, "learning_rate": 1.9007832898172326e-05, "loss": 0.0829, "step": 365 }, { "epoch": 0.7750720083791568, "grad_norm": 2.859727144241333, "learning_rate": 1.9268929503916452e-05, "loss": 0.0555, "step": 370 }, { "epoch": 0.7855459544383346, "grad_norm": 1.7544801235198975, "learning_rate": 1.9530026109660577e-05, "loss": 0.0722, "step": 375 }, { "epoch": 0.7960199004975125, "grad_norm": 2.506270408630371, "learning_rate": 1.97911227154047e-05, "loss": 0.0706, "step": 380 }, { "epoch": 0.8064938465566902, "grad_norm": 2.7544281482696533, "learning_rate": 1.9999978891633502e-05, "loss": 0.0561, "step": 385 }, { "epoch": 0.816967792615868, "grad_norm": 1.2377090454101562, "learning_rate": 
1.9999240108162817e-05, "loss": 0.0682, "step": 390 }, { "epoch": 0.8274417386750458, "grad_norm": 3.0974531173706055, "learning_rate": 1.999744599547812e-05, "loss": 0.0804, "step": 395 }, { "epoch": 0.8379156847342236, "grad_norm": 2.9139633178710938, "learning_rate": 1.9994596742931747e-05, "loss": 0.0726, "step": 400 }, { "epoch": 0.8379156847342236, "eval_loss": 0.06348562985658646, "eval_runtime": 181.4276, "eval_samples_per_second": 10.522, "eval_steps_per_second": 2.635, "step": 400 }, { "epoch": 0.8483896307934015, "grad_norm": 3.329805850982666, "learning_rate": 1.9990692651236494e-05, "loss": 0.0636, "step": 405 }, { "epoch": 0.8588635768525792, "grad_norm": 1.405851125717163, "learning_rate": 1.9985734132433876e-05, "loss": 0.0483, "step": 410 }, { "epoch": 0.869337522911757, "grad_norm": 2.3531923294067383, "learning_rate": 1.9979721709850634e-05, "loss": 0.0709, "step": 415 }, { "epoch": 0.8798114689709348, "grad_norm": 1.4560775756835938, "learning_rate": 1.9972656018043505e-05, "loss": 0.0576, "step": 420 }, { "epoch": 0.8902854150301126, "grad_norm": 2.4551849365234375, "learning_rate": 1.996453780273226e-05, "loss": 0.0861, "step": 425 }, { "epoch": 0.9007593610892904, "grad_norm": 4.548062801361084, "learning_rate": 1.9955367920720977e-05, "loss": 0.1325, "step": 430 }, { "epoch": 0.9112333071484682, "grad_norm": 1.5118955373764038, "learning_rate": 1.9945147339807645e-05, "loss": 0.06, "step": 435 }, { "epoch": 0.921707253207646, "grad_norm": 2.8457553386688232, "learning_rate": 1.993387713868199e-05, "loss": 0.0496, "step": 440 }, { "epoch": 0.9321811992668237, "grad_norm": 2.279599666595459, "learning_rate": 1.9921558506811648e-05, "loss": 0.0541, "step": 445 }, { "epoch": 0.9426551453260016, "grad_norm": 1.4517545700073242, "learning_rate": 1.990819274431662e-05, "loss": 0.0711, "step": 450 }, { "epoch": 0.9426551453260016, "eval_loss": 0.06195152550935745, "eval_runtime": 181.5787, "eval_samples_per_second": 10.513, "eval_steps_per_second": 
2.632, "step": 450 }, { "epoch": 0.9531290913851793, "grad_norm": 2.7663371562957764, "learning_rate": 1.989378126183207e-05, "loss": 0.0707, "step": 455 }, { "epoch": 0.9636030374443572, "grad_norm": 2.230884552001953, "learning_rate": 1.987832558035942e-05, "loss": 0.0554, "step": 460 }, { "epoch": 0.974076983503535, "grad_norm": 2.8206303119659424, "learning_rate": 1.9861827331105844e-05, "loss": 0.0658, "step": 465 }, { "epoch": 0.9845509295627127, "grad_norm": 1.7690904140472412, "learning_rate": 1.9844288255312098e-05, "loss": 0.0546, "step": 470 }, { "epoch": 0.9950248756218906, "grad_norm": 2.402695417404175, "learning_rate": 1.982571020406875e-05, "loss": 0.0725, "step": 475 }, { "epoch": 1.0041895784236712, "grad_norm": 0.8933857083320618, "learning_rate": 1.9806095138120824e-05, "loss": 0.0363, "step": 480 }, { "epoch": 1.014663524482849, "grad_norm": 1.5981252193450928, "learning_rate": 1.978544512766084e-05, "loss": 0.0454, "step": 485 }, { "epoch": 1.0251374705420266, "grad_norm": 2.3014566898345947, "learning_rate": 1.9763762352110344e-05, "loss": 0.0455, "step": 490 }, { "epoch": 1.0356114166012045, "grad_norm": 2.267174243927002, "learning_rate": 1.9741049099889874e-05, "loss": 0.0428, "step": 495 }, { "epoch": 1.0460853626603823, "grad_norm": 2.398452043533325, "learning_rate": 1.9717307768177457e-05, "loss": 0.0433, "step": 500 }, { "epoch": 1.0460853626603823, "eval_loss": 0.062211424112319946, "eval_runtime": 181.7607, "eval_samples_per_second": 10.503, "eval_steps_per_second": 2.63, "step": 500 }, { "epoch": 1.0565593087195602, "grad_norm": 2.4606473445892334, "learning_rate": 1.9692540862655587e-05, "loss": 0.0563, "step": 505 }, { "epoch": 1.067033254778738, "grad_norm": 0.9938412308692932, "learning_rate": 1.9666750997246793e-05, "loss": 0.0429, "step": 510 }, { "epoch": 1.0775072008379156, "grad_norm": 2.087348461151123, "learning_rate": 1.963994089383774e-05, "loss": 0.0609, "step": 515 }, { "epoch": 1.0879811468970935, "grad_norm": 
1.5083081722259521, "learning_rate": 1.9612113381991985e-05, "loss": 0.0538, "step": 520 }, { "epoch": 1.0984550929562713, "grad_norm": 1.1394294500350952, "learning_rate": 1.9583271398651327e-05, "loss": 0.0432, "step": 525 }, { "epoch": 1.1089290390154491, "grad_norm": 1.6931722164154053, "learning_rate": 1.9553417987825837e-05, "loss": 0.036, "step": 530 }, { "epoch": 1.1194029850746268, "grad_norm": 2.196749687194824, "learning_rate": 1.952255630027259e-05, "loss": 0.0504, "step": 535 }, { "epoch": 1.1298769311338046, "grad_norm": 1.8391106128692627, "learning_rate": 1.949068959316315e-05, "loss": 0.0391, "step": 540 }, { "epoch": 1.1403508771929824, "grad_norm": 2.4160068035125732, "learning_rate": 1.9457821229739783e-05, "loss": 0.0486, "step": 545 }, { "epoch": 1.1508248232521603, "grad_norm": 1.0730011463165283, "learning_rate": 1.9423954678960502e-05, "loss": 0.0488, "step": 550 }, { "epoch": 1.1508248232521603, "eval_loss": 0.05938513204455376, "eval_runtime": 181.8553, "eval_samples_per_second": 10.497, "eval_steps_per_second": 2.628, "step": 550 }, { "epoch": 1.1612987693113381, "grad_norm": 1.80950927734375, "learning_rate": 1.9389093515132965e-05, "loss": 0.0435, "step": 555 }, { "epoch": 1.1717727153705157, "grad_norm": 2.7154200077056885, "learning_rate": 1.9353241417537216e-05, "loss": 0.0611, "step": 560 }, { "epoch": 1.1822466614296936, "grad_norm": 1.1030880212783813, "learning_rate": 1.9316402170037377e-05, "loss": 0.0531, "step": 565 }, { "epoch": 1.1927206074888714, "grad_norm": 2.1434154510498047, "learning_rate": 1.927857966068232e-05, "loss": 0.0733, "step": 570 }, { "epoch": 1.2031945535480493, "grad_norm": 0.8784016370773315, "learning_rate": 1.923977788129528e-05, "loss": 0.0339, "step": 575 }, { "epoch": 1.2136684996072271, "grad_norm": 1.4416366815567017, "learning_rate": 1.9200000927052586e-05, "loss": 0.0453, "step": 580 }, { "epoch": 1.2241424456664047, "grad_norm": 0.9367201924324036, "learning_rate": 1.9159252996051433e-05, 
"loss": 0.0442, "step": 585 }, { "epoch": 1.2346163917255826, "grad_norm": 3.147280216217041, "learning_rate": 1.911753838886681e-05, "loss": 0.0429, "step": 590 }, { "epoch": 1.2450903377847604, "grad_norm": 2.891639232635498, "learning_rate": 1.907486150809764e-05, "loss": 0.0341, "step": 595 }, { "epoch": 1.2555642838439383, "grad_norm": 1.8960820436477661, "learning_rate": 1.9031226857902087e-05, "loss": 0.0347, "step": 600 }, { "epoch": 1.2555642838439383, "eval_loss": 0.05871057137846947, "eval_runtime": 181.3499, "eval_samples_per_second": 10.527, "eval_steps_per_second": 2.636, "step": 600 }, { "epoch": 1.266038229903116, "grad_norm": 1.8320516347885132, "learning_rate": 1.898663904352221e-05, "loss": 0.0384, "step": 605 }, { "epoch": 1.2765121759622937, "grad_norm": 2.077674150466919, "learning_rate": 1.894110277079791e-05, "loss": 0.0845, "step": 610 }, { "epoch": 1.2869861220214716, "grad_norm": 1.9369480609893799, "learning_rate": 1.8894622845670282e-05, "loss": 0.0418, "step": 615 }, { "epoch": 1.2974600680806494, "grad_norm": 3.845341682434082, "learning_rate": 1.8847204173674378e-05, "loss": 0.0488, "step": 620 }, { "epoch": 1.3079340141398272, "grad_norm": 1.5000770092010498, "learning_rate": 1.8798851759421473e-05, "loss": 0.0553, "step": 625 }, { "epoch": 1.3184079601990049, "grad_norm": 1.5684770345687866, "learning_rate": 1.8749570706070895e-05, "loss": 0.0492, "step": 630 }, { "epoch": 1.3288819062581827, "grad_norm": 2.115903377532959, "learning_rate": 1.8699366214791394e-05, "loss": 0.0424, "step": 635 }, { "epoch": 1.3393558523173605, "grad_norm": 1.7767939567565918, "learning_rate": 1.8648243584212254e-05, "loss": 0.0234, "step": 640 }, { "epoch": 1.3498297983765384, "grad_norm": 1.7302303314208984, "learning_rate": 1.8596208209864022e-05, "loss": 0.0482, "step": 645 }, { "epoch": 1.3603037444357162, "grad_norm": 1.750826358795166, "learning_rate": 1.8543265583609096e-05, "loss": 0.0475, "step": 650 }, { "epoch": 1.3603037444357162, 
"eval_loss": 0.05913909152150154, "eval_runtime": 181.2221, "eval_samples_per_second": 10.534, "eval_steps_per_second": 2.638, "step": 650 }, { "epoch": 1.370777690494894, "grad_norm": 2.049710512161255, "learning_rate": 1.8489421293062087e-05, "loss": 0.044, "step": 655 }, { "epoch": 1.3812516365540717, "grad_norm": 1.9173017740249634, "learning_rate": 1.8434681021000108e-05, "loss": 0.0391, "step": 660 }, { "epoch": 1.3917255826132495, "grad_norm": 2.223348379135132, "learning_rate": 1.8379050544763004e-05, "loss": 0.0393, "step": 665 }, { "epoch": 1.4021995286724274, "grad_norm": 3.047008752822876, "learning_rate": 1.8322535735643604e-05, "loss": 0.044, "step": 670 }, { "epoch": 1.4126734747316052, "grad_norm": 1.5292298793792725, "learning_rate": 1.8265142558268066e-05, "loss": 0.0672, "step": 675 }, { "epoch": 1.4231474207907828, "grad_norm": 1.8190603256225586, "learning_rate": 1.820687706996636e-05, "loss": 0.0458, "step": 680 }, { "epoch": 1.4336213668499607, "grad_norm": 2.0858137607574463, "learning_rate": 1.8147745420132965e-05, "loss": 0.042, "step": 685 }, { "epoch": 1.4440953129091385, "grad_norm": 4.506059646606445, "learning_rate": 1.8087753849577876e-05, "loss": 0.0629, "step": 690 }, { "epoch": 1.4545692589683163, "grad_norm": 2.2428197860717773, "learning_rate": 1.802690868986792e-05, "loss": 0.0486, "step": 695 }, { "epoch": 1.4650432050274942, "grad_norm": 1.942474365234375, "learning_rate": 1.7965216362658528e-05, "loss": 0.0485, "step": 700 }, { "epoch": 1.4650432050274942, "eval_loss": 0.055441830307245255, "eval_runtime": 182.1133, "eval_samples_per_second": 10.482, "eval_steps_per_second": 2.625, "step": 700 }, { "epoch": 1.475517151086672, "grad_norm": 1.306942343711853, "learning_rate": 1.7902683379015996e-05, "loss": 0.0518, "step": 705 }, { "epoch": 1.4859910971458496, "grad_norm": 1.9224140644073486, "learning_rate": 1.7839316338730282e-05, "loss": 0.0579, "step": 710 }, { "epoch": 1.4964650432050275, "grad_norm": 1.8800877332687378, 
"learning_rate": 1.7775121929618462e-05, "loss": 0.0514, "step": 715 }, { "epoch": 1.5069389892642053, "grad_norm": 1.8557875156402588, "learning_rate": 1.771010692681892e-05, "loss": 0.0535, "step": 720 }, { "epoch": 1.517412935323383, "grad_norm": 1.4152109622955322, "learning_rate": 1.764427819207624e-05, "loss": 0.0693, "step": 725 }, { "epoch": 1.5278868813825608, "grad_norm": 3.057999849319458, "learning_rate": 1.7577642673017063e-05, "loss": 0.0429, "step": 730 }, { "epoch": 1.5383608274417386, "grad_norm": 2.492802619934082, "learning_rate": 1.7510207402416798e-05, "loss": 0.04, "step": 735 }, { "epoch": 1.5488347735009165, "grad_norm": 4.143369674682617, "learning_rate": 1.7441979497457384e-05, "loss": 0.058, "step": 740 }, { "epoch": 1.5593087195600943, "grad_norm": 1.7152019739151, "learning_rate": 1.7372966158976143e-05, "loss": 0.0713, "step": 745 }, { "epoch": 1.5697826656192722, "grad_norm": 2.2295591831207275, "learning_rate": 1.7303174670705783e-05, "loss": 0.0421, "step": 750 }, { "epoch": 1.5697826656192722, "eval_loss": 0.05413464084267616, "eval_runtime": 181.9914, "eval_samples_per_second": 10.49, "eval_steps_per_second": 2.626, "step": 750 }, { "epoch": 1.58025661167845, "grad_norm": 1.414204478263855, "learning_rate": 1.7232612398505676e-05, "loss": 0.0499, "step": 755 }, { "epoch": 1.5907305577376276, "grad_norm": 2.8413901329040527, "learning_rate": 1.716128678958445e-05, "loss": 0.0496, "step": 760 }, { "epoch": 1.6012045037968055, "grad_norm": 1.3030387163162231, "learning_rate": 1.708920537171402e-05, "loss": 0.0376, "step": 765 }, { "epoch": 1.611678449855983, "grad_norm": 0.9149934649467468, "learning_rate": 1.7016375752435088e-05, "loss": 0.0313, "step": 770 }, { "epoch": 1.622152395915161, "grad_norm": 2.623652935028076, "learning_rate": 1.694280561825422e-05, "loss": 0.0612, "step": 775 }, { "epoch": 1.6326263419743388, "grad_norm": 1.9939152002334595, "learning_rate": 1.6868502733832647e-05, "loss": 0.0398, "step": 780 }, { 
"epoch": 1.6431002880335166, "grad_norm": 3.7116594314575195, "learning_rate": 1.679347494116673e-05, "loss": 0.0419, "step": 785 }, { "epoch": 1.6535742340926944, "grad_norm": 1.6450990438461304, "learning_rate": 1.6717730158760334e-05, "loss": 0.0387, "step": 790 }, { "epoch": 1.6640481801518723, "grad_norm": 1.863366723060608, "learning_rate": 1.6641276380789107e-05, "loss": 0.0529, "step": 795 }, { "epoch": 1.6745221262110501, "grad_norm": 1.3787758350372314, "learning_rate": 1.656412167625674e-05, "loss": 0.0395, "step": 800 }, { "epoch": 1.6745221262110501, "eval_loss": 0.05458131060004234, "eval_runtime": 181.5768, "eval_samples_per_second": 10.513, "eval_steps_per_second": 2.632, "step": 800 }, { "epoch": 1.684996072270228, "grad_norm": 1.3715674877166748, "learning_rate": 1.6486274188143386e-05, "loss": 0.0335, "step": 805 }, { "epoch": 1.6954700183294056, "grad_norm": 1.6836681365966797, "learning_rate": 1.6407742132546216e-05, "loss": 0.042, "step": 810 }, { "epoch": 1.7059439643885834, "grad_norm": 2.448378324508667, "learning_rate": 1.6328533797812315e-05, "loss": 0.0419, "step": 815 }, { "epoch": 1.716417910447761, "grad_norm": 1.39069664478302, "learning_rate": 1.6248657543663887e-05, "loss": 0.0371, "step": 820 }, { "epoch": 1.7268918565069389, "grad_norm": 2.460034132003784, "learning_rate": 1.6168121800315993e-05, "loss": 0.0481, "step": 825 }, { "epoch": 1.7373658025661167, "grad_norm": 2.401494026184082, "learning_rate": 1.60869350675868e-05, "loss": 0.0769, "step": 830 }, { "epoch": 1.7478397486252946, "grad_norm": 2.444972038269043, "learning_rate": 1.6005105914000508e-05, "loss": 0.0403, "step": 835 }, { "epoch": 1.7583136946844724, "grad_norm": 1.6803293228149414, "learning_rate": 1.5922642975883014e-05, "loss": 0.0433, "step": 840 }, { "epoch": 1.7687876407436502, "grad_norm": 1.660657286643982, "learning_rate": 1.5839554956450435e-05, "loss": 0.043, "step": 845 }, { "epoch": 1.779261586802828, "grad_norm": 1.6761749982833862, 
"learning_rate": 1.5755850624890563e-05, "loss": 0.0483, "step": 850 }, { "epoch": 1.779261586802828, "eval_loss": 0.05199718102812767, "eval_runtime": 181.8182, "eval_samples_per_second": 10.499, "eval_steps_per_second": 2.629, "step": 850 }, { "epoch": 1.7897355328620057, "grad_norm": 1.660902738571167, "learning_rate": 1.5671538815437346e-05, "loss": 0.0451, "step": 855 }, { "epoch": 1.8002094789211835, "grad_norm": 2.425180673599243, "learning_rate": 1.558662842643852e-05, "loss": 0.0514, "step": 860 }, { "epoch": 1.8106834249803614, "grad_norm": 1.8615056276321411, "learning_rate": 1.5501128419416475e-05, "loss": 0.0951, "step": 865 }, { "epoch": 1.821157371039539, "grad_norm": 2.117887258529663, "learning_rate": 1.541504781812244e-05, "loss": 0.0443, "step": 870 }, { "epoch": 1.8316313170987168, "grad_norm": 1.9007426500320435, "learning_rate": 1.532839570758411e-05, "loss": 0.0539, "step": 875 }, { "epoch": 1.8421052631578947, "grad_norm": 1.0283795595169067, "learning_rate": 1.5241181233146798e-05, "loss": 0.0439, "step": 880 }, { "epoch": 1.8525792092170725, "grad_norm": 1.4137732982635498, "learning_rate": 1.5153413599508241e-05, "loss": 0.0454, "step": 885 }, { "epoch": 1.8630531552762504, "grad_norm": 1.5199006795883179, "learning_rate": 1.5065102069747117e-05, "loss": 0.0521, "step": 890 }, { "epoch": 1.8735271013354282, "grad_norm": 1.8887778520584106, "learning_rate": 1.4976255964345407e-05, "loss": 0.0379, "step": 895 }, { "epoch": 1.884001047394606, "grad_norm": 0.687090277671814, "learning_rate": 1.488688466020471e-05, "loss": 0.0421, "step": 900 }, { "epoch": 1.884001047394606, "eval_loss": 0.055315304547548294, "eval_runtime": 181.7379, "eval_samples_per_second": 10.504, "eval_steps_per_second": 2.63, "step": 900 }, { "epoch": 1.8944749934537837, "grad_norm": 3.8431344032287598, "learning_rate": 1.4796997589656605e-05, "loss": 0.0493, "step": 905 }, { "epoch": 1.9049489395129615, "grad_norm": 3.010768413543701, "learning_rate": 
1.470660423946713e-05, "loss": 0.0429, "step": 910 }, { "epoch": 1.9154228855721394, "grad_norm": 1.5146229267120361, "learning_rate": 1.4615714149835557e-05, "loss": 0.0349, "step": 915 }, { "epoch": 1.925896831631317, "grad_norm": 1.2837982177734375, "learning_rate": 1.4524336913387509e-05, "loss": 0.031, "step": 920 }, { "epoch": 1.9363707776904948, "grad_norm": 1.4640088081359863, "learning_rate": 1.4432482174162539e-05, "loss": 0.0433, "step": 925 }, { "epoch": 1.9468447237496727, "grad_norm": 2.3506968021392822, "learning_rate": 1.4340159626596295e-05, "loss": 0.0344, "step": 930 }, { "epoch": 1.9573186698088505, "grad_norm": 1.7294262647628784, "learning_rate": 1.4247379014497356e-05, "loss": 0.0448, "step": 935 }, { "epoch": 1.9677926158680283, "grad_norm": 2.0124881267547607, "learning_rate": 1.4154150130018867e-05, "loss": 0.0531, "step": 940 }, { "epoch": 1.9782665619272062, "grad_norm": 1.9695724248886108, "learning_rate": 1.4060482812625055e-05, "loss": 0.0509, "step": 945 }, { "epoch": 1.988740507986384, "grad_norm": 5.048811435699463, "learning_rate": 1.3966386948052777e-05, "loss": 0.0735, "step": 950 }, { "epoch": 1.988740507986384, "eval_loss": 0.05095840245485306, "eval_runtime": 181.2754, "eval_samples_per_second": 10.531, "eval_steps_per_second": 2.637, "step": 950 }, { "epoch": 1.9992144540455616, "grad_norm": 2.204068899154663, "learning_rate": 1.3871872467268155e-05, "loss": 0.0462, "step": 955 }, { "epoch": 2.0083791568473424, "grad_norm": 1.12019681930542, "learning_rate": 1.3776949345418466e-05, "loss": 0.0368, "step": 960 }, { "epoch": 2.01885310290652, "grad_norm": 0.8073732256889343, "learning_rate": 1.3681627600779353e-05, "loss": 0.0284, "step": 965 }, { "epoch": 2.029327048965698, "grad_norm": 1.6881890296936035, "learning_rate": 1.3585917293697473e-05, "loss": 0.025, "step": 970 }, { "epoch": 2.0398009950248754, "grad_norm": 2.6855087280273438, "learning_rate": 1.3489828525528732e-05, "loss": 0.0447, "step": 975 }, { "epoch": 
2.0502749410840533, "grad_norm": 2.1619064807891846, "learning_rate": 1.3393371437572183e-05, "loss": 0.0254, "step": 980 }, { "epoch": 2.060748887143231, "grad_norm": 2.9052109718322754, "learning_rate": 1.329655620999969e-05, "loss": 0.0427, "step": 985 }, { "epoch": 2.071222833202409, "grad_norm": 1.7290070056915283, "learning_rate": 1.3199393060781507e-05, "loss": 0.0315, "step": 990 }, { "epoch": 2.081696779261587, "grad_norm": 1.7127286195755005, "learning_rate": 1.3101892244607872e-05, "loss": 0.0256, "step": 995 }, { "epoch": 2.0921707253207646, "grad_norm": 1.0866358280181885, "learning_rate": 1.3004064051806712e-05, "loss": 0.0233, "step": 1000 }, { "epoch": 2.0921707253207646, "eval_loss": 0.05503799021244049, "eval_runtime": 181.6508, "eval_samples_per_second": 10.509, "eval_steps_per_second": 2.631, "step": 1000 }, { "epoch": 2.1026446713799425, "grad_norm": 2.119222402572632, "learning_rate": 1.2905918807257578e-05, "loss": 0.0234, "step": 1005 }, { "epoch": 2.1131186174391203, "grad_norm": 2.4023685455322266, "learning_rate": 1.2807466869301978e-05, "loss": 0.0284, "step": 1010 }, { "epoch": 2.123592563498298, "grad_norm": 1.3008413314819336, "learning_rate": 1.2708718628650125e-05, "loss": 0.0245, "step": 1015 }, { "epoch": 2.134066509557476, "grad_norm": 1.8224750757217407, "learning_rate": 1.260968450728429e-05, "loss": 0.0439, "step": 1020 }, { "epoch": 2.1445404556166534, "grad_norm": 1.3979074954986572, "learning_rate": 1.2510374957358877e-05, "loss": 0.0272, "step": 1025 }, { "epoch": 2.1550144016758312, "grad_norm": 1.3777137994766235, "learning_rate": 1.2410800460097265e-05, "loss": 0.0158, "step": 1030 }, { "epoch": 2.165488347735009, "grad_norm": 1.4102022647857666, "learning_rate": 1.2310971524685638e-05, "loss": 0.0236, "step": 1035 }, { "epoch": 2.175962293794187, "grad_norm": 1.0941966772079468, "learning_rate": 1.2210898687163808e-05, "loss": 0.03, "step": 1040 }, { "epoch": 2.1864362398533648, "grad_norm": 1.8256818056106567, 
"learning_rate": 1.2110592509313261e-05, "loss": 0.0387, "step": 1045 }, { "epoch": 2.1969101859125426, "grad_norm": 1.2805190086364746, "learning_rate": 1.201006357754243e-05, "loss": 0.027, "step": 1050 }, { "epoch": 2.1969101859125426, "eval_loss": 0.054396990686655045, "eval_runtime": 181.6161, "eval_samples_per_second": 10.511, "eval_steps_per_second": 2.632, "step": 1050 }, { "epoch": 2.2073841319717205, "grad_norm": 1.5364525318145752, "learning_rate": 1.1909322501769407e-05, "loss": 0.0205, "step": 1055 }, { "epoch": 2.2178580780308983, "grad_norm": 2.694061040878296, "learning_rate": 1.1808379914302166e-05, "loss": 0.0347, "step": 1060 }, { "epoch": 2.228332024090076, "grad_norm": 1.2438369989395142, "learning_rate": 1.1707246468716411e-05, "loss": 0.0503, "step": 1065 }, { "epoch": 2.2388059701492535, "grad_norm": 1.5222554206848145, "learning_rate": 1.1605932838731194e-05, "loss": 0.0438, "step": 1070 }, { "epoch": 2.2492799162084314, "grad_norm": 1.7822566032409668, "learning_rate": 1.15044497170824e-05, "loss": 0.0345, "step": 1075 }, { "epoch": 2.259753862267609, "grad_norm": 1.48551607131958, "learning_rate": 1.1402807814394216e-05, "loss": 0.0342, "step": 1080 }, { "epoch": 2.270227808326787, "grad_norm": 2.0183334350585938, "learning_rate": 1.130101785804874e-05, "loss": 0.0277, "step": 1085 }, { "epoch": 2.280701754385965, "grad_norm": 1.0673748254776, "learning_rate": 1.1199090591053784e-05, "loss": 0.0237, "step": 1090 }, { "epoch": 2.2911757004451427, "grad_norm": 1.9523701667785645, "learning_rate": 1.1097036770909055e-05, "loss": 0.0403, "step": 1095 }, { "epoch": 2.3016496465043206, "grad_norm": 0.7670222520828247, "learning_rate": 1.0994867168470806e-05, "loss": 0.0213, "step": 1100 }, { "epoch": 2.3016496465043206, "eval_loss": 0.05162982642650604, "eval_runtime": 182.2699, "eval_samples_per_second": 10.473, "eval_steps_per_second": 2.622, "step": 1100 }, { "epoch": 2.3121235925634984, "grad_norm": 1.686271071434021, "learning_rate": 
1.0892592566815061e-05, "loss": 0.0303, "step": 1105 }, { "epoch": 2.3225975386226763, "grad_norm": 1.5811524391174316, "learning_rate": 1.079022376009955e-05, "loss": 0.0193, "step": 1110 }, { "epoch": 2.333071484681854, "grad_norm": 1.9558700323104858, "learning_rate": 1.0687771552424504e-05, "loss": 0.0269, "step": 1115 }, { "epoch": 2.3435454307410315, "grad_norm": 1.3908772468566895, "learning_rate": 1.0585246756692366e-05, "loss": 0.0307, "step": 1120 }, { "epoch": 2.3540193768002093, "grad_norm": 1.5732723474502563, "learning_rate": 1.0482660193466594e-05, "loss": 0.0184, "step": 1125 }, { "epoch": 2.364493322859387, "grad_norm": 1.5866297483444214, "learning_rate": 1.0380022689829638e-05, "loss": 0.0263, "step": 1130 }, { "epoch": 2.374967268918565, "grad_norm": 0.7292336821556091, "learning_rate": 1.0277345078240258e-05, "loss": 0.0465, "step": 1135 }, { "epoch": 2.385441214977743, "grad_norm": 1.587586522102356, "learning_rate": 1.0174638195390235e-05, "loss": 0.0402, "step": 1140 }, { "epoch": 2.3959151610369207, "grad_norm": 1.3230594396591187, "learning_rate": 1.0071912881060668e-05, "loss": 0.0274, "step": 1145 }, { "epoch": 2.4063891070960985, "grad_norm": 1.5415374040603638, "learning_rate": 9.969179976977939e-06, "loss": 0.0284, "step": 1150 }, { "epoch": 2.4063891070960985, "eval_loss": 0.052570246160030365, "eval_runtime": 181.6844, "eval_samples_per_second": 10.507, "eval_steps_per_second": 2.631, "step": 1150 }, { "epoch": 2.4168630531552764, "grad_norm": 0.8958898782730103, "learning_rate": 9.866450325669456e-06, "loss": 0.0231, "step": 1155 }, { "epoch": 2.4273369992144542, "grad_norm": 2.100008487701416, "learning_rate": 9.763734769319317e-06, "loss": 0.0357, "step": 1160 }, { "epoch": 2.4378109452736316, "grad_norm": 1.323148488998413, "learning_rate": 9.661044148624038e-06, "loss": 0.0237, "step": 1165 }, { "epoch": 2.4482848913328095, "grad_norm": 2.1606085300445557, "learning_rate": 9.5583893016484e-06, "loss": 0.0279, "step": 1170 }, { 
"epoch": 2.4587588373919873, "grad_norm": 1.4878783226013184, "learning_rate": 9.455781062681583e-06, "loss": 0.025, "step": 1175 }, { "epoch": 2.469232783451165, "grad_norm": 0.9704115986824036, "learning_rate": 9.353230261093723e-06, "loss": 0.0177, "step": 1180 }, { "epoch": 2.479706729510343, "grad_norm": 3.0599184036254883, "learning_rate": 9.250747720192961e-06, "loss": 0.0339, "step": 1185 }, { "epoch": 2.490180675569521, "grad_norm": 1.2243415117263794, "learning_rate": 9.148344256083131e-06, "loss": 0.0327, "step": 1190 }, { "epoch": 2.5006546216286987, "grad_norm": 0.6634637117385864, "learning_rate": 9.046030676522242e-06, "loss": 0.027, "step": 1195 }, { "epoch": 2.5111285676878765, "grad_norm": 0.6147317290306091, "learning_rate": 8.943817779781788e-06, "loss": 0.0175, "step": 1200 }, { "epoch": 2.5111285676878765, "eval_loss": 0.05241983383893967, "eval_runtime": 181.61, "eval_samples_per_second": 10.512, "eval_steps_per_second": 2.632, "step": 1200 }, { "epoch": 2.5216025137470544, "grad_norm": 1.175798773765564, "learning_rate": 8.841716353507118e-06, "loss": 0.036, "step": 1205 }, { "epoch": 2.532076459806232, "grad_norm": 3.135117292404175, "learning_rate": 8.739737173578875e-06, "loss": 0.039, "step": 1210 }, { "epoch": 2.54255040586541, "grad_norm": 1.2280455827713013, "learning_rate": 8.637891002975708e-06, "loss": 0.0242, "step": 1215 }, { "epoch": 2.5530243519245874, "grad_norm": 1.851010799407959, "learning_rate": 8.536188590638334e-06, "loss": 0.027, "step": 1220 }, { "epoch": 2.5634982979837653, "grad_norm": 1.7395970821380615, "learning_rate": 8.4346406703351e-06, "loss": 0.0241, "step": 1225 }, { "epoch": 2.573972244042943, "grad_norm": 1.3405005931854248, "learning_rate": 8.3332579595291e-06, "loss": 0.0321, "step": 1230 }, { "epoch": 2.584446190102121, "grad_norm": 2.150904417037964, "learning_rate": 8.232051158247074e-06, "loss": 0.0325, "step": 1235 }, { "epoch": 2.594920136161299, "grad_norm": 1.6793160438537598, "learning_rate": 
8.131030947950109e-06, "loss": 0.0351, "step": 1240 }, { "epoch": 2.6053940822204766, "grad_norm": 1.7281907796859741, "learning_rate": 8.030207990406286e-06, "loss": 0.0485, "step": 1245 }, { "epoch": 2.6158680282796545, "grad_norm": 1.0809645652770996, "learning_rate": 7.929592926565468e-06, "loss": 0.0218, "step": 1250 }, { "epoch": 2.6158680282796545, "eval_loss": 0.05264349281787872, "eval_runtime": 181.5098, "eval_samples_per_second": 10.517, "eval_steps_per_second": 2.633, "step": 1250 }, { "epoch": 2.6263419743388323, "grad_norm": 1.1241612434387207, "learning_rate": 7.829196375436197e-06, "loss": 0.029, "step": 1255 }, { "epoch": 2.6368159203980097, "grad_norm": 1.4399851560592651, "learning_rate": 7.729028932964995e-06, "loss": 0.0337, "step": 1260 }, { "epoch": 2.6472898664571876, "grad_norm": 2.769148588180542, "learning_rate": 7.629101170918041e-06, "loss": 0.0398, "step": 1265 }, { "epoch": 2.6577638125163654, "grad_norm": 1.6929821968078613, "learning_rate": 7.529423635765401e-06, "loss": 0.0182, "step": 1270 }, { "epoch": 2.6682377585755432, "grad_norm": 1.0924474000930786, "learning_rate": 7.430006847567972e-06, "loss": 0.0385, "step": 1275 }, { "epoch": 2.678711704634721, "grad_norm": 1.542842984199524, "learning_rate": 7.330861298867173e-06, "loss": 0.0311, "step": 1280 }, { "epoch": 2.689185650693899, "grad_norm": 1.0925610065460205, "learning_rate": 7.2319974535775405e-06, "loss": 0.0309, "step": 1285 }, { "epoch": 2.6996595967530768, "grad_norm": 1.2981770038604736, "learning_rate": 7.133425745882375e-06, "loss": 0.0392, "step": 1290 }, { "epoch": 2.7101335428122546, "grad_norm": 1.56510329246521, "learning_rate": 7.035156579132506e-06, "loss": 0.0279, "step": 1295 }, { "epoch": 2.7206074888714324, "grad_norm": 1.6105190515518188, "learning_rate": 6.93720032474829e-06, "loss": 0.0253, "step": 1300 }, { "epoch": 2.7206074888714324, "eval_loss": 0.051075223833322525, "eval_runtime": 181.7152, "eval_samples_per_second": 10.505, 
"eval_steps_per_second": 2.63, "step": 1300 }, { "epoch": 2.7310814349306103, "grad_norm": 2.5957469940185547, "learning_rate": 6.839567321125035e-06, "loss": 0.019, "step": 1305 }, { "epoch": 2.741555380989788, "grad_norm": 1.354457974433899, "learning_rate": 6.74226787254185e-06, "loss": 0.0274, "step": 1310 }, { "epoch": 2.752029327048966, "grad_norm": 1.0121866464614868, "learning_rate": 6.645312248074132e-06, "loss": 0.0193, "step": 1315 }, { "epoch": 2.7625032731081434, "grad_norm": 1.7300618886947632, "learning_rate": 6.54871068050976e-06, "loss": 0.0208, "step": 1320 }, { "epoch": 2.772977219167321, "grad_norm": 1.365108609199524, "learning_rate": 6.452473365269115e-06, "loss": 0.0267, "step": 1325 }, { "epoch": 2.783451165226499, "grad_norm": 2.3114993572235107, "learning_rate": 6.356610459329038e-06, "loss": 0.028, "step": 1330 }, { "epoch": 2.793925111285677, "grad_norm": 1.1482765674591064, "learning_rate": 6.261132080150868e-06, "loss": 0.0304, "step": 1335 }, { "epoch": 2.8043990573448547, "grad_norm": 1.3784815073013306, "learning_rate": 6.166048304612624e-06, "loss": 0.0245, "step": 1340 }, { "epoch": 2.8148730034040326, "grad_norm": 1.6406880617141724, "learning_rate": 6.071369167945482e-06, "loss": 0.027, "step": 1345 }, { "epoch": 2.8253469494632104, "grad_norm": 1.8636596202850342, "learning_rate": 5.9771046626746585e-06, "loss": 0.0227, "step": 1350 }, { "epoch": 2.8253469494632104, "eval_loss": 0.05176674574613571, "eval_runtime": 181.8891, "eval_samples_per_second": 10.495, "eval_steps_per_second": 2.628, "step": 1350 }, { "epoch": 2.835820895522388, "grad_norm": 1.8853999376296997, "learning_rate": 5.883264737564776e-06, "loss": 0.0326, "step": 1355 }, { "epoch": 2.8462948415815656, "grad_norm": 1.3684381246566772, "learning_rate": 5.789859296569871e-06, "loss": 0.018, "step": 1360 }, { "epoch": 2.8567687876407435, "grad_norm": 1.627061367034912, "learning_rate": 5.696898197788108e-06, "loss": 0.0293, "step": 1365 }, { "epoch": 
2.8672427336999213, "grad_norm": 2.071784496307373, "learning_rate": 5.6043912524213685e-06, "loss": 0.0246, "step": 1370 }, { "epoch": 2.877716679759099, "grad_norm": 1.5565595626831055, "learning_rate": 5.512348223739754e-06, "loss": 0.0163, "step": 1375 }, { "epoch": 2.888190625818277, "grad_norm": 2.5211095809936523, "learning_rate": 5.4207788260511505e-06, "loss": 0.0386, "step": 1380 }, { "epoch": 2.898664571877455, "grad_norm": 1.5942156314849854, "learning_rate": 5.329692723675994e-06, "loss": 0.0302, "step": 1385 }, { "epoch": 2.9091385179366327, "grad_norm": 1.4718657732009888, "learning_rate": 5.239099529927281e-06, "loss": 0.0318, "step": 1390 }, { "epoch": 2.9196124639958105, "grad_norm": 0.7544646859169006, "learning_rate": 5.1490088060959495e-06, "loss": 0.0162, "step": 1395 }, { "epoch": 2.9300864100549884, "grad_norm": 1.2517889738082886, "learning_rate": 5.0594300604418086e-06, "loss": 0.0304, "step": 1400 }, { "epoch": 2.9300864100549884, "eval_loss": 0.05129832401871681, "eval_runtime": 181.4791, "eval_samples_per_second": 10.519, "eval_steps_per_second": 2.634, "step": 1400 }, { "epoch": 2.940560356114166, "grad_norm": 0.8101089000701904, "learning_rate": 4.970372747190006e-06, "loss": 0.0431, "step": 1405 }, { "epoch": 2.951034302173344, "grad_norm": 1.6314613819122314, "learning_rate": 4.881846265533209e-06, "loss": 0.0378, "step": 1410 }, { "epoch": 2.9615082482325215, "grad_norm": 1.186647891998291, "learning_rate": 4.793859958639635e-06, "loss": 0.0281, "step": 1415 }, { "epoch": 2.9719821942916993, "grad_norm": 2.1646673679351807, "learning_rate": 4.7064231126669355e-06, "loss": 0.0343, "step": 1420 }, { "epoch": 2.982456140350877, "grad_norm": 1.3391481637954712, "learning_rate": 4.6195449557821495e-06, "loss": 0.0197, "step": 1425 }, { "epoch": 2.992930086410055, "grad_norm": 2.9808108806610107, "learning_rate": 4.5332346571877405e-06, "loss": 0.0302, "step": 1430 }, { "epoch": 3.0020947892118355, "grad_norm": 0.9604336619377136, 
"learning_rate": 4.447501326153865e-06, "loss": 0.0252, "step": 1435 }, { "epoch": 3.0125687352710133, "grad_norm": 1.2666419744491577, "learning_rate": 4.3623540110569935e-06, "loss": 0.0179, "step": 1440 }, { "epoch": 3.023042681330191, "grad_norm": 1.4494256973266602, "learning_rate": 4.277801698424918e-06, "loss": 0.0218, "step": 1445 }, { "epoch": 3.033516627389369, "grad_norm": 1.1630330085754395, "learning_rate": 4.1938533119883014e-06, "loss": 0.018, "step": 1450 }, { "epoch": 3.033516627389369, "eval_loss": 0.05160898342728615, "eval_runtime": 182.0457, "eval_samples_per_second": 10.486, "eval_steps_per_second": 2.626, "step": 1450 }, { "epoch": 3.043990573448547, "grad_norm": 2.2805240154266357, "learning_rate": 4.110517711738881e-06, "loss": 0.027, "step": 1455 }, { "epoch": 3.0544645195077247, "grad_norm": 0.7012156248092651, "learning_rate": 4.0278036929943574e-06, "loss": 0.0225, "step": 1460 }, { "epoch": 3.0649384655669025, "grad_norm": 1.6349064111709595, "learning_rate": 3.945719985470128e-06, "loss": 0.0171, "step": 1465 }, { "epoch": 3.07541241162608, "grad_norm": 1.5148468017578125, "learning_rate": 3.8642752523579595e-06, "loss": 0.014, "step": 1470 }, { "epoch": 3.0858863576852578, "grad_norm": 0.9480647444725037, "learning_rate": 3.7834780894116575e-06, "loss": 0.0152, "step": 1475 }, { "epoch": 3.0963603037444356, "grad_norm": 2.8382086753845215, "learning_rate": 3.7033370240398527e-06, "loss": 0.0239, "step": 1480 }, { "epoch": 3.1068342498036134, "grad_norm": 2.1970698833465576, "learning_rate": 3.6238605144060314e-06, "loss": 0.0261, "step": 1485 }, { "epoch": 3.1173081958627913, "grad_norm": 1.1678617000579834, "learning_rate": 3.545056948535839e-06, "loss": 0.0158, "step": 1490 }, { "epoch": 3.127782141921969, "grad_norm": 1.8681138753890991, "learning_rate": 3.466934643431795e-06, "loss": 0.0175, "step": 1495 }, { "epoch": 3.138256087981147, "grad_norm": 1.5951310396194458, "learning_rate": 3.389501844195525e-06, "loss": 0.0193, 
"step": 1500 }, { "epoch": 3.138256087981147, "eval_loss": 0.05427511781454086, "eval_runtime": 182.0489, "eval_samples_per_second": 10.486, "eval_steps_per_second": 2.626, "step": 1500 }, { "epoch": 3.148730034040325, "grad_norm": 1.1853766441345215, "learning_rate": 3.3127667231575587e-06, "loss": 0.0211, "step": 1505 }, { "epoch": 3.1592039800995027, "grad_norm": 2.4959716796875, "learning_rate": 3.2367373790147973e-06, "loss": 0.0143, "step": 1510 }, { "epoch": 3.1696779261586805, "grad_norm": 0.8805971741676331, "learning_rate": 3.1614218359757985e-06, "loss": 0.0185, "step": 1515 }, { "epoch": 3.180151872217858, "grad_norm": 2.49381160736084, "learning_rate": 3.0868280429138754e-06, "loss": 0.0161, "step": 1520 }, { "epoch": 3.1906258182770357, "grad_norm": 1.2514592409133911, "learning_rate": 3.0129638725281683e-06, "loss": 0.0198, "step": 1525 }, { "epoch": 3.2010997643362136, "grad_norm": 3.421593427658081, "learning_rate": 2.9398371205127495e-06, "loss": 0.0203, "step": 1530 }, { "epoch": 3.2115737103953914, "grad_norm": 1.6247831583023071, "learning_rate": 2.8674555047338694e-06, "loss": 0.0165, "step": 1535 }, { "epoch": 3.2220476564545693, "grad_norm": 2.246312141418457, "learning_rate": 2.7958266644153974e-06, "loss": 0.0342, "step": 1540 }, { "epoch": 3.232521602513747, "grad_norm": 2.949176788330078, "learning_rate": 2.7249581593325647e-06, "loss": 0.0252, "step": 1545 }, { "epoch": 3.242995548572925, "grad_norm": 1.9428445100784302, "learning_rate": 2.654857469014113e-06, "loss": 0.0243, "step": 1550 }, { "epoch": 3.242995548572925, "eval_loss": 0.05600380152463913, "eval_runtime": 182.3825, "eval_samples_per_second": 10.467, "eval_steps_per_second": 2.621, "step": 1550 }, { "epoch": 3.2534694946321028, "grad_norm": 1.8825373649597168, "learning_rate": 2.585531991952893e-06, "loss": 0.0167, "step": 1555 }, { "epoch": 3.2639434406912806, "grad_norm": 2.28324818611145, "learning_rate": 2.51698904482501e-06, "loss": 0.0258, "step": 1560 }, { "epoch": 
3.274417386750458, "grad_norm": 1.9099152088165283, "learning_rate": 2.44923586171763e-06, "loss": 0.0499, "step": 1565 }, { "epoch": 3.284891332809636, "grad_norm": 2.5200655460357666, "learning_rate": 2.382279593365482e-06, "loss": 0.021, "step": 1570 }, { "epoch": 3.2953652788688137, "grad_norm": 1.6834214925765991, "learning_rate": 2.3161273063961542e-06, "loss": 0.0219, "step": 1575 }, { "epoch": 3.3058392249279915, "grad_norm": 2.1367030143737793, "learning_rate": 2.2507859825842883e-06, "loss": 0.0199, "step": 1580 }, { "epoch": 3.3163131709871694, "grad_norm": 0.7622693777084351, "learning_rate": 2.1862625181147123e-06, "loss": 0.0149, "step": 1585 }, { "epoch": 3.326787117046347, "grad_norm": 1.3212164640426636, "learning_rate": 2.122563722854604e-06, "loss": 0.0165, "step": 1590 }, { "epoch": 3.337261063105525, "grad_norm": 1.5809417963027954, "learning_rate": 2.059696319634782e-06, "loss": 0.015, "step": 1595 }, { "epoch": 3.347735009164703, "grad_norm": 1.2320683002471924, "learning_rate": 1.9976669435401597e-06, "loss": 0.0213, "step": 1600 }, { "epoch": 3.347735009164703, "eval_loss": 0.055280230939388275, "eval_runtime": 182.1887, "eval_samples_per_second": 10.478, "eval_steps_per_second": 2.624, "step": 1600 }, { "epoch": 3.3582089552238807, "grad_norm": 1.0370845794677734, "learning_rate": 1.936482141209486e-06, "loss": 0.0237, "step": 1605 }, { "epoch": 3.3686829012830586, "grad_norm": 1.2540106773376465, "learning_rate": 1.8761483701443984e-06, "loss": 0.0214, "step": 1610 }, { "epoch": 3.3791568473422364, "grad_norm": 1.8267788887023926, "learning_rate": 1.8166719980278858e-06, "loss": 0.0202, "step": 1615 }, { "epoch": 3.389630793401414, "grad_norm": 1.5350995063781738, "learning_rate": 1.758059302052255e-06, "loss": 0.0206, "step": 1620 }, { "epoch": 3.4001047394605917, "grad_norm": 1.1958850622177124, "learning_rate": 1.7003164682566165e-06, "loss": 0.0139, "step": 1625 }, { "epoch": 3.4105786855197695, "grad_norm": 2.2496140003204346, 
"learning_rate": 1.6434495908740022e-06, "loss": 0.0153, "step": 1630 }, { "epoch": 3.4210526315789473, "grad_norm": 0.9056265950202942, "learning_rate": 1.587464671688187e-06, "loss": 0.0178, "step": 1635 }, { "epoch": 3.431526577638125, "grad_norm": 0.940555989742279, "learning_rate": 1.5323676194002456e-06, "loss": 0.0159, "step": 1640 }, { "epoch": 3.442000523697303, "grad_norm": 1.699397087097168, "learning_rate": 1.4781642490049398e-06, "loss": 0.0188, "step": 1645 }, { "epoch": 3.452474469756481, "grad_norm": 1.2530186176300049, "learning_rate": 1.4248602811770108e-06, "loss": 0.0157, "step": 1650 }, { "epoch": 3.452474469756481, "eval_loss": 0.055322494357824326, "eval_runtime": 183.0468, "eval_samples_per_second": 10.429, "eval_steps_per_second": 2.611, "step": 1650 }, { "epoch": 3.4629484158156587, "grad_norm": 1.4462732076644897, "learning_rate": 1.372461341667396e-06, "loss": 0.026, "step": 1655 }, { "epoch": 3.473422361874836, "grad_norm": 0.42883485555648804, "learning_rate": 1.3209729607095022e-06, "loss": 0.0144, "step": 1660 }, { "epoch": 3.483896307934014, "grad_norm": 1.2245005369186401, "learning_rate": 1.2704005724355273e-06, "loss": 0.0108, "step": 1665 }, { "epoch": 3.494370253993192, "grad_norm": 1.7988877296447754, "learning_rate": 1.2207495143029325e-06, "loss": 0.0228, "step": 1670 }, { "epoch": 3.5048442000523696, "grad_norm": 1.7349547147750854, "learning_rate": 1.172025026531135e-06, "loss": 0.0216, "step": 1675 }, { "epoch": 3.5153181461115475, "grad_norm": 0.9366742968559265, "learning_rate": 1.124232251548445e-06, "loss": 0.0145, "step": 1680 }, { "epoch": 3.5257920921707253, "grad_norm": 1.6843370199203491, "learning_rate": 1.0773762334493198e-06, "loss": 0.0311, "step": 1685 }, { "epoch": 3.536266038229903, "grad_norm": 2.704352855682373, "learning_rate": 1.0314619174620211e-06, "loss": 0.0526, "step": 1690 }, { "epoch": 3.546739984289081, "grad_norm": 1.5389641523361206, "learning_rate": 9.86494149426682e-07, "loss": 0.0153, 
"step": 1695 }, { "epoch": 3.557213930348259, "grad_norm": 1.7506754398345947, "learning_rate": 9.424776752838705e-07, "loss": 0.0264, "step": 1700 }, { "epoch": 3.557213930348259, "eval_loss": 0.05507681146264076, "eval_runtime": 183.3526, "eval_samples_per_second": 10.412, "eval_steps_per_second": 2.607, "step": 1700 }, { "epoch": 3.5676878764074367, "grad_norm": 2.2783095836639404, "learning_rate": 8.994171405737051e-07, "loss": 0.0181, "step": 1705 }, { "epoch": 3.5781618224666145, "grad_norm": 1.6380702257156372, "learning_rate": 8.573170899455529e-07, "loss": 0.0241, "step": 1710 }, { "epoch": 3.5886357685257924, "grad_norm": 1.6343145370483398, "learning_rate": 8.161819666783888e-07, "loss": 0.0193, "step": 1715 }, { "epoch": 3.5991097145849698, "grad_norm": 2.3693206310272217, "learning_rate": 7.760161122118493e-07, "loss": 0.0368, "step": 1720 }, { "epoch": 3.6095836606441476, "grad_norm": 1.108860969543457, "learning_rate": 7.368237656880217e-07, "loss": 0.0101, "step": 1725 }, { "epoch": 3.6200576067033254, "grad_norm": 1.584486722946167, "learning_rate": 6.986090635040555e-07, "loss": 0.0216, "step": 1730 }, { "epoch": 3.6305315527625033, "grad_norm": 0.9664100408554077, "learning_rate": 6.61376038875593e-07, "loss": 0.0112, "step": 1735 }, { "epoch": 3.641005498821681, "grad_norm": 1.3716723918914795, "learning_rate": 6.251286214111018e-07, "loss": 0.0221, "step": 1740 }, { "epoch": 3.651479444880859, "grad_norm": 1.3973896503448486, "learning_rate": 5.898706366971451e-07, "loss": 0.0383, "step": 1745 }, { "epoch": 3.661953390940037, "grad_norm": 2.2058684825897217, "learning_rate": 5.556058058946212e-07, "loss": 0.0439, "step": 1750 }, { "epoch": 3.661953390940037, "eval_loss": 0.05486290529370308, "eval_runtime": 182.894, "eval_samples_per_second": 10.438, "eval_steps_per_second": 2.614, "step": 1750 }, { "epoch": 3.672427336999214, "grad_norm": 0.8177819848060608, "learning_rate": 5.223377453460266e-07, "loss": 0.0135, "step": 1755 }, { "epoch": 
3.682901283058392, "grad_norm": 1.7943897247314453, "learning_rate": 4.900699661937914e-07, "loss": 0.0154, "step": 1760 }, { "epoch": 3.69337522911757, "grad_norm": 1.8057630062103271, "learning_rate": 4.588058740097012e-07, "loss": 0.0249, "step": 1765 }, { "epoch": 3.7038491751767477, "grad_norm": 1.58455491065979, "learning_rate": 4.285487684354772e-07, "loss": 0.0156, "step": 1770 }, { "epoch": 3.7143231212359256, "grad_norm": 2.5056676864624023, "learning_rate": 3.9930184283452634e-07, "loss": 0.0214, "step": 1775 }, { "epoch": 3.7247970672951034, "grad_norm": 0.33922508358955383, "learning_rate": 3.7106818395490685e-07, "loss": 0.0096, "step": 1780 }, { "epoch": 3.7352710133542812, "grad_norm": 1.9061384201049805, "learning_rate": 3.438507716035555e-07, "loss": 0.016, "step": 1785 }, { "epoch": 3.745744959413459, "grad_norm": 2.3094871044158936, "learning_rate": 3.176524783317947e-07, "loss": 0.0204, "step": 1790 }, { "epoch": 3.756218905472637, "grad_norm": 0.8052126169204712, "learning_rate": 2.924760691321571e-07, "loss": 0.0182, "step": 1795 }, { "epoch": 3.7666928515318148, "grad_norm": 1.3129606246948242, "learning_rate": 2.683242011465703e-07, "loss": 0.0164, "step": 1800 }, { "epoch": 3.7666928515318148, "eval_loss": 0.05502132698893547, "eval_runtime": 182.2185, "eval_samples_per_second": 10.476, "eval_steps_per_second": 2.623, "step": 1800 }, { "epoch": 3.7771667975909926, "grad_norm": 1.7071999311447144, "learning_rate": 2.45199423385919e-07, "loss": 0.0214, "step": 1805 }, { "epoch": 3.7876407436501704, "grad_norm": 0.963501513004303, "learning_rate": 2.2310417646101535e-07, "loss": 0.0176, "step": 1810 }, { "epoch": 3.798114689709348, "grad_norm": 1.2818574905395508, "learning_rate": 2.0204079232502006e-07, "loss": 0.0204, "step": 1815 }, { "epoch": 3.8085886357685257, "grad_norm": 1.4152429103851318, "learning_rate": 1.8201149402732432e-07, "loss": 0.0136, "step": 1820 }, { "epoch": 3.8190625818277035, "grad_norm": 1.5160934925079346, 
"learning_rate": 1.630183954789233e-07, "loss": 0.0158, "step": 1825 }, { "epoch": 3.8295365278868814, "grad_norm": 1.2240071296691895, "learning_rate": 1.4506350122932e-07, "loss": 0.0106, "step": 1830 }, { "epoch": 3.840010473946059, "grad_norm": 1.8110445737838745, "learning_rate": 1.2814870625495357e-07, "loss": 0.0141, "step": 1835 }, { "epoch": 3.850484420005237, "grad_norm": 0.8142175078392029, "learning_rate": 1.1227579575921022e-07, "loss": 0.0103, "step": 1840 }, { "epoch": 3.860958366064415, "grad_norm": 2.131216287612915, "learning_rate": 9.744644498400513e-08, "loss": 0.0142, "step": 1845 }, { "epoch": 3.8714323121235923, "grad_norm": 1.8197873830795288, "learning_rate": 8.366221903297944e-08, "loss": 0.0245, "step": 1850 }, { "epoch": 3.8714323121235923, "eval_loss": 0.055017318576574326, "eval_runtime": 182.4144, "eval_samples_per_second": 10.465, "eval_steps_per_second": 2.62, "step": 1850 }, { "epoch": 3.88190625818277, "grad_norm": 1.9622763395309448, "learning_rate": 7.092457270631459e-08, "loss": 0.0266, "step": 1855 }, { "epoch": 3.892380204241948, "grad_norm": 0.672971785068512, "learning_rate": 5.9234850347197335e-08, "loss": 0.0117, "step": 1860 }, { "epoch": 3.902854150301126, "grad_norm": 1.1201688051223755, "learning_rate": 4.8594285699928854e-08, "loss": 0.0208, "step": 1865 }, { "epoch": 3.9133280963603037, "grad_norm": 0.9653613567352295, "learning_rate": 3.900400177971775e-08, "loss": 0.0275, "step": 1870 }, { "epoch": 3.9238020424194815, "grad_norm": 1.7483731508255005, "learning_rate": 3.04650107541582e-08, "loss": 0.0229, "step": 1875 }, { "epoch": 3.9342759884786593, "grad_norm": 0.9113327264785767, "learning_rate": 2.2978213836400974e-08, "loss": 0.0241, "step": 1880 }, { "epoch": 3.944749934537837, "grad_norm": 0.7190056443214417, "learning_rate": 1.6544401190040638e-08, "loss": 0.0086, "step": 1885 }, { "epoch": 3.955223880597015, "grad_norm": 0.8524140119552612, "learning_rate": 1.1164251845718899e-08, "loss": 0.0201, "step": 
1890 }, { "epoch": 3.965697826656193, "grad_norm": 1.4827734231948853, "learning_rate": 6.838333629465288e-09, "loss": 0.0212, "step": 1895 }, { "epoch": 3.9761717727153707, "grad_norm": 1.2435967922210693, "learning_rate": 3.5671031027595394e-09, "loss": 0.0168, "step": 1900 }, { "epoch": 3.9761717727153707, "eval_loss": 0.054999224841594696, "eval_runtime": 182.2951, "eval_samples_per_second": 10.472, "eval_steps_per_second": 2.622, "step": 1900 }, { "epoch": 3.9866457187745485, "grad_norm": 1.00032377243042, "learning_rate": 1.3509055143490213e-09, "loss": 0.0186, "step": 1905 }, { "epoch": 3.9971196648337264, "grad_norm": 0.9600237011909485, "learning_rate": 1.8997476381565905e-10, "loss": 0.0143, "step": 1910 }, { "epoch": 4.0, "step": 1912, "total_flos": 1.5735198828619366e+17, "train_loss": 0.07373801102690306, "train_runtime": 28333.2715, "train_samples_per_second": 2.156, "train_steps_per_second": 0.067 } ], "logging_steps": 5, "max_steps": 1912, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5735198828619366e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }