Invalid JSON: Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
| { | |
| "best_global_step": 950, | |
| "best_metric": 0.05095840245485306, | |
| "best_model_checkpoint": "/kaggle/working/Llama-Factory-out/checkpoint-700", | |
| "epoch": 4.0, | |
| "eval_steps": 50, | |
| "global_step": 1912, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.010473946059177796, | |
| "grad_norm": 21.572948455810547, | |
| "learning_rate": 2.0887728459530028e-07, | |
| "loss": 1.3683, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.020947892118355592, | |
| "grad_norm": 28.60955810546875, | |
| "learning_rate": 4.6997389033942563e-07, | |
| "loss": 1.5151, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.031421838177533384, | |
| "grad_norm": 23.589828491210938, | |
| "learning_rate": 7.31070496083551e-07, | |
| "loss": 1.5732, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.041895784236711184, | |
| "grad_norm": NaN, | |
| "learning_rate": 9.921671018276763e-07, | |
| "loss": 1.9134, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.052369730295888976, | |
| "grad_norm": 17.49181365966797, | |
| "learning_rate": 1.2532637075718015e-06, | |
| "loss": 1.4103, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.06284367635506677, | |
| "grad_norm": 24.726316452026367, | |
| "learning_rate": 1.5143603133159272e-06, | |
| "loss": 1.1474, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07331762241424457, | |
| "grad_norm": 16.106216430664062, | |
| "learning_rate": 1.7754569190600524e-06, | |
| "loss": 1.1247, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.08379156847342237, | |
| "grad_norm": 15.223337173461914, | |
| "learning_rate": 2.036553524804178e-06, | |
| "loss": 0.7444, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.09426551453260015, | |
| "grad_norm": 16.522626876831055, | |
| "learning_rate": 2.2976501305483033e-06, | |
| "loss": 0.5635, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.10473946059177795, | |
| "grad_norm": 17.7125244140625, | |
| "learning_rate": 2.5587467362924283e-06, | |
| "loss": 0.4239, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10473946059177795, | |
| "eval_loss": 0.36827990412712097, | |
| "eval_runtime": 181.7089, | |
| "eval_samples_per_second": 10.506, | |
| "eval_steps_per_second": 2.631, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.11521340665095575, | |
| "grad_norm": 6.594980716705322, | |
| "learning_rate": 2.819843342036554e-06, | |
| "loss": 0.2571, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.12568735271013354, | |
| "grad_norm": 5.109836578369141, | |
| "learning_rate": 3.080939947780679e-06, | |
| "loss": 0.1924, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.13616129876931135, | |
| "grad_norm": NaN, | |
| "learning_rate": 3.3420365535248045e-06, | |
| "loss": 0.1311, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.14663524482848914, | |
| "grad_norm": 5.705305099487305, | |
| "learning_rate": 3.60313315926893e-06, | |
| "loss": 0.1412, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.15710919088766692, | |
| "grad_norm": 4.945490837097168, | |
| "learning_rate": 3.864229765013055e-06, | |
| "loss": 0.1438, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.16758313694684474, | |
| "grad_norm": 3.300361156463623, | |
| "learning_rate": 4.12532637075718e-06, | |
| "loss": 0.1146, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.17805708300602252, | |
| "grad_norm": 4.6746649742126465, | |
| "learning_rate": 4.386422976501306e-06, | |
| "loss": 0.1094, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.1885310290652003, | |
| "grad_norm": 3.8035058975219727, | |
| "learning_rate": 4.647519582245431e-06, | |
| "loss": 0.0945, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.19900497512437812, | |
| "grad_norm": 3.3794517517089844, | |
| "learning_rate": 4.908616187989557e-06, | |
| "loss": 0.1162, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.2094789211835559, | |
| "grad_norm": 3.917008638381958, | |
| "learning_rate": 5.169712793733682e-06, | |
| "loss": 0.1005, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2094789211835559, | |
| "eval_loss": 0.11476331204175949, | |
| "eval_runtime": 181.6618, | |
| "eval_samples_per_second": 10.509, | |
| "eval_steps_per_second": 2.631, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2199528672427337, | |
| "grad_norm": 2.673529863357544, | |
| "learning_rate": 5.4308093994778075e-06, | |
| "loss": 0.077, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.2304268133019115, | |
| "grad_norm": 3.3407351970672607, | |
| "learning_rate": 5.691906005221932e-06, | |
| "loss": 0.0906, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.2409007593610893, | |
| "grad_norm": 3.2136423587799072, | |
| "learning_rate": 5.9530026109660575e-06, | |
| "loss": 0.1303, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2513747054202671, | |
| "grad_norm": 2.6382622718811035, | |
| "learning_rate": 6.214099216710183e-06, | |
| "loss": 0.0806, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.26184865147944486, | |
| "grad_norm": 3.269986867904663, | |
| "learning_rate": 6.475195822454308e-06, | |
| "loss": 0.0827, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2723225975386227, | |
| "grad_norm": 3.785564661026001, | |
| "learning_rate": 6.736292428198435e-06, | |
| "loss": 0.1256, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2827965435978005, | |
| "grad_norm": 2.529221534729004, | |
| "learning_rate": 6.99738903394256e-06, | |
| "loss": 0.1075, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.2932704896569783, | |
| "grad_norm": 4.19661283493042, | |
| "learning_rate": 7.258485639686685e-06, | |
| "loss": 0.1118, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.30374443571615606, | |
| "grad_norm": 2.460735559463501, | |
| "learning_rate": 7.51958224543081e-06, | |
| "loss": 0.0738, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.31421838177533384, | |
| "grad_norm": 2.7781996726989746, | |
| "learning_rate": 7.780678851174935e-06, | |
| "loss": 0.0853, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.31421838177533384, | |
| "eval_loss": 0.09081956744194031, | |
| "eval_runtime": 181.8598, | |
| "eval_samples_per_second": 10.497, | |
| "eval_steps_per_second": 2.628, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.32469232783451163, | |
| "grad_norm": 2.850130319595337, | |
| "learning_rate": 8.04177545691906e-06, | |
| "loss": 0.0908, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.33516627389368947, | |
| "grad_norm": 2.2639331817626953, | |
| "learning_rate": 8.302872062663187e-06, | |
| "loss": 0.0744, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.34564021995286726, | |
| "grad_norm": 3.059605121612549, | |
| "learning_rate": 8.563968668407311e-06, | |
| "loss": 0.0624, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.35611416601204504, | |
| "grad_norm": 1.4469069242477417, | |
| "learning_rate": 8.825065274151436e-06, | |
| "loss": 0.0644, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3665881120712228, | |
| "grad_norm": 3.2024354934692383, | |
| "learning_rate": 9.086161879895562e-06, | |
| "loss": 0.0921, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.3770620581304006, | |
| "grad_norm": 2.3910789489746094, | |
| "learning_rate": 9.347258485639687e-06, | |
| "loss": 0.107, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3875360041895784, | |
| "grad_norm": 2.225024938583374, | |
| "learning_rate": 9.608355091383813e-06, | |
| "loss": 0.0794, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.39800995024875624, | |
| "grad_norm": 2.3184664249420166, | |
| "learning_rate": 9.869451697127938e-06, | |
| "loss": 0.0912, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.408483896307934, | |
| "grad_norm": 3.909691572189331, | |
| "learning_rate": 1.0130548302872064e-05, | |
| "loss": 0.075, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.4189578423671118, | |
| "grad_norm": 2.332878589630127, | |
| "learning_rate": 1.0391644908616189e-05, | |
| "loss": 0.0645, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4189578423671118, | |
| "eval_loss": 0.08308149129152298, | |
| "eval_runtime": 181.6397, | |
| "eval_samples_per_second": 10.51, | |
| "eval_steps_per_second": 2.632, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4294317884262896, | |
| "grad_norm": 2.1909031867980957, | |
| "learning_rate": 1.0652741514360314e-05, | |
| "loss": 0.0566, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.4399057344854674, | |
| "grad_norm": 3.95145320892334, | |
| "learning_rate": 1.0913838120104438e-05, | |
| "loss": 0.0751, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4503796805446452, | |
| "grad_norm": 2.0043461322784424, | |
| "learning_rate": 1.1174934725848565e-05, | |
| "loss": 0.0738, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.460853626603823, | |
| "grad_norm": 2.2231714725494385, | |
| "learning_rate": 1.1436031331592689e-05, | |
| "loss": 0.0679, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4713275726630008, | |
| "grad_norm": 4.517533302307129, | |
| "learning_rate": 1.1697127937336816e-05, | |
| "loss": 0.0762, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.4818015187221786, | |
| "grad_norm": 5.064544677734375, | |
| "learning_rate": 1.1958224543080942e-05, | |
| "loss": 0.1181, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.49227546478135636, | |
| "grad_norm": 2.1601788997650146, | |
| "learning_rate": 1.2219321148825067e-05, | |
| "loss": 0.0597, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5027494108405341, | |
| "grad_norm": 2.5625264644622803, | |
| "learning_rate": 1.2480417754569192e-05, | |
| "loss": 0.0624, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.513223356899712, | |
| "grad_norm": 3.436384916305542, | |
| "learning_rate": 1.2741514360313316e-05, | |
| "loss": 0.0715, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.5236973029588897, | |
| "grad_norm": 6.529380798339844, | |
| "learning_rate": 1.3002610966057443e-05, | |
| "loss": 0.0767, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5236973029588897, | |
| "eval_loss": 0.07550048828125, | |
| "eval_runtime": 181.4419, | |
| "eval_samples_per_second": 10.521, | |
| "eval_steps_per_second": 2.634, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5341712490180676, | |
| "grad_norm": 2.9247043132781982, | |
| "learning_rate": 1.3263707571801567e-05, | |
| "loss": 0.0586, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.5446451950772454, | |
| "grad_norm": 4.487779140472412, | |
| "learning_rate": 1.3524804177545694e-05, | |
| "loss": 0.0968, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5551191411364231, | |
| "grad_norm": 2.502134084701538, | |
| "learning_rate": 1.3785900783289818e-05, | |
| "loss": 0.0658, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.565593087195601, | |
| "grad_norm": 2.817639112472534, | |
| "learning_rate": 1.4046997389033943e-05, | |
| "loss": 0.0706, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5760670332547787, | |
| "grad_norm": 2.812814235687256, | |
| "learning_rate": 1.4308093994778069e-05, | |
| "loss": 0.0582, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.5865409793139565, | |
| "grad_norm": 1.8219794034957886, | |
| "learning_rate": 1.4569190600522194e-05, | |
| "loss": 0.0583, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 2.640019416809082, | |
| "learning_rate": 1.4830287206266321e-05, | |
| "loss": 0.0805, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.6074888714323121, | |
| "grad_norm": 3.00846791267395, | |
| "learning_rate": 1.5091383812010445e-05, | |
| "loss": 0.0587, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.61796281749149, | |
| "grad_norm": 2.266049861907959, | |
| "learning_rate": 1.535248041775457e-05, | |
| "loss": 0.0622, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.6284367635506677, | |
| "grad_norm": 2.43892502784729, | |
| "learning_rate": 1.5613577023498696e-05, | |
| "loss": 0.0826, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6284367635506677, | |
| "eval_loss": 0.0705987811088562, | |
| "eval_runtime": 181.5418, | |
| "eval_samples_per_second": 10.515, | |
| "eval_steps_per_second": 2.633, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6389107096098455, | |
| "grad_norm": 2.757784605026245, | |
| "learning_rate": 1.587467362924282e-05, | |
| "loss": 0.0917, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6493846556690233, | |
| "grad_norm": 3.353879690170288, | |
| "learning_rate": 1.6135770234986947e-05, | |
| "loss": 0.0689, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6598586017282011, | |
| "grad_norm": 2.3030617237091064, | |
| "learning_rate": 1.6396866840731072e-05, | |
| "loss": 0.0589, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6703325477873789, | |
| "grad_norm": 1.9910506010055542, | |
| "learning_rate": 1.6657963446475198e-05, | |
| "loss": 0.0787, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6808064938465567, | |
| "grad_norm": 1.8802602291107178, | |
| "learning_rate": 1.6919060052219323e-05, | |
| "loss": 0.0803, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.6912804399057345, | |
| "grad_norm": 2.357010841369629, | |
| "learning_rate": 1.718015665796345e-05, | |
| "loss": 0.065, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7017543859649122, | |
| "grad_norm": 3.608004331588745, | |
| "learning_rate": 1.7441253263707574e-05, | |
| "loss": 0.0958, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.7122283320240901, | |
| "grad_norm": 2.5642309188842773, | |
| "learning_rate": 1.77023498694517e-05, | |
| "loss": 0.0859, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7227022780832679, | |
| "grad_norm": 2.9146134853363037, | |
| "learning_rate": 1.7963446475195825e-05, | |
| "loss": 0.0802, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.7331762241424457, | |
| "grad_norm": 2.8338112831115723, | |
| "learning_rate": 1.822454308093995e-05, | |
| "loss": 0.0882, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7331762241424457, | |
| "eval_loss": 0.06691395491361618, | |
| "eval_runtime": 181.5091, | |
| "eval_samples_per_second": 10.517, | |
| "eval_steps_per_second": 2.633, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7436501702016235, | |
| "grad_norm": 2.904839277267456, | |
| "learning_rate": 1.8485639686684072e-05, | |
| "loss": 0.0623, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7541241162608012, | |
| "grad_norm": 2.482553243637085, | |
| "learning_rate": 1.87467362924282e-05, | |
| "loss": 0.0611, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7645980623199791, | |
| "grad_norm": 2.968573570251465, | |
| "learning_rate": 1.9007832898172326e-05, | |
| "loss": 0.0829, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.7750720083791568, | |
| "grad_norm": 2.859727144241333, | |
| "learning_rate": 1.9268929503916452e-05, | |
| "loss": 0.0555, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.7855459544383346, | |
| "grad_norm": 1.7544801235198975, | |
| "learning_rate": 1.9530026109660577e-05, | |
| "loss": 0.0722, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.7960199004975125, | |
| "grad_norm": 2.506270408630371, | |
| "learning_rate": 1.97911227154047e-05, | |
| "loss": 0.0706, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8064938465566902, | |
| "grad_norm": 2.7544281482696533, | |
| "learning_rate": 1.9999978891633502e-05, | |
| "loss": 0.0561, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.816967792615868, | |
| "grad_norm": 1.2377090454101562, | |
| "learning_rate": 1.9999240108162817e-05, | |
| "loss": 0.0682, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8274417386750458, | |
| "grad_norm": 3.0974531173706055, | |
| "learning_rate": 1.999744599547812e-05, | |
| "loss": 0.0804, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.8379156847342236, | |
| "grad_norm": 2.9139633178710938, | |
| "learning_rate": 1.9994596742931747e-05, | |
| "loss": 0.0726, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8379156847342236, | |
| "eval_loss": 0.06348562985658646, | |
| "eval_runtime": 181.4276, | |
| "eval_samples_per_second": 10.522, | |
| "eval_steps_per_second": 2.635, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8483896307934015, | |
| "grad_norm": 3.329805850982666, | |
| "learning_rate": 1.9990692651236494e-05, | |
| "loss": 0.0636, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.8588635768525792, | |
| "grad_norm": 1.405851125717163, | |
| "learning_rate": 1.9985734132433876e-05, | |
| "loss": 0.0483, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.869337522911757, | |
| "grad_norm": 2.3531923294067383, | |
| "learning_rate": 1.9979721709850634e-05, | |
| "loss": 0.0709, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.8798114689709348, | |
| "grad_norm": 1.4560775756835938, | |
| "learning_rate": 1.9972656018043505e-05, | |
| "loss": 0.0576, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8902854150301126, | |
| "grad_norm": 2.4551849365234375, | |
| "learning_rate": 1.996453780273226e-05, | |
| "loss": 0.0861, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.9007593610892904, | |
| "grad_norm": 4.548062801361084, | |
| "learning_rate": 1.9955367920720977e-05, | |
| "loss": 0.1325, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9112333071484682, | |
| "grad_norm": 1.5118955373764038, | |
| "learning_rate": 1.9945147339807645e-05, | |
| "loss": 0.06, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.921707253207646, | |
| "grad_norm": 2.8457553386688232, | |
| "learning_rate": 1.993387713868199e-05, | |
| "loss": 0.0496, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9321811992668237, | |
| "grad_norm": 2.279599666595459, | |
| "learning_rate": 1.9921558506811648e-05, | |
| "loss": 0.0541, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.9426551453260016, | |
| "grad_norm": 1.4517545700073242, | |
| "learning_rate": 1.990819274431662e-05, | |
| "loss": 0.0711, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9426551453260016, | |
| "eval_loss": 0.06195152550935745, | |
| "eval_runtime": 181.5787, | |
| "eval_samples_per_second": 10.513, | |
| "eval_steps_per_second": 2.632, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9531290913851793, | |
| "grad_norm": 2.7663371562957764, | |
| "learning_rate": 1.989378126183207e-05, | |
| "loss": 0.0707, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.9636030374443572, | |
| "grad_norm": 2.230884552001953, | |
| "learning_rate": 1.987832558035942e-05, | |
| "loss": 0.0554, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.974076983503535, | |
| "grad_norm": 2.8206303119659424, | |
| "learning_rate": 1.9861827331105844e-05, | |
| "loss": 0.0658, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.9845509295627127, | |
| "grad_norm": 1.7690904140472412, | |
| "learning_rate": 1.9844288255312098e-05, | |
| "loss": 0.0546, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9950248756218906, | |
| "grad_norm": 2.402695417404175, | |
| "learning_rate": 1.982571020406875e-05, | |
| "loss": 0.0725, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.0041895784236712, | |
| "grad_norm": 0.8933857083320618, | |
| "learning_rate": 1.9806095138120824e-05, | |
| "loss": 0.0363, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.014663524482849, | |
| "grad_norm": 1.5981252193450928, | |
| "learning_rate": 1.978544512766084e-05, | |
| "loss": 0.0454, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.0251374705420266, | |
| "grad_norm": 2.3014566898345947, | |
| "learning_rate": 1.9763762352110344e-05, | |
| "loss": 0.0455, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.0356114166012045, | |
| "grad_norm": 2.267174243927002, | |
| "learning_rate": 1.9741049099889874e-05, | |
| "loss": 0.0428, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.0460853626603823, | |
| "grad_norm": 2.398452043533325, | |
| "learning_rate": 1.9717307768177457e-05, | |
| "loss": 0.0433, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0460853626603823, | |
| "eval_loss": 0.062211424112319946, | |
| "eval_runtime": 181.7607, | |
| "eval_samples_per_second": 10.503, | |
| "eval_steps_per_second": 2.63, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0565593087195602, | |
| "grad_norm": 2.4606473445892334, | |
| "learning_rate": 1.9692540862655587e-05, | |
| "loss": 0.0563, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.067033254778738, | |
| "grad_norm": 0.9938412308692932, | |
| "learning_rate": 1.9666750997246793e-05, | |
| "loss": 0.0429, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0775072008379156, | |
| "grad_norm": 2.087348461151123, | |
| "learning_rate": 1.963994089383774e-05, | |
| "loss": 0.0609, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.0879811468970935, | |
| "grad_norm": 1.5083081722259521, | |
| "learning_rate": 1.9612113381991985e-05, | |
| "loss": 0.0538, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.0984550929562713, | |
| "grad_norm": 1.1394294500350952, | |
| "learning_rate": 1.9583271398651327e-05, | |
| "loss": 0.0432, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.1089290390154491, | |
| "grad_norm": 1.6931722164154053, | |
| "learning_rate": 1.9553417987825837e-05, | |
| "loss": 0.036, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.1194029850746268, | |
| "grad_norm": 2.196749687194824, | |
| "learning_rate": 1.952255630027259e-05, | |
| "loss": 0.0504, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.1298769311338046, | |
| "grad_norm": 1.8391106128692627, | |
| "learning_rate": 1.949068959316315e-05, | |
| "loss": 0.0391, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.1403508771929824, | |
| "grad_norm": 2.4160068035125732, | |
| "learning_rate": 1.9457821229739783e-05, | |
| "loss": 0.0486, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.1508248232521603, | |
| "grad_norm": 1.0730011463165283, | |
| "learning_rate": 1.9423954678960502e-05, | |
| "loss": 0.0488, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1508248232521603, | |
| "eval_loss": 0.05938513204455376, | |
| "eval_runtime": 181.8553, | |
| "eval_samples_per_second": 10.497, | |
| "eval_steps_per_second": 2.628, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1612987693113381, | |
| "grad_norm": 1.80950927734375, | |
| "learning_rate": 1.9389093515132965e-05, | |
| "loss": 0.0435, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.1717727153705157, | |
| "grad_norm": 2.7154200077056885, | |
| "learning_rate": 1.9353241417537216e-05, | |
| "loss": 0.0611, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.1822466614296936, | |
| "grad_norm": 1.1030880212783813, | |
| "learning_rate": 1.9316402170037377e-05, | |
| "loss": 0.0531, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.1927206074888714, | |
| "grad_norm": 2.1434154510498047, | |
| "learning_rate": 1.927857966068232e-05, | |
| "loss": 0.0733, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.2031945535480493, | |
| "grad_norm": 0.8784016370773315, | |
| "learning_rate": 1.923977788129528e-05, | |
| "loss": 0.0339, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.2136684996072271, | |
| "grad_norm": 1.4416366815567017, | |
| "learning_rate": 1.9200000927052586e-05, | |
| "loss": 0.0453, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.2241424456664047, | |
| "grad_norm": 0.9367201924324036, | |
| "learning_rate": 1.9159252996051433e-05, | |
| "loss": 0.0442, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.2346163917255826, | |
| "grad_norm": 3.147280216217041, | |
| "learning_rate": 1.911753838886681e-05, | |
| "loss": 0.0429, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.2450903377847604, | |
| "grad_norm": 2.891639232635498, | |
| "learning_rate": 1.907486150809764e-05, | |
| "loss": 0.0341, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.2555642838439383, | |
| "grad_norm": 1.8960820436477661, | |
| "learning_rate": 1.9031226857902087e-05, | |
| "loss": 0.0347, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2555642838439383, | |
| "eval_loss": 0.05871057137846947, | |
| "eval_runtime": 181.3499, | |
| "eval_samples_per_second": 10.527, | |
| "eval_steps_per_second": 2.636, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.266038229903116, | |
| "grad_norm": 1.8320516347885132, | |
| "learning_rate": 1.898663904352221e-05, | |
| "loss": 0.0384, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.2765121759622937, | |
| "grad_norm": 2.077674150466919, | |
| "learning_rate": 1.894110277079791e-05, | |
| "loss": 0.0845, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.2869861220214716, | |
| "grad_norm": 1.9369480609893799, | |
| "learning_rate": 1.8894622845670282e-05, | |
| "loss": 0.0418, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.2974600680806494, | |
| "grad_norm": 3.845341682434082, | |
| "learning_rate": 1.8847204173674378e-05, | |
| "loss": 0.0488, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.3079340141398272, | |
| "grad_norm": 1.5000770092010498, | |
| "learning_rate": 1.8798851759421473e-05, | |
| "loss": 0.0553, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.3184079601990049, | |
| "grad_norm": 1.5684770345687866, | |
| "learning_rate": 1.8749570706070895e-05, | |
| "loss": 0.0492, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.3288819062581827, | |
| "grad_norm": 2.115903377532959, | |
| "learning_rate": 1.8699366214791394e-05, | |
| "loss": 0.0424, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.3393558523173605, | |
| "grad_norm": 1.7767939567565918, | |
| "learning_rate": 1.8648243584212254e-05, | |
| "loss": 0.0234, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.3498297983765384, | |
| "grad_norm": 1.7302303314208984, | |
| "learning_rate": 1.8596208209864022e-05, | |
| "loss": 0.0482, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.3603037444357162, | |
| "grad_norm": 1.750826358795166, | |
| "learning_rate": 1.8543265583609096e-05, | |
| "loss": 0.0475, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.3603037444357162, | |
| "eval_loss": 0.05913909152150154, | |
| "eval_runtime": 181.2221, | |
| "eval_samples_per_second": 10.534, | |
| "eval_steps_per_second": 2.638, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.370777690494894, | |
| "grad_norm": 2.049710512161255, | |
| "learning_rate": 1.8489421293062087e-05, | |
| "loss": 0.044, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.3812516365540717, | |
| "grad_norm": 1.9173017740249634, | |
| "learning_rate": 1.8434681021000108e-05, | |
| "loss": 0.0391, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.3917255826132495, | |
| "grad_norm": 2.223348379135132, | |
| "learning_rate": 1.8379050544763004e-05, | |
| "loss": 0.0393, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.4021995286724274, | |
| "grad_norm": 3.047008752822876, | |
| "learning_rate": 1.8322535735643604e-05, | |
| "loss": 0.044, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.4126734747316052, | |
| "grad_norm": 1.5292298793792725, | |
| "learning_rate": 1.8265142558268066e-05, | |
| "loss": 0.0672, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.4231474207907828, | |
| "grad_norm": 1.8190603256225586, | |
| "learning_rate": 1.820687706996636e-05, | |
| "loss": 0.0458, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.4336213668499607, | |
| "grad_norm": 2.0858137607574463, | |
| "learning_rate": 1.8147745420132965e-05, | |
| "loss": 0.042, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.4440953129091385, | |
| "grad_norm": 4.506059646606445, | |
| "learning_rate": 1.8087753849577876e-05, | |
| "loss": 0.0629, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.4545692589683163, | |
| "grad_norm": 2.2428197860717773, | |
| "learning_rate": 1.802690868986792e-05, | |
| "loss": 0.0486, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.4650432050274942, | |
| "grad_norm": 1.942474365234375, | |
| "learning_rate": 1.7965216362658528e-05, | |
| "loss": 0.0485, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.4650432050274942, | |
| "eval_loss": 0.055441830307245255, | |
| "eval_runtime": 182.1133, | |
| "eval_samples_per_second": 10.482, | |
| "eval_steps_per_second": 2.625, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.475517151086672, | |
| "grad_norm": 1.306942343711853, | |
| "learning_rate": 1.7902683379015996e-05, | |
| "loss": 0.0518, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.4859910971458496, | |
| "grad_norm": 1.9224140644073486, | |
| "learning_rate": 1.7839316338730282e-05, | |
| "loss": 0.0579, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.4964650432050275, | |
| "grad_norm": 1.8800877332687378, | |
| "learning_rate": 1.7775121929618462e-05, | |
| "loss": 0.0514, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.5069389892642053, | |
| "grad_norm": 1.8557875156402588, | |
| "learning_rate": 1.771010692681892e-05, | |
| "loss": 0.0535, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.517412935323383, | |
| "grad_norm": 1.4152109622955322, | |
| "learning_rate": 1.764427819207624e-05, | |
| "loss": 0.0693, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.5278868813825608, | |
| "grad_norm": 3.057999849319458, | |
| "learning_rate": 1.7577642673017063e-05, | |
| "loss": 0.0429, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.5383608274417386, | |
| "grad_norm": 2.492802619934082, | |
| "learning_rate": 1.7510207402416798e-05, | |
| "loss": 0.04, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.5488347735009165, | |
| "grad_norm": 4.143369674682617, | |
| "learning_rate": 1.7441979497457384e-05, | |
| "loss": 0.058, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.5593087195600943, | |
| "grad_norm": 1.7152019739151, | |
| "learning_rate": 1.7372966158976143e-05, | |
| "loss": 0.0713, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.5697826656192722, | |
| "grad_norm": 2.2295591831207275, | |
| "learning_rate": 1.7303174670705783e-05, | |
| "loss": 0.0421, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.5697826656192722, | |
| "eval_loss": 0.05413464084267616, | |
| "eval_runtime": 181.9914, | |
| "eval_samples_per_second": 10.49, | |
| "eval_steps_per_second": 2.626, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.58025661167845, | |
| "grad_norm": 1.414204478263855, | |
| "learning_rate": 1.7232612398505676e-05, | |
| "loss": 0.0499, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.5907305577376276, | |
| "grad_norm": 2.8413901329040527, | |
| "learning_rate": 1.716128678958445e-05, | |
| "loss": 0.0496, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.6012045037968055, | |
| "grad_norm": 1.3030387163162231, | |
| "learning_rate": 1.708920537171402e-05, | |
| "loss": 0.0376, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.611678449855983, | |
| "grad_norm": 0.9149934649467468, | |
| "learning_rate": 1.7016375752435088e-05, | |
| "loss": 0.0313, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.622152395915161, | |
| "grad_norm": 2.623652935028076, | |
| "learning_rate": 1.694280561825422e-05, | |
| "loss": 0.0612, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.6326263419743388, | |
| "grad_norm": 1.9939152002334595, | |
| "learning_rate": 1.6868502733832647e-05, | |
| "loss": 0.0398, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.6431002880335166, | |
| "grad_norm": 3.7116594314575195, | |
| "learning_rate": 1.679347494116673e-05, | |
| "loss": 0.0419, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.6535742340926944, | |
| "grad_norm": 1.6450990438461304, | |
| "learning_rate": 1.6717730158760334e-05, | |
| "loss": 0.0387, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.6640481801518723, | |
| "grad_norm": 1.863366723060608, | |
| "learning_rate": 1.6641276380789107e-05, | |
| "loss": 0.0529, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.6745221262110501, | |
| "grad_norm": 1.3787758350372314, | |
| "learning_rate": 1.656412167625674e-05, | |
| "loss": 0.0395, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.6745221262110501, | |
| "eval_loss": 0.05458131060004234, | |
| "eval_runtime": 181.5768, | |
| "eval_samples_per_second": 10.513, | |
| "eval_steps_per_second": 2.632, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.684996072270228, | |
| "grad_norm": 1.3715674877166748, | |
| "learning_rate": 1.6486274188143386e-05, | |
| "loss": 0.0335, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.6954700183294056, | |
| "grad_norm": 1.6836681365966797, | |
| "learning_rate": 1.6407742132546216e-05, | |
| "loss": 0.042, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.7059439643885834, | |
| "grad_norm": 2.448378324508667, | |
| "learning_rate": 1.6328533797812315e-05, | |
| "loss": 0.0419, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.716417910447761, | |
| "grad_norm": 1.39069664478302, | |
| "learning_rate": 1.6248657543663887e-05, | |
| "loss": 0.0371, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.7268918565069389, | |
| "grad_norm": 2.460034132003784, | |
| "learning_rate": 1.6168121800315993e-05, | |
| "loss": 0.0481, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.7373658025661167, | |
| "grad_norm": 2.401494026184082, | |
| "learning_rate": 1.60869350675868e-05, | |
| "loss": 0.0769, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.7478397486252946, | |
| "grad_norm": 2.444972038269043, | |
| "learning_rate": 1.6005105914000508e-05, | |
| "loss": 0.0403, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.7583136946844724, | |
| "grad_norm": 1.6803293228149414, | |
| "learning_rate": 1.5922642975883014e-05, | |
| "loss": 0.0433, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.7687876407436502, | |
| "grad_norm": 1.660657286643982, | |
| "learning_rate": 1.5839554956450435e-05, | |
| "loss": 0.043, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.779261586802828, | |
| "grad_norm": 1.6761749982833862, | |
| "learning_rate": 1.5755850624890563e-05, | |
| "loss": 0.0483, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.779261586802828, | |
| "eval_loss": 0.05199718102812767, | |
| "eval_runtime": 181.8182, | |
| "eval_samples_per_second": 10.499, | |
| "eval_steps_per_second": 2.629, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.7897355328620057, | |
| "grad_norm": 1.660902738571167, | |
| "learning_rate": 1.5671538815437346e-05, | |
| "loss": 0.0451, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.8002094789211835, | |
| "grad_norm": 2.425180673599243, | |
| "learning_rate": 1.558662842643852e-05, | |
| "loss": 0.0514, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.8106834249803614, | |
| "grad_norm": 1.8615056276321411, | |
| "learning_rate": 1.5501128419416475e-05, | |
| "loss": 0.0951, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.821157371039539, | |
| "grad_norm": 2.117887258529663, | |
| "learning_rate": 1.541504781812244e-05, | |
| "loss": 0.0443, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.8316313170987168, | |
| "grad_norm": 1.9007426500320435, | |
| "learning_rate": 1.532839570758411e-05, | |
| "loss": 0.0539, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 1.0283795595169067, | |
| "learning_rate": 1.5241181233146798e-05, | |
| "loss": 0.0439, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.8525792092170725, | |
| "grad_norm": 1.4137732982635498, | |
| "learning_rate": 1.5153413599508241e-05, | |
| "loss": 0.0454, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.8630531552762504, | |
| "grad_norm": 1.5199006795883179, | |
| "learning_rate": 1.5065102069747117e-05, | |
| "loss": 0.0521, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.8735271013354282, | |
| "grad_norm": 1.8887778520584106, | |
| "learning_rate": 1.4976255964345407e-05, | |
| "loss": 0.0379, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.884001047394606, | |
| "grad_norm": 0.687090277671814, | |
| "learning_rate": 1.488688466020471e-05, | |
| "loss": 0.0421, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.884001047394606, | |
| "eval_loss": 0.055315304547548294, | |
| "eval_runtime": 181.7379, | |
| "eval_samples_per_second": 10.504, | |
| "eval_steps_per_second": 2.63, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.8944749934537837, | |
| "grad_norm": 3.8431344032287598, | |
| "learning_rate": 1.4796997589656605e-05, | |
| "loss": 0.0493, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.9049489395129615, | |
| "grad_norm": 3.010768413543701, | |
| "learning_rate": 1.470660423946713e-05, | |
| "loss": 0.0429, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.9154228855721394, | |
| "grad_norm": 1.5146229267120361, | |
| "learning_rate": 1.4615714149835557e-05, | |
| "loss": 0.0349, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.925896831631317, | |
| "grad_norm": 1.2837982177734375, | |
| "learning_rate": 1.4524336913387509e-05, | |
| "loss": 0.031, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.9363707776904948, | |
| "grad_norm": 1.4640088081359863, | |
| "learning_rate": 1.4432482174162539e-05, | |
| "loss": 0.0433, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 1.9468447237496727, | |
| "grad_norm": 2.3506968021392822, | |
| "learning_rate": 1.4340159626596295e-05, | |
| "loss": 0.0344, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.9573186698088505, | |
| "grad_norm": 1.7294262647628784, | |
| "learning_rate": 1.4247379014497356e-05, | |
| "loss": 0.0448, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 1.9677926158680283, | |
| "grad_norm": 2.0124881267547607, | |
| "learning_rate": 1.4154150130018867e-05, | |
| "loss": 0.0531, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.9782665619272062, | |
| "grad_norm": 1.9695724248886108, | |
| "learning_rate": 1.4060482812625055e-05, | |
| "loss": 0.0509, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 1.988740507986384, | |
| "grad_norm": 5.048811435699463, | |
| "learning_rate": 1.3966386948052777e-05, | |
| "loss": 0.0735, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.988740507986384, | |
| "eval_loss": 0.05095840245485306, | |
| "eval_runtime": 181.2754, | |
| "eval_samples_per_second": 10.531, | |
| "eval_steps_per_second": 2.637, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.9992144540455616, | |
| "grad_norm": 2.204068899154663, | |
| "learning_rate": 1.3871872467268155e-05, | |
| "loss": 0.0462, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 2.0083791568473424, | |
| "grad_norm": 1.12019681930542, | |
| "learning_rate": 1.3776949345418466e-05, | |
| "loss": 0.0368, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.01885310290652, | |
| "grad_norm": 0.8073732256889343, | |
| "learning_rate": 1.3681627600779353e-05, | |
| "loss": 0.0284, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 2.029327048965698, | |
| "grad_norm": 1.6881890296936035, | |
| "learning_rate": 1.3585917293697473e-05, | |
| "loss": 0.025, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.0398009950248754, | |
| "grad_norm": 2.6855087280273438, | |
| "learning_rate": 1.3489828525528732e-05, | |
| "loss": 0.0447, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.0502749410840533, | |
| "grad_norm": 2.1619064807891846, | |
| "learning_rate": 1.3393371437572183e-05, | |
| "loss": 0.0254, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.060748887143231, | |
| "grad_norm": 2.9052109718322754, | |
| "learning_rate": 1.329655620999969e-05, | |
| "loss": 0.0427, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 2.071222833202409, | |
| "grad_norm": 1.7290070056915283, | |
| "learning_rate": 1.3199393060781507e-05, | |
| "loss": 0.0315, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.081696779261587, | |
| "grad_norm": 1.7127286195755005, | |
| "learning_rate": 1.3101892244607872e-05, | |
| "loss": 0.0256, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 2.0921707253207646, | |
| "grad_norm": 1.0866358280181885, | |
| "learning_rate": 1.3004064051806712e-05, | |
| "loss": 0.0233, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.0921707253207646, | |
| "eval_loss": 0.05503799021244049, | |
| "eval_runtime": 181.6508, | |
| "eval_samples_per_second": 10.509, | |
| "eval_steps_per_second": 2.631, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.1026446713799425, | |
| "grad_norm": 2.119222402572632, | |
| "learning_rate": 1.2905918807257578e-05, | |
| "loss": 0.0234, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 2.1131186174391203, | |
| "grad_norm": 2.4023685455322266, | |
| "learning_rate": 1.2807466869301978e-05, | |
| "loss": 0.0284, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.123592563498298, | |
| "grad_norm": 1.3008413314819336, | |
| "learning_rate": 1.2708718628650125e-05, | |
| "loss": 0.0245, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 2.134066509557476, | |
| "grad_norm": 1.8224750757217407, | |
| "learning_rate": 1.260968450728429e-05, | |
| "loss": 0.0439, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.1445404556166534, | |
| "grad_norm": 1.3979074954986572, | |
| "learning_rate": 1.2510374957358877e-05, | |
| "loss": 0.0272, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 2.1550144016758312, | |
| "grad_norm": 1.3777137994766235, | |
| "learning_rate": 1.2410800460097265e-05, | |
| "loss": 0.0158, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.165488347735009, | |
| "grad_norm": 1.4102022647857666, | |
| "learning_rate": 1.2310971524685638e-05, | |
| "loss": 0.0236, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 2.175962293794187, | |
| "grad_norm": 1.0941966772079468, | |
| "learning_rate": 1.2210898687163808e-05, | |
| "loss": 0.03, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.1864362398533648, | |
| "grad_norm": 1.8256818056106567, | |
| "learning_rate": 1.2110592509313261e-05, | |
| "loss": 0.0387, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 2.1969101859125426, | |
| "grad_norm": 1.2805190086364746, | |
| "learning_rate": 1.201006357754243e-05, | |
| "loss": 0.027, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.1969101859125426, | |
| "eval_loss": 0.054396990686655045, | |
| "eval_runtime": 181.6161, | |
| "eval_samples_per_second": 10.511, | |
| "eval_steps_per_second": 2.632, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.2073841319717205, | |
| "grad_norm": 1.5364525318145752, | |
| "learning_rate": 1.1909322501769407e-05, | |
| "loss": 0.0205, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 2.2178580780308983, | |
| "grad_norm": 2.694061040878296, | |
| "learning_rate": 1.1808379914302166e-05, | |
| "loss": 0.0347, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.228332024090076, | |
| "grad_norm": 1.2438369989395142, | |
| "learning_rate": 1.1707246468716411e-05, | |
| "loss": 0.0503, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 1.5222554206848145, | |
| "learning_rate": 1.1605932838731194e-05, | |
| "loss": 0.0438, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.2492799162084314, | |
| "grad_norm": 1.7822566032409668, | |
| "learning_rate": 1.15044497170824e-05, | |
| "loss": 0.0345, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 2.259753862267609, | |
| "grad_norm": 1.48551607131958, | |
| "learning_rate": 1.1402807814394216e-05, | |
| "loss": 0.0342, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.270227808326787, | |
| "grad_norm": 2.0183334350585938, | |
| "learning_rate": 1.130101785804874e-05, | |
| "loss": 0.0277, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 2.280701754385965, | |
| "grad_norm": 1.0673748254776, | |
| "learning_rate": 1.1199090591053784e-05, | |
| "loss": 0.0237, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.2911757004451427, | |
| "grad_norm": 1.9523701667785645, | |
| "learning_rate": 1.1097036770909055e-05, | |
| "loss": 0.0403, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 2.3016496465043206, | |
| "grad_norm": 0.7670222520828247, | |
| "learning_rate": 1.0994867168470806e-05, | |
| "loss": 0.0213, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.3016496465043206, | |
| "eval_loss": 0.05162982642650604, | |
| "eval_runtime": 182.2699, | |
| "eval_samples_per_second": 10.473, | |
| "eval_steps_per_second": 2.622, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.3121235925634984, | |
| "grad_norm": 1.686271071434021, | |
| "learning_rate": 1.0892592566815061e-05, | |
| "loss": 0.0303, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 2.3225975386226763, | |
| "grad_norm": 1.5811524391174316, | |
| "learning_rate": 1.079022376009955e-05, | |
| "loss": 0.0193, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.333071484681854, | |
| "grad_norm": 1.9558700323104858, | |
| "learning_rate": 1.0687771552424504e-05, | |
| "loss": 0.0269, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 2.3435454307410315, | |
| "grad_norm": 1.3908772468566895, | |
| "learning_rate": 1.0585246756692366e-05, | |
| "loss": 0.0307, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.3540193768002093, | |
| "grad_norm": 1.5732723474502563, | |
| "learning_rate": 1.0482660193466594e-05, | |
| "loss": 0.0184, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 2.364493322859387, | |
| "grad_norm": 1.5866297483444214, | |
| "learning_rate": 1.0380022689829638e-05, | |
| "loss": 0.0263, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.374967268918565, | |
| "grad_norm": 0.7292336821556091, | |
| "learning_rate": 1.0277345078240258e-05, | |
| "loss": 0.0465, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 2.385441214977743, | |
| "grad_norm": 1.587586522102356, | |
| "learning_rate": 1.0174638195390235e-05, | |
| "loss": 0.0402, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.3959151610369207, | |
| "grad_norm": 1.3230594396591187, | |
| "learning_rate": 1.0071912881060668e-05, | |
| "loss": 0.0274, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 2.4063891070960985, | |
| "grad_norm": 1.5415374040603638, | |
| "learning_rate": 9.969179976977939e-06, | |
| "loss": 0.0284, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.4063891070960985, | |
| "eval_loss": 0.052570246160030365, | |
| "eval_runtime": 181.6844, | |
| "eval_samples_per_second": 10.507, | |
| "eval_steps_per_second": 2.631, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.4168630531552764, | |
| "grad_norm": 0.8958898782730103, | |
| "learning_rate": 9.866450325669456e-06, | |
| "loss": 0.0231, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 2.4273369992144542, | |
| "grad_norm": 2.100008487701416, | |
| "learning_rate": 9.763734769319317e-06, | |
| "loss": 0.0357, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.4378109452736316, | |
| "grad_norm": 1.323148488998413, | |
| "learning_rate": 9.661044148624038e-06, | |
| "loss": 0.0237, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 2.4482848913328095, | |
| "grad_norm": 2.1606085300445557, | |
| "learning_rate": 9.5583893016484e-06, | |
| "loss": 0.0279, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.4587588373919873, | |
| "grad_norm": 1.4878783226013184, | |
| "learning_rate": 9.455781062681583e-06, | |
| "loss": 0.025, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 2.469232783451165, | |
| "grad_norm": 0.9704115986824036, | |
| "learning_rate": 9.353230261093723e-06, | |
| "loss": 0.0177, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.479706729510343, | |
| "grad_norm": 3.0599184036254883, | |
| "learning_rate": 9.250747720192961e-06, | |
| "loss": 0.0339, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 2.490180675569521, | |
| "grad_norm": 1.2243415117263794, | |
| "learning_rate": 9.148344256083131e-06, | |
| "loss": 0.0327, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.5006546216286987, | |
| "grad_norm": 0.6634637117385864, | |
| "learning_rate": 9.046030676522242e-06, | |
| "loss": 0.027, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 2.5111285676878765, | |
| "grad_norm": 0.6147317290306091, | |
| "learning_rate": 8.943817779781788e-06, | |
| "loss": 0.0175, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.5111285676878765, | |
| "eval_loss": 0.05241983383893967, | |
| "eval_runtime": 181.61, | |
| "eval_samples_per_second": 10.512, | |
| "eval_steps_per_second": 2.632, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.5216025137470544, | |
| "grad_norm": 1.175798773765564, | |
| "learning_rate": 8.841716353507118e-06, | |
| "loss": 0.036, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 2.532076459806232, | |
| "grad_norm": 3.135117292404175, | |
| "learning_rate": 8.739737173578875e-06, | |
| "loss": 0.039, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.54255040586541, | |
| "grad_norm": 1.2280455827713013, | |
| "learning_rate": 8.637891002975708e-06, | |
| "loss": 0.0242, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 2.5530243519245874, | |
| "grad_norm": 1.851010799407959, | |
| "learning_rate": 8.536188590638334e-06, | |
| "loss": 0.027, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.5634982979837653, | |
| "grad_norm": 1.7395970821380615, | |
| "learning_rate": 8.4346406703351e-06, | |
| "loss": 0.0241, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 2.573972244042943, | |
| "grad_norm": 1.3405005931854248, | |
| "learning_rate": 8.3332579595291e-06, | |
| "loss": 0.0321, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.584446190102121, | |
| "grad_norm": 2.150904417037964, | |
| "learning_rate": 8.232051158247074e-06, | |
| "loss": 0.0325, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 2.594920136161299, | |
| "grad_norm": 1.6793160438537598, | |
| "learning_rate": 8.131030947950109e-06, | |
| "loss": 0.0351, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.6053940822204766, | |
| "grad_norm": 1.7281907796859741, | |
| "learning_rate": 8.030207990406286e-06, | |
| "loss": 0.0485, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 2.6158680282796545, | |
| "grad_norm": 1.0809645652770996, | |
| "learning_rate": 7.929592926565468e-06, | |
| "loss": 0.0218, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.6158680282796545, | |
| "eval_loss": 0.05264349281787872, | |
| "eval_runtime": 181.5098, | |
| "eval_samples_per_second": 10.517, | |
| "eval_steps_per_second": 2.633, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.6263419743388323, | |
| "grad_norm": 1.1241612434387207, | |
| "learning_rate": 7.829196375436197e-06, | |
| "loss": 0.029, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 2.6368159203980097, | |
| "grad_norm": 1.4399851560592651, | |
| "learning_rate": 7.729028932964995e-06, | |
| "loss": 0.0337, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.6472898664571876, | |
| "grad_norm": 2.769148588180542, | |
| "learning_rate": 7.629101170918041e-06, | |
| "loss": 0.0398, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 2.6577638125163654, | |
| "grad_norm": 1.6929821968078613, | |
| "learning_rate": 7.529423635765401e-06, | |
| "loss": 0.0182, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.6682377585755432, | |
| "grad_norm": 1.0924474000930786, | |
| "learning_rate": 7.430006847567972e-06, | |
| "loss": 0.0385, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 2.678711704634721, | |
| "grad_norm": 1.542842984199524, | |
| "learning_rate": 7.330861298867173e-06, | |
| "loss": 0.0311, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.689185650693899, | |
| "grad_norm": 1.0925610065460205, | |
| "learning_rate": 7.2319974535775405e-06, | |
| "loss": 0.0309, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 2.6996595967530768, | |
| "grad_norm": 1.2981770038604736, | |
| "learning_rate": 7.133425745882375e-06, | |
| "loss": 0.0392, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.7101335428122546, | |
| "grad_norm": 1.56510329246521, | |
| "learning_rate": 7.035156579132506e-06, | |
| "loss": 0.0279, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 2.7206074888714324, | |
| "grad_norm": 1.6105190515518188, | |
| "learning_rate": 6.93720032474829e-06, | |
| "loss": 0.0253, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.7206074888714324, | |
| "eval_loss": 0.051075223833322525, | |
| "eval_runtime": 181.7152, | |
| "eval_samples_per_second": 10.505, | |
| "eval_steps_per_second": 2.63, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.7310814349306103, | |
| "grad_norm": 2.5957469940185547, | |
| "learning_rate": 6.839567321125035e-06, | |
| "loss": 0.019, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 2.741555380989788, | |
| "grad_norm": 1.354457974433899, | |
| "learning_rate": 6.74226787254185e-06, | |
| "loss": 0.0274, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.752029327048966, | |
| "grad_norm": 1.0121866464614868, | |
| "learning_rate": 6.645312248074132e-06, | |
| "loss": 0.0193, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 2.7625032731081434, | |
| "grad_norm": 1.7300618886947632, | |
| "learning_rate": 6.54871068050976e-06, | |
| "loss": 0.0208, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.772977219167321, | |
| "grad_norm": 1.365108609199524, | |
| "learning_rate": 6.452473365269115e-06, | |
| "loss": 0.0267, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 2.783451165226499, | |
| "grad_norm": 2.3114993572235107, | |
| "learning_rate": 6.356610459329038e-06, | |
| "loss": 0.028, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.793925111285677, | |
| "grad_norm": 1.1482765674591064, | |
| "learning_rate": 6.261132080150868e-06, | |
| "loss": 0.0304, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 2.8043990573448547, | |
| "grad_norm": 1.3784815073013306, | |
| "learning_rate": 6.166048304612624e-06, | |
| "loss": 0.0245, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.8148730034040326, | |
| "grad_norm": 1.6406880617141724, | |
| "learning_rate": 6.071369167945482e-06, | |
| "loss": 0.027, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 2.8253469494632104, | |
| "grad_norm": 1.8636596202850342, | |
| "learning_rate": 5.9771046626746585e-06, | |
| "loss": 0.0227, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.8253469494632104, | |
| "eval_loss": 0.05176674574613571, | |
| "eval_runtime": 181.8891, | |
| "eval_samples_per_second": 10.495, | |
| "eval_steps_per_second": 2.628, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 1.8853999376296997, | |
| "learning_rate": 5.883264737564776e-06, | |
| "loss": 0.0326, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 2.8462948415815656, | |
| "grad_norm": 1.3684381246566772, | |
| "learning_rate": 5.789859296569871e-06, | |
| "loss": 0.018, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.8567687876407435, | |
| "grad_norm": 1.627061367034912, | |
| "learning_rate": 5.696898197788108e-06, | |
| "loss": 0.0293, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 2.8672427336999213, | |
| "grad_norm": 2.071784496307373, | |
| "learning_rate": 5.6043912524213685e-06, | |
| "loss": 0.0246, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.877716679759099, | |
| "grad_norm": 1.5565595626831055, | |
| "learning_rate": 5.512348223739754e-06, | |
| "loss": 0.0163, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 2.888190625818277, | |
| "grad_norm": 2.5211095809936523, | |
| "learning_rate": 5.4207788260511505e-06, | |
| "loss": 0.0386, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.898664571877455, | |
| "grad_norm": 1.5942156314849854, | |
| "learning_rate": 5.329692723675994e-06, | |
| "loss": 0.0302, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 2.9091385179366327, | |
| "grad_norm": 1.4718657732009888, | |
| "learning_rate": 5.239099529927281e-06, | |
| "loss": 0.0318, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.9196124639958105, | |
| "grad_norm": 0.7544646859169006, | |
| "learning_rate": 5.1490088060959495e-06, | |
| "loss": 0.0162, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 2.9300864100549884, | |
| "grad_norm": 1.2517889738082886, | |
| "learning_rate": 5.0594300604418086e-06, | |
| "loss": 0.0304, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.9300864100549884, | |
| "eval_loss": 0.05129832401871681, | |
| "eval_runtime": 181.4791, | |
| "eval_samples_per_second": 10.519, | |
| "eval_steps_per_second": 2.634, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.940560356114166, | |
| "grad_norm": 0.8101089000701904, | |
| "learning_rate": 4.970372747190006e-06, | |
| "loss": 0.0431, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 2.951034302173344, | |
| "grad_norm": 1.6314613819122314, | |
| "learning_rate": 4.881846265533209e-06, | |
| "loss": 0.0378, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.9615082482325215, | |
| "grad_norm": 1.186647891998291, | |
| "learning_rate": 4.793859958639635e-06, | |
| "loss": 0.0281, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 2.9719821942916993, | |
| "grad_norm": 2.1646673679351807, | |
| "learning_rate": 4.7064231126669355e-06, | |
| "loss": 0.0343, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.982456140350877, | |
| "grad_norm": 1.3391481637954712, | |
| "learning_rate": 4.6195449557821495e-06, | |
| "loss": 0.0197, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 2.992930086410055, | |
| "grad_norm": 2.9808108806610107, | |
| "learning_rate": 4.5332346571877405e-06, | |
| "loss": 0.0302, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 3.0020947892118355, | |
| "grad_norm": 0.9604336619377136, | |
| "learning_rate": 4.447501326153865e-06, | |
| "loss": 0.0252, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 3.0125687352710133, | |
| "grad_norm": 1.2666419744491577, | |
| "learning_rate": 4.3623540110569935e-06, | |
| "loss": 0.0179, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 3.023042681330191, | |
| "grad_norm": 1.4494256973266602, | |
| "learning_rate": 4.277801698424918e-06, | |
| "loss": 0.0218, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 3.033516627389369, | |
| "grad_norm": 1.1630330085754395, | |
| "learning_rate": 4.1938533119883014e-06, | |
| "loss": 0.018, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.033516627389369, | |
| "eval_loss": 0.05160898342728615, | |
| "eval_runtime": 182.0457, | |
| "eval_samples_per_second": 10.486, | |
| "eval_steps_per_second": 2.626, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.043990573448547, | |
| "grad_norm": 2.2805240154266357, | |
| "learning_rate": 4.110517711738881e-06, | |
| "loss": 0.027, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 3.0544645195077247, | |
| "grad_norm": 0.7012156248092651, | |
| "learning_rate": 4.0278036929943574e-06, | |
| "loss": 0.0225, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 3.0649384655669025, | |
| "grad_norm": 1.6349064111709595, | |
| "learning_rate": 3.945719985470128e-06, | |
| "loss": 0.0171, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 3.07541241162608, | |
| "grad_norm": 1.5148468017578125, | |
| "learning_rate": 3.8642752523579595e-06, | |
| "loss": 0.014, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 3.0858863576852578, | |
| "grad_norm": 0.9480647444725037, | |
| "learning_rate": 3.7834780894116575e-06, | |
| "loss": 0.0152, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 3.0963603037444356, | |
| "grad_norm": 2.8382086753845215, | |
| "learning_rate": 3.7033370240398527e-06, | |
| "loss": 0.0239, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 3.1068342498036134, | |
| "grad_norm": 2.1970698833465576, | |
| "learning_rate": 3.6238605144060314e-06, | |
| "loss": 0.0261, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 3.1173081958627913, | |
| "grad_norm": 1.1678617000579834, | |
| "learning_rate": 3.545056948535839e-06, | |
| "loss": 0.0158, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 3.127782141921969, | |
| "grad_norm": 1.8681138753890991, | |
| "learning_rate": 3.466934643431795e-06, | |
| "loss": 0.0175, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 3.138256087981147, | |
| "grad_norm": 1.5951310396194458, | |
| "learning_rate": 3.389501844195525e-06, | |
| "loss": 0.0193, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.138256087981147, | |
| "eval_loss": 0.05427511781454086, | |
| "eval_runtime": 182.0489, | |
| "eval_samples_per_second": 10.486, | |
| "eval_steps_per_second": 2.626, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.148730034040325, | |
| "grad_norm": 1.1853766441345215, | |
| "learning_rate": 3.3127667231575587e-06, | |
| "loss": 0.0211, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 3.1592039800995027, | |
| "grad_norm": 2.4959716796875, | |
| "learning_rate": 3.2367373790147973e-06, | |
| "loss": 0.0143, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 3.1696779261586805, | |
| "grad_norm": 0.8805971741676331, | |
| "learning_rate": 3.1614218359757985e-06, | |
| "loss": 0.0185, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 3.180151872217858, | |
| "grad_norm": 2.49381160736084, | |
| "learning_rate": 3.0868280429138754e-06, | |
| "loss": 0.0161, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 3.1906258182770357, | |
| "grad_norm": 1.2514592409133911, | |
| "learning_rate": 3.0129638725281683e-06, | |
| "loss": 0.0198, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 3.2010997643362136, | |
| "grad_norm": 3.421593427658081, | |
| "learning_rate": 2.9398371205127495e-06, | |
| "loss": 0.0203, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.2115737103953914, | |
| "grad_norm": 1.6247831583023071, | |
| "learning_rate": 2.8674555047338694e-06, | |
| "loss": 0.0165, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 3.2220476564545693, | |
| "grad_norm": 2.246312141418457, | |
| "learning_rate": 2.7958266644153974e-06, | |
| "loss": 0.0342, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.232521602513747, | |
| "grad_norm": 2.949176788330078, | |
| "learning_rate": 2.7249581593325647e-06, | |
| "loss": 0.0252, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 3.242995548572925, | |
| "grad_norm": 1.9428445100784302, | |
| "learning_rate": 2.654857469014113e-06, | |
| "loss": 0.0243, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.242995548572925, | |
| "eval_loss": 0.05600380152463913, | |
| "eval_runtime": 182.3825, | |
| "eval_samples_per_second": 10.467, | |
| "eval_steps_per_second": 2.621, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.2534694946321028, | |
| "grad_norm": 1.8825373649597168, | |
| "learning_rate": 2.585531991952893e-06, | |
| "loss": 0.0167, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 3.2639434406912806, | |
| "grad_norm": 2.28324818611145, | |
| "learning_rate": 2.51698904482501e-06, | |
| "loss": 0.0258, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.274417386750458, | |
| "grad_norm": 1.9099152088165283, | |
| "learning_rate": 2.44923586171763e-06, | |
| "loss": 0.0499, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 3.284891332809636, | |
| "grad_norm": 2.5200655460357666, | |
| "learning_rate": 2.382279593365482e-06, | |
| "loss": 0.021, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.2953652788688137, | |
| "grad_norm": 1.6834214925765991, | |
| "learning_rate": 2.3161273063961542e-06, | |
| "loss": 0.0219, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 3.3058392249279915, | |
| "grad_norm": 2.1367030143737793, | |
| "learning_rate": 2.2507859825842883e-06, | |
| "loss": 0.0199, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 3.3163131709871694, | |
| "grad_norm": 0.7622693777084351, | |
| "learning_rate": 2.1862625181147123e-06, | |
| "loss": 0.0149, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 3.326787117046347, | |
| "grad_norm": 1.3212164640426636, | |
| "learning_rate": 2.122563722854604e-06, | |
| "loss": 0.0165, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 3.337261063105525, | |
| "grad_norm": 1.5809417963027954, | |
| "learning_rate": 2.059696319634782e-06, | |
| "loss": 0.015, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 3.347735009164703, | |
| "grad_norm": 1.2320683002471924, | |
| "learning_rate": 1.9976669435401597e-06, | |
| "loss": 0.0213, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.347735009164703, | |
| "eval_loss": 0.055280230939388275, | |
| "eval_runtime": 182.1887, | |
| "eval_samples_per_second": 10.478, | |
| "eval_steps_per_second": 2.624, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.3582089552238807, | |
| "grad_norm": 1.0370845794677734, | |
| "learning_rate": 1.936482141209486e-06, | |
| "loss": 0.0237, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 3.3686829012830586, | |
| "grad_norm": 1.2540106773376465, | |
| "learning_rate": 1.8761483701443984e-06, | |
| "loss": 0.0214, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 3.3791568473422364, | |
| "grad_norm": 1.8267788887023926, | |
| "learning_rate": 1.8166719980278858e-06, | |
| "loss": 0.0202, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 3.389630793401414, | |
| "grad_norm": 1.5350995063781738, | |
| "learning_rate": 1.758059302052255e-06, | |
| "loss": 0.0206, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 3.4001047394605917, | |
| "grad_norm": 1.1958850622177124, | |
| "learning_rate": 1.7003164682566165e-06, | |
| "loss": 0.0139, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 3.4105786855197695, | |
| "grad_norm": 2.2496140003204346, | |
| "learning_rate": 1.6434495908740022e-06, | |
| "loss": 0.0153, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 3.4210526315789473, | |
| "grad_norm": 0.9056265950202942, | |
| "learning_rate": 1.587464671688187e-06, | |
| "loss": 0.0178, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 3.431526577638125, | |
| "grad_norm": 0.940555989742279, | |
| "learning_rate": 1.5323676194002456e-06, | |
| "loss": 0.0159, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 3.442000523697303, | |
| "grad_norm": 1.699397087097168, | |
| "learning_rate": 1.4781642490049398e-06, | |
| "loss": 0.0188, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 3.452474469756481, | |
| "grad_norm": 1.2530186176300049, | |
| "learning_rate": 1.4248602811770108e-06, | |
| "loss": 0.0157, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.452474469756481, | |
| "eval_loss": 0.055322494357824326, | |
| "eval_runtime": 183.0468, | |
| "eval_samples_per_second": 10.429, | |
| "eval_steps_per_second": 2.611, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.4629484158156587, | |
| "grad_norm": 1.4462732076644897, | |
| "learning_rate": 1.372461341667396e-06, | |
| "loss": 0.026, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 3.473422361874836, | |
| "grad_norm": 0.42883485555648804, | |
| "learning_rate": 1.3209729607095022e-06, | |
| "loss": 0.0144, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 3.483896307934014, | |
| "grad_norm": 1.2245005369186401, | |
| "learning_rate": 1.2704005724355273e-06, | |
| "loss": 0.0108, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 3.494370253993192, | |
| "grad_norm": 1.7988877296447754, | |
| "learning_rate": 1.2207495143029325e-06, | |
| "loss": 0.0228, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 3.5048442000523696, | |
| "grad_norm": 1.7349547147750854, | |
| "learning_rate": 1.172025026531135e-06, | |
| "loss": 0.0216, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 3.5153181461115475, | |
| "grad_norm": 0.9366742968559265, | |
| "learning_rate": 1.124232251548445e-06, | |
| "loss": 0.0145, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.5257920921707253, | |
| "grad_norm": 1.6843370199203491, | |
| "learning_rate": 1.0773762334493198e-06, | |
| "loss": 0.0311, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 3.536266038229903, | |
| "grad_norm": 2.704352855682373, | |
| "learning_rate": 1.0314619174620211e-06, | |
| "loss": 0.0526, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.546739984289081, | |
| "grad_norm": 1.5389641523361206, | |
| "learning_rate": 9.86494149426682e-07, | |
| "loss": 0.0153, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 3.557213930348259, | |
| "grad_norm": 1.7506754398345947, | |
| "learning_rate": 9.424776752838705e-07, | |
| "loss": 0.0264, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.557213930348259, | |
| "eval_loss": 0.05507681146264076, | |
| "eval_runtime": 183.3526, | |
| "eval_samples_per_second": 10.412, | |
| "eval_steps_per_second": 2.607, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.5676878764074367, | |
| "grad_norm": 2.2783095836639404, | |
| "learning_rate": 8.994171405737051e-07, | |
| "loss": 0.0181, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 3.5781618224666145, | |
| "grad_norm": 1.6380702257156372, | |
| "learning_rate": 8.573170899455529e-07, | |
| "loss": 0.0241, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.5886357685257924, | |
| "grad_norm": 1.6343145370483398, | |
| "learning_rate": 8.161819666783888e-07, | |
| "loss": 0.0193, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 3.5991097145849698, | |
| "grad_norm": 2.3693206310272217, | |
| "learning_rate": 7.760161122118493e-07, | |
| "loss": 0.0368, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.6095836606441476, | |
| "grad_norm": 1.108860969543457, | |
| "learning_rate": 7.368237656880217e-07, | |
| "loss": 0.0101, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 3.6200576067033254, | |
| "grad_norm": 1.584486722946167, | |
| "learning_rate": 6.986090635040555e-07, | |
| "loss": 0.0216, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.6305315527625033, | |
| "grad_norm": 0.9664100408554077, | |
| "learning_rate": 6.61376038875593e-07, | |
| "loss": 0.0112, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 3.641005498821681, | |
| "grad_norm": 1.3716723918914795, | |
| "learning_rate": 6.251286214111018e-07, | |
| "loss": 0.0221, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.651479444880859, | |
| "grad_norm": 1.3973896503448486, | |
| "learning_rate": 5.898706366971451e-07, | |
| "loss": 0.0383, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 3.661953390940037, | |
| "grad_norm": 2.2058684825897217, | |
| "learning_rate": 5.556058058946212e-07, | |
| "loss": 0.0439, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.661953390940037, | |
| "eval_loss": 0.05486290529370308, | |
| "eval_runtime": 182.894, | |
| "eval_samples_per_second": 10.438, | |
| "eval_steps_per_second": 2.614, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.672427336999214, | |
| "grad_norm": 0.8177819848060608, | |
| "learning_rate": 5.223377453460266e-07, | |
| "loss": 0.0135, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 3.682901283058392, | |
| "grad_norm": 1.7943897247314453, | |
| "learning_rate": 4.900699661937914e-07, | |
| "loss": 0.0154, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.69337522911757, | |
| "grad_norm": 1.8057630062103271, | |
| "learning_rate": 4.588058740097012e-07, | |
| "loss": 0.0249, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 3.7038491751767477, | |
| "grad_norm": 1.58455491065979, | |
| "learning_rate": 4.285487684354772e-07, | |
| "loss": 0.0156, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.7143231212359256, | |
| "grad_norm": 2.5056676864624023, | |
| "learning_rate": 3.9930184283452634e-07, | |
| "loss": 0.0214, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 3.7247970672951034, | |
| "grad_norm": 0.33922508358955383, | |
| "learning_rate": 3.7106818395490685e-07, | |
| "loss": 0.0096, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.7352710133542812, | |
| "grad_norm": 1.9061384201049805, | |
| "learning_rate": 3.438507716035555e-07, | |
| "loss": 0.016, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 3.745744959413459, | |
| "grad_norm": 2.3094871044158936, | |
| "learning_rate": 3.176524783317947e-07, | |
| "loss": 0.0204, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.756218905472637, | |
| "grad_norm": 0.8052126169204712, | |
| "learning_rate": 2.924760691321571e-07, | |
| "loss": 0.0182, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 3.7666928515318148, | |
| "grad_norm": 1.3129606246948242, | |
| "learning_rate": 2.683242011465703e-07, | |
| "loss": 0.0164, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.7666928515318148, | |
| "eval_loss": 0.05502132698893547, | |
| "eval_runtime": 182.2185, | |
| "eval_samples_per_second": 10.476, | |
| "eval_steps_per_second": 2.623, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.7771667975909926, | |
| "grad_norm": 1.7071999311447144, | |
| "learning_rate": 2.45199423385919e-07, | |
| "loss": 0.0214, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 3.7876407436501704, | |
| "grad_norm": 0.963501513004303, | |
| "learning_rate": 2.2310417646101535e-07, | |
| "loss": 0.0176, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.798114689709348, | |
| "grad_norm": 1.2818574905395508, | |
| "learning_rate": 2.0204079232502006e-07, | |
| "loss": 0.0204, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 3.8085886357685257, | |
| "grad_norm": 1.4152429103851318, | |
| "learning_rate": 1.8201149402732432e-07, | |
| "loss": 0.0136, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.8190625818277035, | |
| "grad_norm": 1.5160934925079346, | |
| "learning_rate": 1.630183954789233e-07, | |
| "loss": 0.0158, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 3.8295365278868814, | |
| "grad_norm": 1.2240071296691895, | |
| "learning_rate": 1.4506350122932e-07, | |
| "loss": 0.0106, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.840010473946059, | |
| "grad_norm": 1.8110445737838745, | |
| "learning_rate": 1.2814870625495357e-07, | |
| "loss": 0.0141, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 3.850484420005237, | |
| "grad_norm": 0.8142175078392029, | |
| "learning_rate": 1.1227579575921022e-07, | |
| "loss": 0.0103, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.860958366064415, | |
| "grad_norm": 2.131216287612915, | |
| "learning_rate": 9.744644498400513e-08, | |
| "loss": 0.0142, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 3.8714323121235923, | |
| "grad_norm": 1.8197873830795288, | |
| "learning_rate": 8.366221903297944e-08, | |
| "loss": 0.0245, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.8714323121235923, | |
| "eval_loss": 0.055017318576574326, | |
| "eval_runtime": 182.4144, | |
| "eval_samples_per_second": 10.465, | |
| "eval_steps_per_second": 2.62, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.88190625818277, | |
| "grad_norm": 1.9622763395309448, | |
| "learning_rate": 7.092457270631459e-08, | |
| "loss": 0.0266, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 3.892380204241948, | |
| "grad_norm": 0.672971785068512, | |
| "learning_rate": 5.9234850347197335e-08, | |
| "loss": 0.0117, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.902854150301126, | |
| "grad_norm": 1.1201688051223755, | |
| "learning_rate": 4.8594285699928854e-08, | |
| "loss": 0.0208, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 3.9133280963603037, | |
| "grad_norm": 0.9653613567352295, | |
| "learning_rate": 3.900400177971775e-08, | |
| "loss": 0.0275, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.9238020424194815, | |
| "grad_norm": 1.7483731508255005, | |
| "learning_rate": 3.04650107541582e-08, | |
| "loss": 0.0229, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 3.9342759884786593, | |
| "grad_norm": 0.9113327264785767, | |
| "learning_rate": 2.2978213836400974e-08, | |
| "loss": 0.0241, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.944749934537837, | |
| "grad_norm": 0.7190056443214417, | |
| "learning_rate": 1.6544401190040638e-08, | |
| "loss": 0.0086, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 3.955223880597015, | |
| "grad_norm": 0.8524140119552612, | |
| "learning_rate": 1.1164251845718899e-08, | |
| "loss": 0.0201, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.965697826656193, | |
| "grad_norm": 1.4827734231948853, | |
| "learning_rate": 6.838333629465288e-09, | |
| "loss": 0.0212, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 3.9761717727153707, | |
| "grad_norm": 1.2435967922210693, | |
| "learning_rate": 3.5671031027595394e-09, | |
| "loss": 0.0168, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.9761717727153707, | |
| "eval_loss": 0.054999224841594696, | |
| "eval_runtime": 182.2951, | |
| "eval_samples_per_second": 10.472, | |
| "eval_steps_per_second": 2.622, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.9866457187745485, | |
| "grad_norm": 1.00032377243042, | |
| "learning_rate": 1.3509055143490213e-09, | |
| "loss": 0.0186, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 3.9971196648337264, | |
| "grad_norm": 0.9600237011909485, | |
| "learning_rate": 1.8997476381565905e-10, | |
| "loss": 0.0143, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "step": 1912, | |
| "total_flos": 1.5735198828619366e+17, | |
| "train_loss": 0.07373801102690306, | |
| "train_runtime": 28333.2715, | |
| "train_samples_per_second": 2.156, | |
| "train_steps_per_second": 0.067 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1912, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5735198828619366e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
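
For reference, a minimal sketch (not part of the original log) of how one might read the `log_history` array above back out of the file and plot the training and evaluation loss curves. The file name `trainer_state.json` and the availability of matplotlib are assumptions; the key names (`loss`, `eval_loss`, `step`) are taken directly from the entries in the log.

```python
# Sketch: load a trainer_state.json like the one above and plot loss curves.
# Assumes the JSON is saved locally as "trainer_state.json" and matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

train_steps, train_loss = [], []
eval_steps, eval_loss = [], []
for entry in state["log_history"]:
    if "loss" in entry:           # training entries carry "loss"
        train_steps.append(entry["step"])
        train_loss.append(entry["loss"])
    elif "eval_loss" in entry:    # evaluation entries carry "eval_loss"
        eval_steps.append(entry["step"])
        eval_loss.append(entry["eval_loss"])

plt.plot(train_steps, train_loss, label="train loss")
plt.plot(eval_steps, eval_loss, label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()
```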