{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.973821989528796,
  "eval_steps": 500,
  "global_step": 950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05235602094240838,
      "grad_norm": NaN,
      "learning_rate": 0.0,
      "loss": 9.7408,
      "step": 10
    },
    {
      "epoch": 0.10471204188481675,
      "grad_norm": 202209.765625,
      "learning_rate": 3.1413612565445024e-08,
      "loss": 9.1548,
      "step": 20
    },
    {
      "epoch": 0.15706806282722513,
      "grad_norm": 27189.787109375,
      "learning_rate": 1.3612565445026178e-07,
      "loss": 3.8451,
      "step": 30
    },
    {
      "epoch": 0.2094240837696335,
      "grad_norm": 10457.34765625,
      "learning_rate": 2.4083769633507854e-07,
      "loss": 3.267,
      "step": 40
    },
    {
      "epoch": 0.2617801047120419,
      "grad_norm": 8087.2939453125,
      "learning_rate": 3.4554973821989523e-07,
      "loss": 3.0939,
      "step": 50
    },
    {
      "epoch": 0.31413612565445026,
      "grad_norm": 7214.744140625,
      "learning_rate": 4.50261780104712e-07,
      "loss": 3.0211,
      "step": 60
    },
    {
      "epoch": 0.36649214659685864,
      "grad_norm": 6162.826171875,
      "learning_rate": 5.549738219895288e-07,
      "loss": 2.846,
      "step": 70
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 4688.05615234375,
      "learning_rate": 6.596858638743455e-07,
      "loss": 2.8104,
      "step": 80
    },
    {
      "epoch": 0.4712041884816754,
      "grad_norm": 3856.7578125,
      "learning_rate": 7.643979057591623e-07,
      "loss": 2.8735,
      "step": 90
    },
    {
      "epoch": 0.5235602094240838,
      "grad_norm": 3529.413330078125,
      "learning_rate": 8.691099476439791e-07,
      "loss": 2.8117,
      "step": 100
    },
    {
      "epoch": 0.5759162303664922,
      "grad_norm": 2830.52734375,
      "learning_rate": 9.738219895287958e-07,
      "loss": 2.7099,
      "step": 110
    },
    {
      "epoch": 0.6282722513089005,
      "grad_norm": 2316.537353515625,
      "learning_rate": 1.0785340314136124e-06,
      "loss": 2.6387,
      "step": 120
    },
    {
      "epoch": 0.680628272251309,
      "grad_norm": 2685.246826171875,
      "learning_rate": 1.1832460732984293e-06,
      "loss": 2.6667,
      "step": 130
    },
    {
      "epoch": 0.7329842931937173,
      "grad_norm": 2066.593017578125,
      "learning_rate": 1.2879581151832458e-06,
      "loss": 2.5786,
      "step": 140
    },
    {
      "epoch": 0.7853403141361257,
      "grad_norm": 2110.41748046875,
      "learning_rate": 1.3926701570680628e-06,
      "loss": 2.4927,
      "step": 150
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 1557.745849609375,
      "learning_rate": 1.4973821989528795e-06,
      "loss": 2.6125,
      "step": 160
    },
    {
      "epoch": 0.8900523560209425,
      "grad_norm": 1510.9991455078125,
      "learning_rate": 1.6020942408376963e-06,
      "loss": 2.5048,
      "step": 170
    },
    {
      "epoch": 0.9424083769633508,
      "grad_norm": 1395.5841064453125,
      "learning_rate": 1.706806282722513e-06,
      "loss": 2.5049,
      "step": 180
    },
    {
      "epoch": 0.9947643979057592,
      "grad_norm": 1400.4466552734375,
      "learning_rate": 1.8115183246073297e-06,
      "loss": 2.4902,
      "step": 190
    },
    {
      "epoch": 1.0471204188481675,
      "grad_norm": 1328.171142578125,
      "learning_rate": 1.9162303664921463e-06,
      "loss": 2.3063,
      "step": 200
    },
    {
      "epoch": 1.0994764397905759,
      "grad_norm": 1169.1490478515625,
      "learning_rate": 1.997673065735893e-06,
      "loss": 2.3826,
      "step": 210
    },
    {
      "epoch": 1.1518324607329844,
      "grad_norm": 1007.3028564453125,
      "learning_rate": 1.9860383944153577e-06,
      "loss": 2.2646,
      "step": 220
    },
    {
      "epoch": 1.2041884816753927,
      "grad_norm": 905.8086547851562,
      "learning_rate": 1.9744037230948225e-06,
      "loss": 2.3065,
      "step": 230
    },
    {
      "epoch": 1.256544502617801,
      "grad_norm": 904.2677001953125,
      "learning_rate": 1.9627690517742874e-06,
      "loss": 2.369,
      "step": 240
    },
    {
      "epoch": 1.3089005235602094,
      "grad_norm": 878.70751953125,
      "learning_rate": 1.951134380453752e-06,
      "loss": 2.2916,
      "step": 250
    },
    {
      "epoch": 1.3612565445026177,
      "grad_norm": 785.525146484375,
      "learning_rate": 1.9394997091332166e-06,
      "loss": 2.2916,
      "step": 260
    },
    {
      "epoch": 1.4136125654450262,
      "grad_norm": 715.8485107421875,
      "learning_rate": 1.927865037812682e-06,
      "loss": 2.247,
      "step": 270
    },
    {
      "epoch": 1.4659685863874345,
      "grad_norm": 742.1319580078125,
      "learning_rate": 1.9162303664921463e-06,
      "loss": 2.2293,
      "step": 280
    },
    {
      "epoch": 1.518324607329843,
      "grad_norm": 777.41259765625,
      "learning_rate": 1.9045956951716113e-06,
      "loss": 2.1447,
      "step": 290
    },
    {
      "epoch": 1.5706806282722514,
      "grad_norm": 693.8157348632812,
      "learning_rate": 1.8929610238510761e-06,
      "loss": 2.1851,
      "step": 300
    },
    {
      "epoch": 1.6230366492146597,
      "grad_norm": 707.2672119140625,
      "learning_rate": 1.881326352530541e-06,
      "loss": 2.1879,
      "step": 310
    },
    {
      "epoch": 1.675392670157068,
      "grad_norm": 727.61767578125,
      "learning_rate": 1.8696916812100056e-06,
      "loss": 2.1962,
      "step": 320
    },
    {
      "epoch": 1.7277486910994764,
      "grad_norm": 695.4833984375,
      "learning_rate": 1.8580570098894706e-06,
      "loss": 2.2057,
      "step": 330
    },
    {
      "epoch": 1.7801047120418847,
      "grad_norm": 614.199462890625,
      "learning_rate": 1.8464223385689352e-06,
      "loss": 2.0654,
      "step": 340
    },
    {
      "epoch": 1.8324607329842932,
      "grad_norm": 724.0316162109375,
      "learning_rate": 1.8347876672484e-06,
      "loss": 2.0803,
      "step": 350
    },
    {
      "epoch": 1.8848167539267016,
      "grad_norm": 664.735595703125,
      "learning_rate": 1.823152995927865e-06,
      "loss": 1.8995,
      "step": 360
    },
    {
      "epoch": 1.93717277486911,
      "grad_norm": 725.57373046875,
      "learning_rate": 1.8115183246073297e-06,
      "loss": 1.9195,
      "step": 370
    },
    {
      "epoch": 1.9895287958115184,
      "grad_norm": 680.0363159179688,
      "learning_rate": 1.7998836532867946e-06,
      "loss": 1.9157,
      "step": 380
    },
    {
      "epoch": 2.0418848167539267,
      "grad_norm": 656.7247314453125,
      "learning_rate": 1.7882489819662594e-06,
      "loss": 1.8435,
      "step": 390
    },
    {
      "epoch": 2.094240837696335,
      "grad_norm": 754.1705322265625,
      "learning_rate": 1.776614310645724e-06,
      "loss": 1.8308,
      "step": 400
    },
    {
      "epoch": 2.1465968586387434,
      "grad_norm": 811.6585693359375,
      "learning_rate": 1.764979639325189e-06,
      "loss": 1.8349,
      "step": 410
    },
    {
      "epoch": 2.1989528795811517,
      "grad_norm": 743.0385131835938,
      "learning_rate": 1.7533449680046537e-06,
      "loss": 1.8507,
      "step": 420
    },
    {
      "epoch": 2.25130890052356,
      "grad_norm": 748.2722778320312,
      "learning_rate": 1.7417102966841187e-06,
      "loss": 1.7967,
      "step": 430
    },
    {
      "epoch": 2.303664921465969,
      "grad_norm": 587.875732421875,
      "learning_rate": 1.7300756253635833e-06,
      "loss": 1.9958,
      "step": 440
    },
    {
      "epoch": 2.356020942408377,
      "grad_norm": 623.1217651367188,
      "learning_rate": 1.7184409540430482e-06,
      "loss": 1.8716,
      "step": 450
    },
    {
      "epoch": 2.4083769633507854,
      "grad_norm": 689.55126953125,
      "learning_rate": 1.706806282722513e-06,
      "loss": 1.8947,
      "step": 460
    },
    {
      "epoch": 2.4607329842931938,
      "grad_norm": 656.4078369140625,
      "learning_rate": 1.6951716114019778e-06,
      "loss": 1.8584,
      "step": 470
    },
    {
      "epoch": 2.513089005235602,
      "grad_norm": 672.116455078125,
      "learning_rate": 1.6835369400814424e-06,
      "loss": 1.8129,
      "step": 480
    },
    {
      "epoch": 2.5654450261780104,
      "grad_norm": 586.6629638671875,
      "learning_rate": 1.6719022687609075e-06,
      "loss": 1.8214,
      "step": 490
    },
    {
      "epoch": 2.6178010471204187,
      "grad_norm": 593.3026123046875,
      "learning_rate": 1.6602675974403721e-06,
      "loss": 1.802,
      "step": 500
    },
    {
      "epoch": 2.670157068062827,
      "grad_norm": 591.8192749023438,
      "learning_rate": 1.6486329261198371e-06,
      "loss": 1.8523,
      "step": 510
    },
    {
      "epoch": 2.7225130890052354,
      "grad_norm": 655.689453125,
      "learning_rate": 1.6369982547993018e-06,
      "loss": 1.7537,
      "step": 520
    },
    {
      "epoch": 2.774869109947644,
      "grad_norm": 727.9883422851562,
      "learning_rate": 1.6253635834787666e-06,
      "loss": 1.7047,
      "step": 530
    },
    {
      "epoch": 2.8272251308900525,
      "grad_norm": 547.6102905273438,
      "learning_rate": 1.6137289121582314e-06,
      "loss": 1.8178,
      "step": 540
    },
    {
      "epoch": 2.8795811518324608,
      "grad_norm": 565.3403930664062,
      "learning_rate": 1.6020942408376963e-06,
      "loss": 1.7221,
      "step": 550
    },
    {
      "epoch": 2.931937172774869,
      "grad_norm": 623.0109252929688,
      "learning_rate": 1.5904595695171609e-06,
      "loss": 1.7912,
      "step": 560
    },
    {
      "epoch": 2.9842931937172774,
      "grad_norm": 600.0060424804688,
      "learning_rate": 1.578824898196626e-06,
      "loss": 1.8453,
      "step": 570
    },
    {
      "epoch": 3.0366492146596857,
      "grad_norm": 719.7506103515625,
      "learning_rate": 1.5671902268760905e-06,
      "loss": 1.5766,
      "step": 580
    },
    {
      "epoch": 3.089005235602094,
      "grad_norm": 836.7677612304688,
      "learning_rate": 1.5555555555555556e-06,
      "loss": 1.5165,
      "step": 590
    },
    {
      "epoch": 3.141361256544503,
      "grad_norm": 736.6253662109375,
      "learning_rate": 1.5439208842350202e-06,
      "loss": 1.5065,
      "step": 600
    },
    {
      "epoch": 3.193717277486911,
      "grad_norm": 796.0474243164062,
      "learning_rate": 1.532286212914485e-06,
      "loss": 1.4979,
      "step": 610
    },
    {
      "epoch": 3.2460732984293195,
      "grad_norm": 717.6380615234375,
      "learning_rate": 1.5206515415939499e-06,
      "loss": 1.488,
      "step": 620
    },
    {
      "epoch": 3.298429319371728,
      "grad_norm": 726.650634765625,
      "learning_rate": 1.5090168702734147e-06,
      "loss": 1.4923,
      "step": 630
    },
    {
      "epoch": 3.350785340314136,
      "grad_norm": 660.4285278320312,
      "learning_rate": 1.4973821989528795e-06,
      "loss": 1.5328,
      "step": 640
    },
    {
      "epoch": 3.4031413612565444,
      "grad_norm": 717.77490234375,
      "learning_rate": 1.4857475276323443e-06,
      "loss": 1.5207,
      "step": 650
    },
    {
      "epoch": 3.4554973821989527,
      "grad_norm": 665.8229370117188,
      "learning_rate": 1.474112856311809e-06,
      "loss": 1.497,
      "step": 660
    },
    {
      "epoch": 3.507853403141361,
      "grad_norm": 726.3001098632812,
      "learning_rate": 1.462478184991274e-06,
      "loss": 1.5739,
      "step": 670
    },
    {
      "epoch": 3.5602094240837694,
      "grad_norm": 805.164306640625,
      "learning_rate": 1.4508435136707386e-06,
      "loss": 1.5005,
      "step": 680
    },
    {
      "epoch": 3.612565445026178,
      "grad_norm": 672.5020751953125,
      "learning_rate": 1.4392088423502037e-06,
      "loss": 1.4879,
      "step": 690
    },
    {
      "epoch": 3.6649214659685865,
      "grad_norm": 731.792236328125,
      "learning_rate": 1.4275741710296683e-06,
      "loss": 1.4545,
      "step": 700
    },
    {
      "epoch": 3.717277486910995,
      "grad_norm": 764.9650268554688,
      "learning_rate": 1.4159394997091331e-06,
      "loss": 1.5199,
      "step": 710
    },
    {
      "epoch": 3.769633507853403,
      "grad_norm": 888.060302734375,
      "learning_rate": 1.404304828388598e-06,
      "loss": 1.4832,
      "step": 720
    },
    {
      "epoch": 3.8219895287958114,
      "grad_norm": 691.344970703125,
      "learning_rate": 1.3926701570680628e-06,
      "loss": 1.5045,
      "step": 730
    },
    {
      "epoch": 3.8743455497382198,
      "grad_norm": 706.8125,
      "learning_rate": 1.3810354857475274e-06,
      "loss": 1.5049,
      "step": 740
    },
    {
      "epoch": 3.9267015706806285,
      "grad_norm": 744.5066528320312,
      "learning_rate": 1.3694008144269924e-06,
      "loss": 1.4492,
      "step": 750
    },
    {
      "epoch": 3.979057591623037,
      "grad_norm": 752.1239624023438,
      "learning_rate": 1.357766143106457e-06,
      "loss": 1.5103,
      "step": 760
    },
    {
      "epoch": 4.031413612565445,
      "grad_norm": 684.4227294921875,
      "learning_rate": 1.346131471785922e-06,
      "loss": 1.2956,
      "step": 770
    },
    {
      "epoch": 4.0837696335078535,
      "grad_norm": 712.1319580078125,
      "learning_rate": 1.3344968004653867e-06,
      "loss": 1.1727,
      "step": 780
    },
    {
      "epoch": 4.136125654450262,
      "grad_norm": 802.2329711914062,
      "learning_rate": 1.3228621291448515e-06,
      "loss": 1.0637,
      "step": 790
    },
    {
      "epoch": 4.18848167539267,
      "grad_norm": 782.2645263671875,
      "learning_rate": 1.3112274578243164e-06,
      "loss": 1.2483,
      "step": 800
    },
    {
      "epoch": 4.2408376963350785,
      "grad_norm": 756.0220947265625,
      "learning_rate": 1.2995927865037812e-06,
      "loss": 1.135,
      "step": 810
    },
    {
      "epoch": 4.293193717277487,
      "grad_norm": 726.4359741210938,
      "learning_rate": 1.2879581151832458e-06,
      "loss": 1.1676,
      "step": 820
    },
    {
      "epoch": 4.345549738219895,
      "grad_norm": 803.60791015625,
      "learning_rate": 1.2763234438627109e-06,
      "loss": 1.1466,
      "step": 830
    },
    {
      "epoch": 4.397905759162303,
      "grad_norm": 964.1234741210938,
      "learning_rate": 1.2646887725421755e-06,
      "loss": 1.243,
      "step": 840
    },
    {
      "epoch": 4.450261780104712,
      "grad_norm": 774.6426391601562,
      "learning_rate": 1.2530541012216405e-06,
      "loss": 1.2794,
      "step": 850
    },
    {
      "epoch": 4.50261780104712,
      "grad_norm": 954.2877197265625,
      "learning_rate": 1.2414194299011051e-06,
      "loss": 1.2563,
      "step": 860
    },
    {
      "epoch": 4.554973821989529,
      "grad_norm": 854.4068603515625,
      "learning_rate": 1.22978475858057e-06,
      "loss": 1.2186,
      "step": 870
    },
    {
      "epoch": 4.607329842931938,
      "grad_norm": 842.273193359375,
      "learning_rate": 1.2181500872600348e-06,
      "loss": 1.1172,
      "step": 880
    },
    {
      "epoch": 4.659685863874346,
      "grad_norm": 794.1563720703125,
      "learning_rate": 1.2065154159394996e-06,
      "loss": 1.2818,
      "step": 890
    },
    {
      "epoch": 4.712041884816754,
      "grad_norm": 864.4095458984375,
      "learning_rate": 1.1948807446189645e-06,
      "loss": 1.2335,
      "step": 900
    },
    {
      "epoch": 4.7643979057591626,
      "grad_norm": 781.428955078125,
      "learning_rate": 1.1832460732984293e-06,
      "loss": 1.2563,
      "step": 910
    },
    {
      "epoch": 4.816753926701571,
      "grad_norm": 808.9722900390625,
      "learning_rate": 1.171611401977894e-06,
      "loss": 1.2301,
      "step": 920
    },
    {
      "epoch": 4.869109947643979,
      "grad_norm": 838.6384887695312,
      "learning_rate": 1.159976730657359e-06,
      "loss": 1.1707,
      "step": 930
    },
    {
      "epoch": 4.9214659685863875,
      "grad_norm": 852.7664184570312,
      "learning_rate": 1.1483420593368236e-06,
      "loss": 1.1906,
      "step": 940
    },
    {
      "epoch": 4.973821989528796,
      "grad_norm": 689.58154296875,
      "learning_rate": 1.1367073880162884e-06,
      "loss": 1.2978,
      "step": 950
    }
  ],
  "logging_steps": 10,
  "max_steps": 1910,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.260298693445818e+16,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}